Repository: OpenCCG/openccg Branch: master Commit: 9968e814a490 Files: 981 Total size: 8.0 MB Directory structure: gitextract_fqyg89tm/ ├── .gitignore ├── AUTHORS ├── CHANGES ├── LICENSE ├── README.md ├── SAMPLE_GRAMMARS ├── TODO ├── bin/ │ ├── ccg-build │ ├── ccg-build.bat │ ├── ccg-cvr │ ├── ccg-cvr.bat │ ├── ccg-draw-graph │ ├── ccg-draw-graph.bat │ ├── ccg-draw-tree │ ├── ccg-draw-tree.bat │ ├── ccg-env │ ├── ccg-env.bat │ ├── ccg-grammardoc │ ├── ccg-grammardoc.bat │ ├── ccg-gt │ ├── ccg-gt.bat │ ├── ccg-ht-factors │ ├── ccg-hypertagger │ ├── ccg-hypertagger.bat │ ├── ccg-parse │ ├── ccg-parse.bat │ ├── ccg-postagger │ ├── ccg-postagger.bat │ ├── ccg-realize │ ├── ccg-realize.bat │ ├── ccg-supertagger │ ├── ccg-supertagger.bat │ ├── ccg-test │ ├── ccg-test.bat │ ├── ccg-update │ ├── ccg-update.bat │ ├── ccg2xml │ ├── ccg2xml.bat │ ├── ccg_draw_tree.py │ ├── dlf_parser.py │ ├── tccg │ ├── tccg.bat │ ├── visccg │ ├── visccg.bat │ └── wccg ├── build.xml ├── ccg-format-grammars/ │ ├── arabic/ │ │ └── arabic.ccg │ ├── inherit/ │ │ └── inherit.ccg │ ├── tiny/ │ │ └── tiny.ccg │ └── tinytiny/ │ └── tinytiny.ccg ├── ccgbank/ │ ├── bin/ │ │ ├── american-to-logical-quotes.py │ │ ├── convert-mtc-systems.py │ │ ├── convert-spaces-to-newlines.py │ │ ├── convert_all │ │ ├── correlate-to-judgments.py │ │ ├── filter_feats.py │ │ ├── find-betas-no-gold.py │ │ ├── gen_parser_events_a │ │ ├── gen_parser_events_b │ │ ├── gen_parser_events_c │ │ ├── gen_parser_events_d │ │ ├── gen_parser_events_e │ │ ├── gen_realizer_events_a │ │ ├── gen_realizer_events_b │ │ ├── gen_realizer_events_c │ │ ├── gen_realizer_events_d │ │ ├── gen_realizer_events_e │ │ ├── get-text-from-mtc-style.py │ │ ├── get-truecase-list.py │ │ ├── get-uniq-nbest.py │ │ ├── get_factors_from_parse.py │ │ ├── get_just_words_from_ner_text.py │ │ ├── lowercase_tagged_text.py │ │ ├── merge-mtc-ids.py │ │ ├── merge-stanford-morpha-with-pos.py │ │ ├── merge_pos_ne.py │ │ ├── my_unicode.py │ │ ├── nbest-mtc-to-bleu-nist.py │ │ ├── ner/ │ │ │ ├── NERApp/ │ │ │ │ └── src/ │ │ │ │ └── nerapp/ │ │ │ │ └── NERApp.java │ │ │ ├── build-ner-api.properties │ │ │ ├── build-ner-api.xml │ │ │ ├── ner-tag.sh │ │ │ ├── ner_word.py │ │ │ └── post-process-stanford-ner.py │ │ ├── normalize_text.py │ │ ├── post-process-metricsmatr.py │ │ ├── prepare-for-stanford-morpha.py │ │ ├── reverse-spaces-to-newlines.py │ │ ├── run-all-bleu.sh │ │ ├── stem_nns_vbx │ │ ├── toUTF-8.py │ │ └── write_morph.py │ ├── build-ht.properties │ ├── build-ht.xml │ ├── build-models.properties │ ├── build-models.xml │ ├── build-original.properties │ ├── build-original.xml │ ├── build-ps.properties │ ├── build-ps.xml │ ├── build-release.xml │ ├── build-rz.properties │ ├── build-rz.xml │ ├── build-st.properties │ ├── build-st.xml │ ├── build.properties │ ├── build.xml │ ├── data/ │ │ ├── README │ │ ├── get_wsj_nns_vb │ │ ├── novel/ │ │ │ └── two-sents │ │ ├── sample/ │ │ │ └── AUTO/ │ │ │ └── 00/ │ │ │ └── wsj_0001.auto │ │ ├── stem_wsj_nns_vb │ │ ├── wsj-nns-vb │ │ ├── wsj-nns-vb-stems │ │ └── wsj_0595Corrected.auto │ ├── extract/ │ │ ├── add-chunks.xsl │ │ ├── convert-to-graph.xsl │ │ ├── convert-to-hlds.xsl │ │ ├── grammar.xml │ │ └── raise-nodes.xsl │ ├── models/ │ │ ├── hypertagger/ │ │ │ ├── ht-prior.flm │ │ │ ├── ht.config │ │ │ ├── ht2.config │ │ │ ├── ht2.train.config │ │ │ ├── pos.config │ │ │ ├── posprior.flm │ │ │ └── vocab.flm │ │ ├── parser/ │ │ │ ├── binary.flm │ │ │ ├── gen-events.prefs │ │ │ ├── leaf.flm │ │ │ ├── model.init │ │ │ ├── parse.prefs │ │ │ ├── top.flm │ │ │ ├── 
unary.flm │ │ │ └── vocab.flm │ │ ├── realizer/ │ │ │ ├── alph.init │ │ │ ├── gen-events.prefs │ │ │ ├── model.init │ │ │ ├── rz-test.prefs │ │ │ └── stp3.flm │ │ └── supertagger/ │ │ ├── pos.config │ │ ├── posprior.flm │ │ ├── st.config │ │ ├── st.config.train │ │ ├── st.noprior.config │ │ ├── stprior.flm │ │ └── vocab.flm │ ├── original/ │ │ └── models/ │ │ ├── postagger/ │ │ │ ├── pos.config │ │ │ └── posprior.flm │ │ └── supertagger/ │ │ ├── st.config │ │ ├── stprior.flm │ │ └── vocab.flm │ ├── plugins/ │ │ ├── MyGenSynScorer.java │ │ ├── MyNgramCombo.java │ │ ├── MyNgramGenSynProduct.java │ │ ├── MyNgramPrecisionBaselineGenInterp.java │ │ ├── MyNgramPrecisionPerceptronInterp.java │ │ ├── MyParserPerceptronScorer.java │ │ ├── MyRealizerPerceptronScorer.java │ │ ├── MySynAgrFeatureExtractor.java │ │ ├── MySynSemAgrFeatureExtractor.java │ │ └── MySynSemFeatureExtractor.java │ ├── stanford-nlp/ │ │ ├── classifiers/ │ │ │ └── stanfordner-README │ │ └── stanfordnlp-README │ └── templates/ │ ├── addFilterLexFeats.xsl │ ├── addStems.xsl │ ├── adjustAppos.xsl │ ├── adjustCandCcats1.xsl │ ├── adjustCats.xsl │ ├── adjustParenthetical.xsl │ ├── adjustReportedSpeech.xsl │ ├── adjustRoles.xsl │ ├── adv-placement.xsl │ ├── agr-macroInsert.xsl │ ├── allotIdLeaf.xsl │ ├── allotIdTree.xsl │ ├── allotIndexRel.xsl │ ├── anim-macroInsert.xsl │ ├── annotateAppos-Dash.xsl │ ├── annotateAppos1.xsl │ ├── annotateAppos2.xsl │ ├── annotateAppos3.xsl │ ├── annotateBrackets.xsl │ ├── annotateColons.xsl │ ├── annotateDots.xsl │ ├── annotateExtraposedAppos.xsl │ ├── annotateNom-AdjConj.xsl │ ├── annotateParentheticals1.xsl │ ├── annotateParentheticals2.xsl │ ├── annotatePlace.xsl │ ├── annotatePrtConjs.xsl │ ├── annotateQuotes.xsl │ ├── annotateReportedSpeech.xsl │ ├── annotateStrayAppos.xsl │ ├── annotateVPCommas.xsl │ ├── balanceAppos.xsl │ ├── balanceDash-Paren.xsl │ ├── catCheck.xsl │ ├── ccgRules.xsl │ ├── changePunct.xsl │ ├── closedCatInsert.xsl │ ├── collapseMWUFull.xsl │ ├── collapseMWUPart.xsl │ ├── collapseMWUSharedTask.xsl │ ├── computeCats.xsl │ ├── convTags.xsl │ ├── correctMistakes1.xsl │ ├── correctPPHeads.xsl │ ├── exportToAuto.xsl │ ├── filterLex.xsl │ ├── find-s-back-n.xsl │ ├── genchal11-out.xsl │ ├── inferConjRules.xsl │ ├── insertLF.xsl │ ├── insertOrigPunctsLF.xsl │ ├── insertPTBInfo.xsl │ ├── insertPunctLF-PosMod.xsl │ ├── insertPunctLF.xsl │ ├── insertQuoteSemClassInfo.xsl │ ├── insertSemFeats.xsl │ ├── introduceMMExtns.xsl │ ├── labelAppos.xsl │ ├── labelConj1.xsl │ ├── labelConj2.xsl │ ├── labelConj3.xsl │ ├── labelPlace1.xsl │ ├── labelPlace2.xsl │ ├── labelPuncts.xsl │ ├── lexExtr.xsl │ ├── macroInsert.xsl │ ├── macroLexDef.xsl │ ├── markMistakes.xsl │ ├── markUnmatched.xsl │ ├── mergeMorph.xsl │ ├── morphExtr.xsl │ ├── normPTBTags.xsl │ ├── normPunctPos.xsl │ ├── origPunctRules.xsl │ ├── overtWHLexRels.xsl │ ├── overtWHPronouns.xsl │ ├── phraseExtractor.xsl │ ├── preSentAdj.xsl │ ├── punctLexConjRules.xsl │ ├── reinsertPTBInfo.xsl │ ├── repairUnmatched.xsl │ ├── replaceColons.xsl │ ├── rulesExtr.xsl │ ├── sentFinalPuncts.xsl │ ├── trueCaser.xsl │ └── uncurryBareParse.xsl ├── devel/ │ ├── BEN.TODO │ └── schedule.txt ├── docs/ │ ├── build.xml │ ├── ccgbank-README │ ├── guide/ │ │ ├── build.xml │ │ ├── cgloss4e.sty │ │ ├── gb4e.sty │ │ ├── guide.tex │ │ ├── openccg.bib │ │ └── openccg.sty │ ├── index.html │ ├── maxent.cpp.patch │ ├── realizer/ │ │ ├── build.xml │ │ ├── cgloss4e.sty │ │ ├── gb4e.sty │ │ ├── manual.tex │ │ ├── openccg.sty │ │ └── refs.bib │ ├── style.css │ └── 
taggers-README ├── grammars/ │ ├── add-chunks.xsl │ ├── add-family-members.xsl │ ├── append.xsl │ ├── categories.xsd │ ├── comic/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── dict.xsl │ │ ├── grammar.xml │ │ ├── lexicon-base.xsl │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── rules-base.xml │ │ ├── rules.xml │ │ ├── testbed.xml │ │ ├── types-extras.xml │ │ └── types.xml │ ├── convert-lists.xsl │ ├── convert-to-graph.xsl │ ├── convert-to-hlds.xsl │ ├── core-en/ │ │ ├── add-chunks.xsl │ │ ├── add-intonation-info.xsl │ │ ├── adj.xsl │ │ ├── adv.xsl │ │ ├── auxv.xsl │ │ ├── cats.xsl │ │ ├── conj.xsl │ │ ├── derive-features.xsl │ │ ├── det.xsl │ │ ├── dict.xsl │ │ ├── drop-features.xsl │ │ ├── lexicon.xsl │ │ ├── misc.xsl │ │ ├── np.xsl │ │ ├── pp.xsl │ │ ├── punct.xsl │ │ ├── raise-nodes.xsl │ │ ├── templates.xsl │ │ ├── types.xml │ │ ├── unary-rules.xsl │ │ └── v.xsl │ ├── dict.xsd │ ├── extract-morph.xsl │ ├── flights/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── dict.xsl │ │ ├── flairs.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xsl │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── nina107.xml │ │ ├── rules-base.xml │ │ ├── rules.xml │ │ ├── testbed.xml │ │ ├── types-extras.xml │ │ ├── types.xml │ │ └── vera.xml │ ├── grammar.xsd │ ├── hlds.xsd │ ├── lexicon.xsd │ ├── mini-basque/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── parameters.xml │ │ ├── preset-families.xml │ │ ├── rules.xml │ │ ├── testbed.out │ │ ├── testbed.xml │ │ └── types.xml │ ├── mini-dyirbal/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── parameters.xml │ │ ├── preset-families.xml │ │ ├── rules.xml │ │ ├── testbed.out │ │ ├── testbed.xml │ │ └── types.xml │ ├── mini-english/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── parameters.xml │ │ ├── preset-families.xml │ │ ├── rules.xml │ │ ├── testbed.out │ │ ├── testbed.xml │ │ └── types.xml │ ├── mini-inuit/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── parameters.xml │ │ ├── preset-families.xml │ │ ├── rules.xml │ │ ├── testbed.out │ │ ├── testbed.xml │ │ └── types.xml │ ├── mini-nezperce/ │ │ ├── build.xml │ │ ├── parameters.xml │ │ ├── preset-families.xml │ │ └── types.xml │ ├── mini-tagalog/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── parameters.xml │ │ ├── preset-families.xml │ │ ├── rules.xml │ │ ├── testbed.out │ │ ├── testbed.xml │ │ └── types.xml │ ├── mini-turkish/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── parameters.xml │ │ ├── preset-families.xml │ │ ├── rules.xml │ │ ├── testbed.out │ │ ├── testbed.xml │ │ └── types.xml │ ├── morph.xsd │ ├── parameters.xsd │ ├── parametric-lexicon.xsl │ ├── parametric-types.xsl │ ├── raise-nodes.xsl │ ├── routes/ │ │ ├── build.xml │ │ ├── dict.xml │ │ ├── dlf_test.xml │ │ ├── grammar.xml │ │ ├── lexicon-base.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── rules.xml │ │ ├── testbed.xml │ │ └── types.xml │ ├── rules.xsd │ ├── simplify-lists.xsl │ ├── tiny/ │ │ ├── build.xml │ │ ├── grammar.xml │ │ ├── lexicon.xml │ │ ├── morph.xml │ │ ├── rules.xml │ │ ├── testbed.xml │ │ └── types.xml │ ├── tokens.xsd │ ├── treeify-lists.xsl │ ├── types.xsd │ └── worldcup/ │ 
├── add-chunks.xsl │ ├── build.xml │ ├── dict.xml │ ├── grammar.xml │ ├── lexicon-base.xsl │ ├── lexicon.xml │ ├── morph.xml │ ├── raise-nodes.xsl │ ├── rules.xml │ └── testbed.xml ├── lib/ │ ├── ASL │ ├── LGPL │ ├── LIBNOTES │ ├── MIT │ ├── SUN │ ├── jdom.license │ └── jline.license ├── pom.xml ├── src/ │ ├── ccg2xml/ │ │ ├── README │ │ ├── Tree.py │ │ ├── arabic.ccg │ │ ├── build.xml │ │ ├── ccg.ply │ │ ├── ccg_editor.py │ │ ├── convert-ply.py │ │ ├── grammar_template.ccg │ │ ├── lex.py │ │ └── yacc.py │ ├── kenlm/ │ │ ├── COPYING │ │ ├── COPYING.LESSER │ │ ├── LICENSE │ │ ├── README │ │ ├── build_jnilib.sh │ │ ├── clean_query_only.sh │ │ ├── jni/ │ │ │ └── wrap.cc │ │ ├── lm/ │ │ │ ├── Jamfile │ │ │ ├── bhiksha.cc │ │ │ ├── bhiksha.hh │ │ │ ├── binary_format.cc │ │ │ ├── binary_format.hh │ │ │ ├── blank.hh │ │ │ ├── build_binary_main.cc │ │ │ ├── config.cc │ │ │ ├── config.hh │ │ │ ├── enumerate_vocab.hh │ │ │ ├── facade.hh │ │ │ ├── fragment_main.cc │ │ │ ├── kenlm_max_order_main.cc │ │ │ ├── left.hh │ │ │ ├── left_test.cc │ │ │ ├── lm_exception.cc │ │ │ ├── lm_exception.hh │ │ │ ├── max_order.hh │ │ │ ├── model.cc │ │ │ ├── model.hh │ │ │ ├── model_test.cc │ │ │ ├── model_type.hh │ │ │ ├── ngram_query.hh │ │ │ ├── partial.hh │ │ │ ├── partial_test.cc │ │ │ ├── quantize.cc │ │ │ ├── quantize.hh │ │ │ ├── query_main.cc │ │ │ ├── read_arpa.cc │ │ │ ├── read_arpa.hh │ │ │ ├── return.hh │ │ │ ├── search_hashed.cc │ │ │ ├── search_hashed.hh │ │ │ ├── search_trie.cc │ │ │ ├── search_trie.hh │ │ │ ├── sizes.cc │ │ │ ├── sizes.hh │ │ │ ├── state.hh │ │ │ ├── test.arpa │ │ │ ├── test_nounk.arpa │ │ │ ├── trie.cc │ │ │ ├── trie.hh │ │ │ ├── trie_sort.cc │ │ │ ├── trie_sort.hh │ │ │ ├── value.hh │ │ │ ├── value_build.cc │ │ │ ├── value_build.hh │ │ │ ├── virtual_interface.cc │ │ │ ├── virtual_interface.hh │ │ │ ├── weights.hh │ │ │ └── word_index.hh │ │ └── util/ │ │ ├── Jamfile │ │ ├── bit_packing.cc │ │ ├── bit_packing.hh │ │ ├── bit_packing_test.cc │ │ ├── double-conversion/ │ │ │ ├── Jamfile │ │ │ ├── LICENSE │ │ │ ├── bignum-dtoa.cc │ │ │ ├── bignum-dtoa.h │ │ │ ├── bignum.cc │ │ │ ├── bignum.h │ │ │ ├── cached-powers.cc │ │ │ ├── cached-powers.h │ │ │ ├── diy-fp.cc │ │ │ ├── diy-fp.h │ │ │ ├── double-conversion.cc │ │ │ ├── double-conversion.h │ │ │ ├── fast-dtoa.cc │ │ │ ├── fast-dtoa.h │ │ │ ├── fixed-dtoa.cc │ │ │ ├── fixed-dtoa.h │ │ │ ├── ieee.h │ │ │ ├── strtod.cc │ │ │ ├── strtod.h │ │ │ └── utils.h │ │ ├── ersatz_progress.cc │ │ ├── ersatz_progress.hh │ │ ├── exception.cc │ │ ├── exception.hh │ │ ├── fake_ofstream.hh │ │ ├── file.cc │ │ ├── file.hh │ │ ├── file_piece.cc │ │ ├── file_piece.hh │ │ ├── file_piece_test.cc │ │ ├── getopt.c │ │ ├── getopt.hh │ │ ├── have.hh │ │ ├── joint_sort.hh │ │ ├── joint_sort_test.cc │ │ ├── mmap.cc │ │ ├── mmap.hh │ │ ├── multi_intersection.hh │ │ ├── multi_intersection_test.cc │ │ ├── murmur_hash.cc │ │ ├── murmur_hash.hh │ │ ├── pcqueue.hh │ │ ├── pool.cc │ │ ├── pool.hh │ │ ├── probing_hash_table.hh │ │ ├── probing_hash_table_test.cc │ │ ├── proxy_iterator.hh │ │ ├── read_compressed.cc │ │ ├── read_compressed.hh │ │ ├── read_compressed_test.cc │ │ ├── scoped.cc │ │ ├── scoped.hh │ │ ├── sized_iterator.hh │ │ ├── sorted_uniform.hh │ │ ├── sorted_uniform_test.cc │ │ ├── string_piece.cc │ │ ├── string_piece.hh │ │ ├── string_piece_hash.hh │ │ ├── thread_pool.hh │ │ ├── tokenize_piece.hh │ │ ├── tokenize_piece_test.cc │ │ ├── usage.cc │ │ └── usage.hh │ ├── opennlp/ │ │ ├── ccg/ │ │ │ ├── Parse.java │ │ │ ├── Realize.java │ │ │ ├── TextCCG.java │ │ │ 
├── WebCCG.java │ │ │ ├── alignment/ │ │ │ │ ├── AbstractEncodingScheme.java │ │ │ │ ├── Alignment.java │ │ │ │ ├── Alignments.java │ │ │ │ ├── EncodingScheme.java │ │ │ │ ├── IdentifiedPhraseReader.java │ │ │ │ ├── IdentifiedPhraseWriter.java │ │ │ │ ├── IndexBase.java │ │ │ │ ├── Mapping.java │ │ │ │ ├── MappingFormat.java │ │ │ │ ├── MappingGroup.java │ │ │ │ ├── MappingReader.java │ │ │ │ ├── MappingWriter.java │ │ │ │ ├── MosesEncodingScheme.java │ │ │ │ ├── NAACLEncodingScheme.java │ │ │ │ ├── Phrase.java │ │ │ │ ├── PhrasePosition.java │ │ │ │ ├── PhraseReader.java │ │ │ │ ├── PhraseWriter.java │ │ │ │ ├── Status.java │ │ │ │ └── package.html │ │ │ ├── disjunctivizer/ │ │ │ │ ├── AlignedEdgeFilter.java │ │ │ │ ├── Disjunctivizer.java │ │ │ │ ├── EdgeMatchFilter.java │ │ │ │ ├── FilteredLFEdgeSet.java │ │ │ │ ├── LFGraphDifference.java │ │ │ │ ├── LabelMatchFilter.java │ │ │ │ ├── MatchType.java │ │ │ │ ├── MatchTypeFilter.java │ │ │ │ ├── VertexMatchFilter.java │ │ │ │ └── package.html │ │ │ ├── grammar/ │ │ │ │ ├── AbstractApplicationRule.java │ │ │ │ ├── AbstractCompositionRule.java │ │ │ │ ├── AbstractRule.java │ │ │ │ ├── AbstractSubstitutionRule.java │ │ │ │ ├── AbstractTypeRaisingRule.java │ │ │ │ ├── BackwardApplication.java │ │ │ │ ├── BackwardComposition.java │ │ │ │ ├── BackwardSubstitution.java │ │ │ │ ├── BackwardTypeRaising.java │ │ │ │ ├── ForwardApplication.java │ │ │ │ ├── ForwardComposition.java │ │ │ │ ├── ForwardSubstitution.java │ │ │ │ ├── ForwardTypeRaising.java │ │ │ │ ├── FragmentJoining.java │ │ │ │ ├── GlueRule.java │ │ │ │ ├── Grammar.java │ │ │ │ ├── Rule.java │ │ │ │ ├── RuleGroup.java │ │ │ │ ├── TypeChangingRule.java │ │ │ │ ├── Types.java │ │ │ │ └── to-apml.xsl │ │ │ ├── grammardoc/ │ │ │ │ ├── AbstractDocumenter.java │ │ │ │ ├── Documenter.java │ │ │ │ ├── DocumenterContext.java │ │ │ │ ├── DocumenterException.java │ │ │ │ ├── DocumenterFactory.java │ │ │ │ ├── DocumenterName.java │ │ │ │ ├── DocumenterNotFoundException.java │ │ │ │ ├── DocumenterSourceException.java │ │ │ │ ├── GrammarDoc.java │ │ │ │ ├── GrammarDocException.java │ │ │ │ ├── SourceGrammar.java │ │ │ │ ├── SourceGrammarFile.java │ │ │ │ ├── SourceGrammarFileType.java │ │ │ │ └── html/ │ │ │ │ ├── HTMLDocumenter.java │ │ │ │ ├── base.xsl │ │ │ │ ├── categories.xsl │ │ │ │ ├── comments.xsl │ │ │ │ ├── grammar.xsl │ │ │ │ ├── grammardoc.css │ │ │ │ ├── lexicon.js │ │ │ │ ├── lexicon.xsl │ │ │ │ ├── morph.xsl │ │ │ │ ├── navigation.xsl │ │ │ │ ├── rules.xsl │ │ │ │ └── types.xsl │ │ │ ├── hylo/ │ │ │ │ ├── Alt.java │ │ │ │ ├── Box.java │ │ │ │ ├── Compacter.java │ │ │ │ ├── Converter.java │ │ │ │ ├── Diamond.java │ │ │ │ ├── EPsScorer.java │ │ │ │ ├── EnglishAgreementExtractor.java │ │ │ │ ├── Flattener.java │ │ │ │ ├── HyloAtom.java │ │ │ │ ├── HyloFormula.java │ │ │ │ ├── HyloHelper.java │ │ │ │ ├── HyloVar.java │ │ │ │ ├── LexDepFeatureExtractor.java │ │ │ │ ├── LexDependency.java │ │ │ │ ├── ModalOp.java │ │ │ │ ├── Mode.java │ │ │ │ ├── ModeLabel.java │ │ │ │ ├── ModeVar.java │ │ │ │ ├── Nominal.java │ │ │ │ ├── NominalAtom.java │ │ │ │ ├── NominalVar.java │ │ │ │ ├── Op.java │ │ │ │ ├── Proposition.java │ │ │ │ ├── SatOp.java │ │ │ │ └── graph/ │ │ │ │ ├── DefaultLFEdgeFactory.java │ │ │ │ ├── LFEdge.java │ │ │ │ ├── LFEdgeFactory.java │ │ │ │ ├── LFEdgeLabel.java │ │ │ │ ├── LFGraph.java │ │ │ │ ├── LFGraphFactory.java │ │ │ │ ├── LFVertex.java │ │ │ │ ├── LFVertexType.java │ │ │ │ └── package.html │ │ │ ├── lexicon/ │ │ │ │ ├── DataItem.java │ │ │ │ ├── DefaultTokenizer.java │ │ 
│ │ ├── EnglishExpander.java │ │ │ │ ├── EntriesItem.java │ │ │ │ ├── FactorChainWord.java │ │ │ │ ├── FactorKey.java │ │ │ │ ├── Family.java │ │ │ │ ├── FullWord.java │ │ │ │ ├── LexException.java │ │ │ │ ├── Lexicon.java │ │ │ │ ├── LicensingFeature.java │ │ │ │ ├── ListPairWord.java │ │ │ │ ├── MacroAdder.java │ │ │ │ ├── MacroItem.java │ │ │ │ ├── MorphItem.java │ │ │ │ ├── SimpleWord.java │ │ │ │ ├── SupertaggerAdapter.java │ │ │ │ ├── Tokenizer.java │ │ │ │ ├── TrueCaser.java │ │ │ │ ├── Word.java │ │ │ │ └── WordWithPitchAccent.java │ │ │ ├── ngrams/ │ │ │ │ ├── AAnFilter.java │ │ │ │ ├── AbstractStandardNgramModel.java │ │ │ │ ├── ConditionalProbabilityTable.java │ │ │ │ ├── FactoredNgramModel.java │ │ │ │ ├── FactoredNgramModelFamily.java │ │ │ │ ├── KenNgramModel.java │ │ │ │ ├── LinearNgramScorerCombo.java │ │ │ │ ├── NgramDiversityPruningStrategy.java │ │ │ │ ├── NgramFilter.java │ │ │ │ ├── NgramPrecisionModel.java │ │ │ │ ├── NgramScorer.java │ │ │ │ ├── RepetitionScorer.java │ │ │ │ ├── Reversible.java │ │ │ │ ├── SRILMNgramModel.java │ │ │ │ ├── SRILMNgramModelType.java │ │ │ │ ├── SRILM_FactoredScorerMaker.java │ │ │ │ ├── SRILM_ScorerMaker.java │ │ │ │ ├── SelfParaphraseBiaser.java │ │ │ │ ├── SignScorerInterpolation.java │ │ │ │ ├── SignScorerProduct.java │ │ │ │ ├── StandardNgramModel.java │ │ │ │ └── kenlm/ │ │ │ │ ├── MurmurHash.java │ │ │ │ └── jni/ │ │ │ │ └── KenLM.java │ │ │ ├── parse/ │ │ │ │ ├── Chart.java │ │ │ │ ├── DerivationHistory.java │ │ │ │ ├── Edge.java │ │ │ │ ├── EdgeHash.java │ │ │ │ ├── ParseException.java │ │ │ │ ├── Parser.java │ │ │ │ ├── Supertagger.java │ │ │ │ ├── postagger/ │ │ │ │ │ ├── BasicPOSTagger.java │ │ │ │ │ ├── DummyPOSTagger.java │ │ │ │ │ ├── POSTagSequenceGetter.java │ │ │ │ │ ├── POSTagger.java │ │ │ │ │ └── ml/ │ │ │ │ │ ├── POSPriorModel.java │ │ │ │ │ └── POSTagFex.java │ │ │ │ ├── supertagger/ │ │ │ │ │ ├── JavaSupertaggingApp.java │ │ │ │ │ ├── LabellingStrategy.java │ │ │ │ │ ├── WordAndPOSDictionaryLabellingStrategy.java │ │ │ │ │ ├── io/ │ │ │ │ │ │ ├── XMLPOSDictionaryReader.java │ │ │ │ │ │ └── XMLWordDictionaryReader.java │ │ │ │ │ ├── ml/ │ │ │ │ │ │ ├── FeatureExtractor.java │ │ │ │ │ │ ├── STFex.java │ │ │ │ │ │ ├── STPriorModel.java │ │ │ │ │ │ └── ZhangLeTrainingExtractor.java │ │ │ │ │ └── util/ │ │ │ │ │ ├── PipedTokenizer.java │ │ │ │ │ ├── ProbPairComparator.java │ │ │ │ │ ├── STTaggerDictionary.java │ │ │ │ │ ├── STTaggerPOSDictionary.java │ │ │ │ │ ├── STTaggerWordDictionary.java │ │ │ │ │ ├── SupertagSequenceGetter.java │ │ │ │ │ └── TaggingDictionaryExtractor.java │ │ │ │ └── tagger/ │ │ │ │ ├── Constants.java │ │ │ │ ├── ProbIndexPair.java │ │ │ │ ├── TaggedWord.java │ │ │ │ ├── io/ │ │ │ │ │ ├── CorpusIterator.java │ │ │ │ │ ├── PipeDelimitedFactoredBundleCorpusIterator.java │ │ │ │ │ └── SRILMFactoredBundleCorpusIterator.java │ │ │ │ ├── ml/ │ │ │ │ │ ├── MaxentModel.java │ │ │ │ │ ├── TaggerFeature.java │ │ │ │ │ ├── ZLMEM.java │ │ │ │ │ └── ZLMaxentModel.java │ │ │ │ ├── sequencescoring/ │ │ │ │ │ ├── Backpointer.java │ │ │ │ │ ├── FBNode.java │ │ │ │ │ ├── SequenceScorer.java │ │ │ │ │ └── Trellis.java │ │ │ │ └── util/ │ │ │ │ ├── CCGBankToSRILMFLM.java │ │ │ │ ├── ConfigFileProcessor.java │ │ │ │ └── ResultSink.java │ │ │ ├── perceptron/ │ │ │ │ ├── Alphabet.java │ │ │ │ ├── ComposedFeatureExtractor.java │ │ │ │ ├── ComposedFeatureVector.java │ │ │ │ ├── EventFile.java │ │ │ │ ├── FeatureExtractor.java │ │ │ │ ├── FeatureList.java │ │ │ │ ├── FeatureMap.java │ │ │ │ ├── FeatureVector.java │ │ │ │ 
├── Model.java │ │ │ │ ├── PerceptronScorer.java │ │ │ │ ├── ReRankingPerceptronScorer.java │ │ │ │ └── Trainer.java │ │ │ ├── realize/ │ │ │ │ ├── Chart.java │ │ │ │ ├── DiversityPruningStrategy.java │ │ │ │ ├── Edge.java │ │ │ │ ├── EdgeCombos.java │ │ │ │ ├── EdgeFactory.java │ │ │ │ ├── EdgeHash.java │ │ │ │ ├── FeatureLicenser.java │ │ │ │ ├── Hypertagger.java │ │ │ │ ├── LexicalDiversityPruningStrategy.java │ │ │ │ ├── NBestPruningStrategy.java │ │ │ │ ├── PruningStrategy.java │ │ │ │ ├── Realizer.java │ │ │ │ ├── RuleInstance.java │ │ │ │ ├── StemPruningStrategy.java │ │ │ │ ├── Tracker.java │ │ │ │ └── hypertagger/ │ │ │ │ ├── FeatureExtractionException.java │ │ │ │ ├── LFInfo.java │ │ │ │ ├── LFLoader.java │ │ │ │ ├── LMFactorExtractor.java │ │ │ │ ├── TagExtract.java │ │ │ │ ├── TagExtractor.java │ │ │ │ ├── ZLMaxentHypertagger.java │ │ │ │ ├── ZLMaxentModel.java │ │ │ │ └── ZLPOSTagger.java │ │ │ ├── synsem/ │ │ │ │ ├── AbstractCat.java │ │ │ │ ├── Arg.java │ │ │ │ ├── ArgStack.java │ │ │ │ ├── AtomCat.java │ │ │ │ ├── BasicArg.java │ │ │ │ ├── CatReader.java │ │ │ │ ├── Category.java │ │ │ │ ├── CategoryFcn.java │ │ │ │ ├── CategoryFcnAdapter.java │ │ │ │ ├── ComplexCat.java │ │ │ │ ├── DerivationHandler.java │ │ │ │ ├── Dollar.java │ │ │ │ ├── GenerativeSyntacticModel.java │ │ │ │ ├── LF.java │ │ │ │ ├── LexLogProbFeatureExtractor.java │ │ │ │ ├── LexSemOrigin.java │ │ │ │ ├── Modality.java │ │ │ │ ├── ReRankingScorer.java │ │ │ │ ├── SetArg.java │ │ │ │ ├── Sign.java │ │ │ │ ├── SignHash.java │ │ │ │ ├── SignScorer.java │ │ │ │ ├── Slash.java │ │ │ │ ├── SlashMode.java │ │ │ │ ├── SyntacticFeatureExtractor.java │ │ │ │ ├── TargetCat.java │ │ │ │ └── VarModality.java │ │ │ ├── test/ │ │ │ │ ├── CrossValidateRealizer.java │ │ │ │ ├── DerivMaker.java │ │ │ │ ├── GenTargets.java │ │ │ │ ├── Regression.java │ │ │ │ ├── RegressionInfo.java │ │ │ │ ├── ScorerMaker.java │ │ │ │ ├── TimingMap.java │ │ │ │ ├── UpdateTestbed.java │ │ │ │ └── Validator.java │ │ │ ├── unify/ │ │ │ │ ├── EmptySubstitution.java │ │ │ │ ├── Feature.java │ │ │ │ ├── FeatureStructure.java │ │ │ │ ├── GFeatStruc.java │ │ │ │ ├── GFeatVar.java │ │ │ │ ├── GSubstitution.java │ │ │ │ ├── GUnifier.java │ │ │ │ ├── Indexed.java │ │ │ │ ├── ModFcn.java │ │ │ │ ├── Mutable.java │ │ │ │ ├── SelfCondensingSub.java │ │ │ │ ├── SimpleSubstitution.java │ │ │ │ ├── SimpleType.java │ │ │ │ ├── Substitution.java │ │ │ │ ├── Unifiable.java │ │ │ │ ├── Unifier.java │ │ │ │ ├── UnifyControl.java │ │ │ │ ├── UnifyFailure.java │ │ │ │ └── Variable.java │ │ │ └── util/ │ │ │ ├── ArrayListWithIdentityEquals.java │ │ │ ├── CompositeFilter.java │ │ │ ├── DelegatedFilter.java │ │ │ ├── DisplayPrefs.java │ │ │ ├── Filter.java │ │ │ ├── FilteredMap.java │ │ │ ├── FilteredSet.java │ │ │ ├── GroupMap.java │ │ │ ├── IntHashSetMap.java │ │ │ ├── Interner.java │ │ │ ├── InverseFilter.java │ │ │ ├── JLineReader.java │ │ │ ├── LineReader.java │ │ │ ├── ListMap.java │ │ │ ├── MembershipFilter.java │ │ │ ├── Pair.java │ │ │ ├── SingletonList.java │ │ │ ├── StructureSharingList.java │ │ │ ├── TrieMap.java │ │ │ ├── VisitedFilter.java │ │ │ ├── Visualizer.java │ │ │ └── XmlScanner.java │ │ └── ccgbank/ │ │ ├── CCGBankConvert.java │ │ ├── CCGBankExtract.java │ │ ├── CCGBankTask.java │ │ ├── CCGBankTaskFileGroup.java │ │ ├── CCGBankTaskSources.java │ │ ├── CCGBankTaskTemplates.java │ │ ├── CCGBankTaskTestbed.java │ │ ├── InputSourceAdapter.java │ │ ├── TemplatesProcessor.java │ │ ├── XMLFilterProcessor.java │ │ ├── XSLTProcessor.java │ │ ├── 
ccgbank.properties │ │ ├── convert/ │ │ │ ├── ApposTally.java │ │ │ ├── DiscrCheck.java │ │ │ ├── GenChal11Adjuster.java │ │ │ ├── GenConjRule.java │ │ │ ├── InfoHelper.java │ │ │ ├── Javafns.java │ │ │ ├── MWHelper.java │ │ │ ├── MorphLookup.java │ │ │ ├── OrigPunctRules.java │ │ │ ├── PunctHelper.java │ │ │ ├── RoleAdjuster.java │ │ │ └── XSLTTrueCaser.java │ │ ├── extract/ │ │ │ ├── CatNode.java │ │ │ ├── DebugHelper.java │ │ │ ├── DefaultLFHelper.java │ │ │ ├── ExtractGrammar.java │ │ │ ├── FreqTally.java │ │ │ ├── InsertLFHelper.java │ │ │ ├── LexExtract.java │ │ │ ├── MorphExtrHelper.java │ │ │ ├── MorphExtract.java │ │ │ ├── RulesExtract.java │ │ │ ├── RulesTally.java │ │ │ └── Testbed.java │ │ ├── lexicon-base.xsl │ │ ├── parse/ │ │ │ ├── CCGbankDerivation.jjt │ │ │ ├── SimpleNode.java │ │ │ └── grammarInsert │ │ └── rules-base.xsl │ ├── pom.xml │ └── srilmbridge/ │ ├── Makefile │ └── srilmbridge.cpp └── test/ ├── grammar.xml ├── lexicon.xml ├── morph.xml ├── opennlp/ │ └── ccg/ │ ├── alignment/ │ │ ├── AlignmentTest.java │ │ ├── IdentifiedPhraseReaderWriterTest.java │ │ ├── IndexBaseTest.java │ │ ├── MappingFormatTest.java │ │ ├── MappingGroupTest.java │ │ ├── MappingReaderWriterTest.java │ │ ├── MappingTest.java │ │ ├── PhraseReaderWriterTest.java │ │ └── PhraseTest.java │ ├── disjunctivizer/ │ │ ├── AlignedEdgeFilterTest.java │ │ ├── DisjunctivizerTest.java │ │ ├── EdgeMatchFilterTest.java │ │ ├── FilteredLFEdgeSetTest.java │ │ ├── LFGraphDifferenceTest.java │ │ ├── LabelMatchFilterTest.java │ │ └── VertexMatchFilterTest.java │ ├── hylo/ │ │ └── graph/ │ │ ├── LFBaseTest.java │ │ ├── LFEdgeFactoryTest.java │ │ ├── LFEdgeTest.java │ │ ├── LFGraphTest.java │ │ └── LFVertexTest.java │ └── util/ │ ├── CompositeFilterTest.java │ ├── DelegatedFilterTest.java │ ├── FilteredMapTest.java │ ├── FilteredSetTest.java │ ├── InverseFilterTest.java │ ├── MembershipFilterTest.java │ └── VisitedFilterTest.java ├── output.xml ├── paraphrases.xml ├── rules.xml └── testlf.xml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.sw? 
*~ *.jar *.so *.class *.pyc *.tgz *.gz *.lm *.3bo *.mod vocab.* !vocab.flm .project .classpath bin/ccg2xml.py bin/lex.py bin/yacc.py bin/ccg_editor.py bin/Tree.py ccgbank/convert/ ccgbank/data/novel/two-sents.dir/ ccgbank/extract/* !ccgbank/extract/grammar.xml !ccgbank/extract/*.xsl ccgbank/feats/hypertagger/ ccgbank/feats/parser/ ccgbank/feats/realizer/ ccgbank/feats/supertagger/ ccgbank/logs/ ccgbank/models/*/*dict* ccgbank/models/realizer/excl/ ccgbank/original/corpus/ ccgbank/original/feats/postagger/ ccgbank/original/feats/supertagger/ ccgbank/original/logs/ ccgbank/propccgbank/ ccgbank/stanford-nlp/*.jar ccgbank/stanford-nlp/classifiers/*.prop docs/api/ docs/grammars-rough-guide.pdf docs/realizer-manual.pdf docs/guide/guide.* !docs/guide/guide.tex docs/realizer/manual.* !docs/realizer/manual.tex lib/openccg.jar output/ src/ccg2xml/ccg2xml.py src/srilmbridge/*.h ================================================ FILE: AUTHORS ================================================ Main Authors: Core Java Code: Jason Baldridge Gann Bierner Michael White CCG-to-XML: Ben Wing Hypertagger: Dominic Espinosa Supertagger: Dennis Mehay Disjunctivizer: Scott Martin Additional Contributors: Jonathan Barker semantic graph visualization tool Cem Bozsahin grammars from Bozsahin and Steedman (2003) Gunes Erkan handling of type hierarchies Dennis Mehay KenLM interface (in addition to supertagger) Scott Martin GrammarDoc incorporating the SRILM toolkit for scoring build process for CCGbank grammar extraction (in addition to disjunctivizer) Rajakrishnan Rajkumar build files and XSLT transforms for CCGbank grammar extraction English agreement model David Reitter command completion and per grammar history "tiny" grammar Alexandros Triantafyllidis visualization of derivations via latex Ben Wing wccg and WebCCG code; CCG-format grammars (in addition to ccg2xml) ================================================ FILE: CHANGES ================================================ 0.9.6 - ... ----------- * Updated .gitignore, CHANGES and docs/index.html for transition to GitHub 0.9.5 - dependency length minimization, disjunctivizer, KenLM ------------------------------------------------------------- * Added features for dependency ordering and dependency length minimization in realization. * Added disjunctivizer package, for creating a disjunctive LF XML structure based on an LF graph difference. * Added support for using a very large 5-gram memory-mapped language model with KenLM on linux. * Added n-best parser output. * Added option for in-memory perceptron training. 0.9.4 - broad coverage paraphrasing, CCGbank training ----------------------------------------------------- * Added Hockenmaier-style generative probability model for parsing and realization. * Added supertagger and use of adaptive supertagging in parsing. * Added build files for CCGbank training, documented in docs/ccgbank-README, as well as ones for parsing and realizing novel text (thereby generating grammatical paraphrases). * Added release targets for CCGbank data and pre-built English models. * Added use of Stanford tokenizer, morphological analyzer and named entity recognizer in parsing novel text. * Added use of ordinary hashing for lex signs, so that signs that differ only in the pos tag can be distinguished (for robustness). * Added hypertagger input option and derivation history output to ccg-realize. * Added n-best realization output to ccg-test. * Added tracking of lex heads to signs via modifier attr on slashes.
* Added gold standard pred info for training with hypertagger. * Added initial syntactic feature extractor. * Added caching of supertags in cats. * Added option to use word positions in converting atoms in the LFs, which is now the default. Added :nowordpos command in tccg to change the preference to the lexical naming option. * Changed tccg to also update Grammar.theGrammar.prefs, which seems to have fixed issue with :nosem option not working. * Refactored feature extraction to use a trie for representing features as a sequence of interned string keys, to allow for lazy feature extraction that more quickly filters features not in the alphabet. * Added serialization of signs. * Added python script for drawing derivs in .auto files as trees (uses NLTK). * Added cell pruning limit in realization. * Added support for 'magic tokens' (like numbers) in ccg2xml, contributed by vanjena@users.sourceforge.net. * Turned off caching of category hash codes b/c of problems with stale values (a method of checking for staleness might be added later). * Improved utf8 support (esp. for macs). Note that utf8 support seems hopelessly broken for the Windows command-line, in that none of the available terminal apps (including for cygwin) both display characters correctly and work with tccg. I/O to files works fine though. * Added xml escaping for bleu and nbest output. * Added ccg-draw-graph tool for visualizing semantic dependency graphs. 0.9.3 - minor changes ------------------- * Added runCommand method in Visualizer so that the latex visualization works on Linux * Added id info to test items and bleu output. * Changed default lex licensing feature to be last in Lexicon.loadLicensingFeatures. * Added loop for computing closure of licensed no sem edges in EdgeFactory. * Changed FeatureLicenser to unify feat strucs instead of cats. 0.9.2 - VisCCG release plus initial hypertagger support ------------------------------------------------------- * Added check for unary rule cycles in parser and realizer. * Added initial version of greedy fragment assembly in realization when a complete realization is not found. * Added case for composition of X/Y Y/Z where Y has arity 2. * Added option to filter rule apps by observed supercat-rule combos. * VisCCG: Please see the list of changes in the archives at http://comp.ling.utexas.edu/wiki/doku.php/openccg/dev * Added LexSemOrigin interface for tracking of origin of lexical predications back to a sign or unary type changing rule. * Removed unused LF in DataItem. * Added supertagger-based filtering to lexical lookup. * Upgraded to JDOM 1.1. * Upgraded parser to use ambiguity packing. * Added scoring and n-best pruning to parser. * Refactored SignScorer to synsem package, for shared use by the realizer and parser. NB: This may require minor refactoring of imports and recompilation of realizer clients. * Changed realizer to check instantiation of outermost args by default, thereby improving completeness at minor cost to efficiency. Accordingly, renamed checkInstantiation flag in EdgeFactory to debugInstantiation, which now controls whether to report such cats to System.err. * Added hypertagger (realizer supertagger) interface and initial version of beta-best realization using it. * Changed Family.deriveSupertag to remove the semantic part of a cat name following a colon. 0.9.1 - New tools: grammardoc, ccg2xml; other misc updates ---------------------------------------------------------- * Changed dateFormatNoYear to "*.MM.dd" to avoid ambiguity with numbers.
* Changed Grammar.initializeTransformers to set indenting more robustly by adding try-catch blocks for illegal argument exceptions. * Refactored RuleGroup to apply unary and binary rules separately. * Refactored Lexicon and RuleGroup to load lex/morph/rule info incrementally, using a new XmlScanner utility class. These changes avoid the need to store large XML docs all in memory at once, while keeping the refactoring to a minimum. * Revised LF flattening to propagate the alts, opts & chunks based on the expression structure, rather than the graph structure. This change makes the 'shared' attribute (on nominal references) more transparent in how it works with disjunctions that operate on different levels of the tree. * Revised LF compaction to allow duplicate predications, where an attempt is made to attach them in different locations if possible. * Added GrammarDoc, which generates HTML documentation from a source grammar. See README, under `Generating Grammar Documentation' for more information. * Added initial version of ccg2xml, for specifying grammars in the more human-friendly .ccg format. * Changed build system - Made separate build files for ccg2xml and documentation - Made the `release' target of the main build file create a binary for distribution, instead of just the source 0.9.0 - Disjunctive LFs ----------------------- * Refactored realizer to put all no-sem edges on the agenda, which requires making an exception for edges with no indices in the implementation of the index filter, but otherwise yields a more uniform approach to creating edges. * Refactored realizer to use representative edges (one per cat) instead of edge groups, which ends up being simpler on the whole and should be easier to explain. * Refactored categories to allow for equality checks with and without taking the LFs into consideration. * Refactored edge equiv classes to use coverage bit vector and cat sans LF to check equality. * Refactored lex instantiation to produce all possible instantiations that respect the alt exclusivity constraints. * Changed Sign, DerivationHistory to store rule object. * Changed alt edge construction to create new LF from input signs and rule, since signs in equiv class of alts can now have different LFs. * Added active alt tracking and completing of edges with optional bits. * Changed HyloVar to check for equal types when checking for equality up to var renaming. * Refactored generics to avoid type warnings in Eclipse. * Relaxed LF chunking constraints to allow combinations with edges (or trackers more generally) that are shared across multiple alt set options. * Added "shared" attribute to nominal terms to indicate references to nodes that are shared across alternatives in a disjunctive LF; then revamped and reinforced the LF chunking constraints. * Fixed problem with signMap not pointing to opt-completed edge. * Improved edge printing from realizer chart to show derivations. * Updated realizer to keep edges whose signs have the simplest derivation, among those with the same surface words. * Added filter for ungrammatical test cases in ccg-test text output. * Added first draft of realizer manual. 0.8.6 - Java 1.5 switch, n-gram scoring improvements ---------------------------------------------------- * Added propagation of reverse flag on n-gram models. * Refactored LinearNgramScorerCombo and n-gram models to support interpolation at the word level. * Added caching of log probs in NgramScorer, to avoid recomputing log prob of words for a sign's initial sign.
* Added n-gram diversity pruning strategy. * Changed SignHash to only keep signs that are unique up to surface words, thereby ignoring different POS or supertags; also changed it to keep signs with lower derivational complexity during insertion. * Added reverse flag for loaded n-gram models with ccg-test, ccg-realize. * Fixed sentence delimiter text output for reversible standard n-gram models; made AAnFilter reversible. * Added Xalan 2.6.0 jars, to support Java 1.5 builds. * Added support for duration special tokens; note that the implementation has an unavoidable dependency on Java 1.5. 0.8.5 - "Rough Guide", sem types, command history/completion, and more ---------------------------------------------------------------------- * Added initial core-en/types.xml. * Generalized feature licensing to allow for selective listing of supertypes in the also-licensed-by attribute. * Fixed bug in unifying two vars with simple types. * Removed useless SignHash.values method; clarified intention to eventually remove this class. * Streamlined lexical access for realization. * Removed superfluous unique stamps in var classes. * Added support for using simple types (aka sorts) with semantic features and nominals. During category instantiation, a morph item's class is assigned to the nominal var(s) for the [*DEFAULT*] proposition, and the types of all nominal vars are then propagated to all other nominal vars with the same name, throughout the category. * Changed tokenizer keep-words-with-sem-classes option in grammar.xsd to replacement-sem-classes option, where all semantic classes to use in replacing words with sem classes for language models are listed. Also changed semantic class replacement routine to uppercase semantic class names. * Added initial sem types to core-en, comic, and flights grammars. * Fixed bug in constructing type hierarchies with multiple inheritance. * Added ccg-update tool, with initial task to add full words (pre-parsed) to the testbed file; also updated ccg-test to use the pre-parsed words when writing training text files. * Updated ccg-cvr tool to use full words when present. Also added filter to remove test item duplicates from cross-validation training sets. * Added reporting of mean reciprocal rank to ccg-test, as well as residual mean reciprocal rank, based on the cases that do not match the target exactly. * Updated ccg-cvr tool to work with factored language models. * Fixed null pointer exception in DefaultTokenizer.format, Word.setW methods. * Added timing of lex lookup to realization metrics. * Added David's JLine console support to tccg, with command completion and per grammar history. * Added handling of coarticulations in the lexicon. * Added caching of lex lookup during realization. * Updated to-apml.xsl to handle 'and' in multiword elements. * Updated visualizer to handle word lists and to ignore coarts. * Added repetition scorer, for discounting repetitive realizations. * Added scorer class, pruning strategy class options to ccg-realize. * Added workaround for saving command history correctly with Java 1.4 on Linux. * Added 'tiny' grammar. * Added grammars "rough guide". * Added supertag as another word attr. * Revamped LMs to use trie maps, for better speed & scalability. * Improved handling of nulls in FLMs. * Cleaned up word representations. * Added even/odd selection for scoring too in ccg-test. * Added -reverse and -scorer options to ccg-test. * Added reverse LM capability. * Made supertag attrs configurable. * Switched to JDOM 1.0. 
0.8.4 - Factored language models (initial support), packing/unpacking, and more --------------------------------------------------------------- * Added Alex's latex visualization of derivations (nb: launch of previewer works better on Windows than Linux) * Added customizable tokenization and expansion routines for dates/times/nums/amounts and other named entities. * Added -2apml option to ccg-test. * Added Word class and many related changes to tokenization. * Added -textf|-textfsc options to ccg-test, for writing files in the format expected by the SRILM toolkit for factored language models. * Updated copyright notices. * Changed ngram model to use canonical lists of words as keys, removing size restriction. * Added -aanfilter option to ccg-test, with an optional list of exceptions, which may be culled from bigram counts. * Added keep-words-with-sem-classes option to grammar.xsd, to specify exceptional semantic classes where the word form is also considered relevant for scoring models. NB: Also changed grammar.xsd to specify a custom tokenizer class name and/or keep-words-with-sem-classes on a separate tokenizer element. * Added support for factored language models with fixed backoff paths, arranged into families of models for different child variables, and with the option to have secondary models for shorter available histories. Also added corresponding -flm|-flmsc options to ccg-test. * Added option to do scoring in a second stage, starting from a packed representation. * Switch from cached combos to collected combos, making the anytime case more like the packed case. * Added compacting of gen forest when unpacking is turned off. * Added pretty-printing of regex-like gen forest. 0.8.3 - New efficiency methods, Cem-* grammars, and more --------------------------------------------------------------- * Added grammars from Bozsahin and Steedman (2003). * Improved instantiation of unary rules, ensuring that the first pred is used for indexing, and fixing a bug whereby a rule indexed by a lex pred would be missed. * Added initial capability to use semantic classes in n-gram scoring, as shown in ccg-realize. * Added LF chunking rules, which yields the most dramatic improvement in efficiency. * Added systematic feature-based licensing and instantiation. * Added caching of category combinations. * Added labeling of the phrase in the XML output headed by the index associated with the + semantic feature. * Added feature filtering and LF indenting to tccg display options. * Added XML configuration of LF relation sorting. * Added :2tb (to testbed) command for adding the current parse to the testbed. * Fixed grammar loading so it no longer has to be from the current directory. * Made it possible to list a stem as a member of an open class family with a separate pred, without getting an entry with the default pred too. * Enabled indexRel to be declared at the level of entries or families. * Added prefs import/export to tccg. * Added ccg-cvr tool for cross-validating realizer. * Reconfigured ccg-test with various new switches. * Put feature licensing on a switch. * Made pruning strategy configurable. * Changed representation of coord to work better with chunking (though less concise). 
* Added option to stop realizer after new best time limit (past first complete realization) is exceeded, via :nbtl N command 0.8.2 - Edge pruning during realization, XML/APML I/O, and more --------------------------------------------------------------- * Changed build to ccg-build, in bin directory; also added separate build.xml files to each sample grammar directory. This way, a call to ccg-build either builds the system or the current grammar, depending on what directory you're in. * Changed realizer to no longer allow unmatched attr preds (ie sem features). This way, the presence of certain sem features can be used to control realization choices, instead of requiring these features to always be present. To underspecify these choices, the idea is to eventually allow for their optional inclusion. * Added more options to turn settings off individually in tccg. * Enabled realizer to handle type changing rules with their own semantics in the result category. * Added configurable edge pruning per category during realization, which controls the number of edges with equivalent categories to keep in the chart. * Fixed unification bug by adding occurs checks to Dollar's fill method, needed at least in part b/c ArgStack doesn't quite implement Unifiable. * Replaced hashString with hashCode and equals up to var names, yielding a 4-5% improvement in efficiency. * Switched to grammar.xml file. If none exists, an attempt is made to load from the default files lexicon.xml, morph.xml and rules.xml. See grammar.xsd for format. * Added LF load/save from/to XML via a sequence of transformations specified in the grammar.xml file. * Added save-to-xml (:2xml) option for saving LFs to XML files from tccg. * Added save-to-apml (:2apml) option for saving last input string to APML files from tccg. * Updated parser to apply unary rules repeatedly. * Various updates to flights grammar, including use of FrameNet roles. 0.8.1 - OpenCCG Release with XML Schemas (!) ---------------------------------------------------- This release adds XML Schema validation to the grammar build process, where the comments in the XML schemas also serve as reference documentation for the grammar formats (wahoo!). The release also contains several bug fixes to the unification routines, and a more substantial "flights" grammar with semantic control over pitch accents and boundary tones. 0.8.0 - First OpenNLP CCG Library Release ---------------------------------------------------- Reorganized directories and renamed packages and tools. Added build target for worldcup sample grammar. Rewrote scripts for simplicity and parallelism. Cut out pre-processing components and any classes and libraries that looked like dead wood. Started removing unnecessary interfaces. Grok 0.7.0 - Towards a CCG Realizer ---------------------------------------------------- Mike is taking over Grok development and repurposing it for primary use as a CCG Realizer in limited domain dialogue systems. See http://www.iccs.informatics.ed.ac.uk/~mwhite/White-Baldridge-ENLG-2003-to-appear.pdf for a description of the effort so far. Version 0.7.0 will be the last Grok release. After this version, Grok will be split into separately usable and separately developed OpenNLP components. Tom Morton will be responsible for further development of the pre-processing components. Mike will be responsible for further development of the CCG parser and realizer. 
Grok 0.6.0 - Multi-Modal CCG ---------------------------------------------------- For more information, see Jason's dissertation available at: http://www.iccs.inf.ed.ac.uk/~jmb/dissertation See Grok site for further history ... ================================================ FILE: LICENSE ================================================ GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 (The master copy of this license lives on the GNU website.) Copyright (C) 1991, 1999 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder.
Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. 
A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. 
But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. 
If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. 
It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. 
It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS ================================================ FILE: README.md ================================================ # OpenCCG OpenCCG is a system for parsing and generating text using [combinatory categorial grammar](https://en.wikipedia.org/wiki/Combinatory_categorial_grammar) for syntax and [hybrid logic dependency semantics](https://www.aclweb.org/anthology/P02-1041) for, well, the semantic representation. If that seems like a mouthful, don't worry too much about the details right now. You can get started [installing OpenCCG](https://davehowcroft.com/post/installing-openccg/) and [working with OpenCCG using the `tccg` utility](https://davehowcroft.com/post/getting-started-with-openccg/) right now. If, on the other hand, you want to start understanding what that mouthful means, Johanna Moore at the University of Edinburgh has some [helpful course notes on NLG in general and OpenCCG in particular](https://www.inf.ed.ac.uk/teaching/courses/nlg/). # Project information See CHANGES for a description of the project status. Also see the OpenCCG web site and wiki at UT Austin: * http://openccg.sf.net * http://www.utcompling.com/wiki/openccg This `README.md` file contains the configuration and build instructions. Next you'll probably want to look at the tutorial on writing grammars in the human-friendly 'dot ccg' syntax on [the UT Austin OpenCCG wiki](http://www.utcompling.com/wiki/openccg/visccg-tutorial). After that it may be helpful to look at the "native" grammar specification in "Specifying Grammars for OpenCCG: A Rough Guide" in `docs/grammars-rough-guide.pdf`, as well as the `SAMPLE_GRAMMARS` file for descriptions of the sample grammars that come with the distribution, including ones using the DotCCG syntax. A (somewhat dated) programmer's guide to using the OpenCCG realizer appears in `docs/realizer-manual.pdf`. This release also includes a broad English coverage grammar from the CCGBank and associated statistical models; see `docs/ccgbank-README` for details. # Requirements * Version 1.6 or later of the Java 2 SDK (http://java.sun.com) * For ccg2xml and other tools, Python version 2.4 to 2.7 (http://www.python.org) # Libraries If you're working with the latest source version from GitHub, you'll need to download the external libraries from the latest release, as GitHub discourages including binaries in their repos: * Download the [latest release of OpenCCG from sourceforge](https://sourceforge.net/projects/openccg/) * Unpack the archive and copy over the files from `openccg/lib/`, as well as `openccg/ccgbank/bin/ner/NERApp.jar` * Build the latest source as described further below # Configuring your environment variables The easiest thing to do is to set the environment variables `JAVA_HOME` and `OPENCCG_HOME` to the relevant locations on your system. Set `JAVA_HOME` to match the top level directory containing the Java installation you want to use. 
For example, on Windows: ``` C:\> set JAVA_HOME=C:\Program Files\jdk1.6.0_04 ``` or on Unix: ``` % setenv JAVA_HOME /usr/local/java (csh) > export JAVA_HOME=/usr/java (ksh, bash) ``` On Windows, to get these settings to persist, it's actually easiest to set your environment variables through the System Properties from the Control Panel. For example, under WinXP, go to Control Panel, click on System Properties, choose the Advanced tab, click on Environment Variables, and add your settings in the User variables area. Next, likewise set `OPENCCG_HOME` to be the top level directory where you unzipped the download. In Unix, type `pwd` in the directory where this file is and use the path given to you by the shell as `OPENCCG_HOME`. You can set this in the same manner as for `JAVA_HOME` above. Next, add the directory `OPENCCG_HOME/bin` to your path. For example, you can set the path in your `.bashrc` file as follows: ``` > export PATH="$PATH:$OPENCCG_HOME/bin" ``` On Windows, you should also add the python main directory to your path. Finally, if you are going to use [KenLM](https://kheafield.com/code/kenlm/) with very large language models for realization with CCGbank-extracted grammars on linux, you'll also need to set the library load path: ``` > export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$OPENCCG_HOME/lib ``` Once you have taken care of these things, you should be able to build and use the OpenCCG Library. **Note**: Spaces are allowed in `JAVA_HOME` but not in `OPENCCG_HOME`. To set an environment variable with spaces in it, you need to put quotes around the value when on Unix, but you must *NOT* do this when under Windows. # Increasing Java memory limit If you're working with a broad coverage grammar and statistical parsing or realization models, you'll probably need to increase the default memory limit for running OpenCCG's tools. You can do so by editing `bin/ccg-env[.bat]`, increasing the JAVA_MEM environment variable at the end of this script. For training perceptron models in memory, you may need 16g; for realization with the very large gigaword 5-gram model, you may need 8g; otherwise, for parsing and realization with CCGbank-derived models, 4g or possibly even 2g should suffice; finally, for small grammars 512m or 256m should be ok. # Trying it out If you've managed to configure the system, you should be able to change to the directory for the "tiny" sample grammar and run `tccg` (for text ccg), the command-line tool for interactively testing grammars: ``` > cd grammars > cd tiny > tccg (Windows/Unix) ``` Provided tccg starts properly, it loads the grammar files, parses them, and shows the command-line interface (at which point you can type `:h` for help or `:q` to quit). If you have trouble starting up tccg, make sure you have set the environment variables properly, and that the tccg script (located in `openccg/bin`) calls the right shell environment (the top line of the script; to solve the problem, either comment out this line or correct the path). # Visualizing semantic graphs Semantic dependency graphs in testbed files can be visualized with the help of Graphviz's dot tool. First, download and install [Graphviz](http://www.graphviz.org/). Then, use tccg to create a testbed file with logical forms in it. For example, you can try some examples in the worldcup sample grammar and save them to a file using the command ':2tb tb.xml'. Then make a directory to store the visualized graphs.
Finally, run the ccg-draw-graph tool as shown below: ``` > cd grammars/worldcup > tccg (parse examples, save using ':2tb tb.xml') > mkdir graphs > ccg-draw-graph -i tb.xml -v graphs/g ``` You can also show the semantic classes or word indices using the `-c` or `-w` options, respectively. The graphs can be displayed with any PDF display tool. Note that the graph visualization requires the logical forms to be stored in an xml node-rel format for graphs, as in the worldcup or routes sample grammars. See `SAMPLE_GRAMMARS` for more information. # Creating disjunctive logical forms This release includes a new disjunctivizer package, for creating a disjunctive LF XML structure based on an LF graph difference. An LF graph difference is a characterization of the difference between two Hybrid Logic Dependency Semantics graphs and an alignment between them in terms of the edits needed to make one into the other: inserts, deletes, and substitutions. See the build file for junit tests that illustrate how to use the package. # Generating grammar documentation OpenCCG includes a tool for generating HTML documentation of the XML files that specify a grammar. It can be run either from the `ccg-grammardoc` script in the `bin/` directory, or as an Ant task. An example of how to incorporate GrammarDoc into an Ant build file is given in the "tiny" grammar (`grammars/tiny/build.xml`), in a build target called `document`. # Building the system from source The OpenCCG build system is based on Apache Ant. Ant is a little but very handy tool that uses a build file written in XML (`build.xml`) as building instructions. Building the Java portion of OpenCCG is accomplished using the script `ccg-build`; this works under Windows and Unix, but requires that you run it from the top-level directory (where the `build.xml` file is located). If everything is right and all the required packages are visible, this action will generate a file called openccg.jar in the `./lib` directory. Note that you should *not* build from source by invoking 'ant' directly. Instead, you should use `ccg-build` as shown below (Unix), after ensuring that you've set `OPENCCG_HOME`, `JAVA_HOME` and updated your `PATH` (the `ccg-build` script invokes ant with various parameters that aren't set properly if ant is invoked from the command line): ``` > cd $OPENCCG_HOME > ccg-build ``` # Working with the Eclipse IDE The Eclipse IDE can be used for editing the Java source code, though setup can be a bit tricky. The most reliable method seems to be as follows. First, follow the instructions above for building the source from the command line. Then, in Eclipse, choose File|New|Java Project to create a new Java Project, and give it a name, such as 'openccg'. Leave the default settings as they are, and click Next. Then choose Link Additional Source and browse to the folder `src/` in the directory where you installed OpenCCG (i.e. `$OPENCCG_HOME/src`). You'll need to give this location a new name, such as 'src2' ('src' is already taken by default). The final step is to Add External JARs under the Libraries tab. From OpenCCG's lib directory (i.e. `$OPENCCG_HOME/lib`), choose all of the `.jar` files. At this point, you should be able to hit Finish and the code should compile in Eclipse. Note that with Eclipse's default settings, the code will compile in your Eclipse workspace, which is separate from your OpenCCG installation (this is a good thing, as Eclipse uses a `bin/` directory for compiled Java classes, whereas OpenCCG uses `bin/` for command-line scripts). 
Thus, once you have made a round of changes in Eclipse and are ready to try them out in OpenCCG, go back to the command line in `$OPENCCG_HOME` and invoke `ccg-build` to re-build the `openccg.jar` file. This will make your changes available in OpenCCG's programs, such as `tccg`. # Bug Reports Please report bugs by creating [an issue with a description of the problem](https://github.com/OpenCCG/openccg/issues). ================================================ FILE: SAMPLE_GRAMMARS ================================================ This SAMPLE_GRAMMARS file describes the sample grammars that come with the distribution, and provides an overview of how the grammars are organized. Grammars written directly in the XML format used by OpenCCG appear in separate directories under grammars/. There are currently four small English grammars -- tiny, worldcup, flights, and comic -- plus a series of related grammars, mini-*, for Basque, Dyirbal, English, Inuit, Tagalog and Turkish, which are from Bozsahin and Steedman's (2003) study of ergativity. The worldcup grammar includes the English examples from Baldridge (2002). (The Dutch, Turkish, Tagalog, and Toba Batak grammars have not been updated from Grok version 0.6.) The flights and comic grammars (used in the FLIGHTS and COMIC systems) make use of a shared grammar of core English, in the core-en dir, and contain categories for pitch accents and boundary tones. Grammars written in the front-end `dot CCG' format, which attempts to provide a more powerful and easier-to-use format than the raw XML, are in separate directories under ccg-format-grammars/. There are currently three grammars here -- tiny, tinytiny, and arabic. `tiny' is a grammar originally based on the `tiny' English grammar contained in the grammars/ directory and documented above. It has been significantly expanded so as to demonstrate the various features of the CCG format. `tinytiny' is a smaller English grammar extracted from `tiny', which attempts to demonstrate a minimal-size useful grammar. `arabic' is a grammar of a large chunk of Classical Arabic, written by Ben Wing. It was created in particular to demonstrate the power of CCG-format macros in handling complex morphology, and contains a nearly full grammar of Arabic verbs. Dot CCG grammars are compiled using ccg2xml; run ccg2xml -h for usage. The best place to look for more info on the dot CCG format is in ccg-format-grammars/tiny/tiny.ccg and in src/ccg2xml/README. This release also includes a broad English coverage grammar from the CCGBank and associated statistical models; see docs/ccgbank-README for details. Note that with all the grammars, there is the option to store logical forms in an xml node-rel format for graphs. Conversion to this graph format is done using a couple of XSLT transforms specified in the grammar.xml file; see grammars/worldcup/grammar.xml for an example. When using this graph format, it is also possible to visualize the semantic graphs, as described in the main README file. At present, ccg2xml does not support writing grammar.xml files with the XSLT transforms for the node-rel graph format. As a workaround, you can add these transforms to your own version of the file which you then copy over the generated grammar.xml file, as shown below.
> ccg2xml --prefix= mygram.ccg > cp mygram-grammar.xml grammar.xml ================================================ FILE: TODO ================================================ General OpenCCG development: ---------------------------- - Add check for target LF when adding/writing full words (incl. supertags). - Look into better handling of optional args. - Extend feature hierarchy biz to work with category types; would make sense to also add category vars. - Add final bits to grammars rough guide (esp. feature licensing). - Add option to update testbed. - Binding theory? - Get agreement to work with anaphors in appositives. - Add more dynamic checks, eg for non-existent indexRel values or licensing attrs. - Add well-formedness check for unique roles -- ie, that role must occur only once per semantic head -- and associated method for declaring that roles must be unique. - Interface to morph transducers. - Make UnifyControl etc thread friendly. Could try tying global vars to current thread. - Improve unification efficiency. Could try indexing, caching across calls to parser or realizer, and structure sharing with delayed copying. Realizer-oriented development: ------------------------------ - Try using coarticulations with pitch accents. - Add orthographic post-processing (capitalization, spacing of punctuation). - Look into instantiating outermost args. TODO prior to Feb 16: --------------------- Tabs for testbed, lexicon, features Nice tree graph for features Get Arabic to compile Issues of colors, fonts, etc. (Alexis help) PNG's of the slash modalities Option menu for different magnification (50%, 100%, 200% ...) Get pretty buttons from Justin Make sure all path, etc. issues are working Figure out what's the deal with __init__.py -- importing from another dir Possibly: help-over descriptions of families, provided by Javadoc-style comments in the source Alexis -- help with more specific user-interface issues, overall management involving various people Sudipta -- -- PNG's of slash modalities; add to editor -- TeX/Tk font conversion -- find a Tkinter package for displaying tree graphs; use it to add a graph for features to the editor -- figure out what's wrong with arabic? semantics: RED features: BLUE categories: BLACK, sans serif font background: WHITE Ben TODO!!!!! ------------- Create professional-looking web page off of comp.ling.utexas.edu ================================================ FILE: bin/ccg-build ================================================ #!/bin/sh . ccg-env ANT_HOME="$OPENCCG_LIB" PROPS="-Dant.home=$ANT_HOME -Dopenccg.home=$OPENCCG_HOME" case `uname` in CYGWIN* ) XALAN_JARS="$OPENCCG_LIB/xalan.jar;$OPENCCG_LIB/xercesImpl.jar;$OPENCCG_LIB/xml-apis.jar;$OPENCCG_LIB/xsltc.jar;$OPENCCG_LIB/serializer.jar" ANT_JARS="$OPENCCG_LIB/ant.jar;$OPENCCG_LIB/ant-launcher.jar;$OPENCCG_LIB/ant-contrib.jar" ANT_JARS="$ANT_JARS;$OPENCCG_LIB/ant-junit.jar;$OPENCCG_LIB/ant-junit4.jar;$OPENCCG_LIB/junit-4.10.jar" CP="$JAVA_HOME/lib/tools.jar;$OPENCCG_JAR;$ANT_JARS;$XALAN_JARS;$DIRLIBS;." PROPS="$PROPS -Dcygwin=true" ;; * ) XALAN_JARS="$OPENCCG_LIB/xalan.jar:$OPENCCG_LIB/xercesImpl.jar:$OPENCCG_LIB/xml-apis.jar:$OPENCCG_LIB/xsltc.jar:$OPENCCG_LIB/serializer.jar" ANT_JARS="$OPENCCG_LIB/ant.jar:$OPENCCG_LIB/ant-launcher.jar:$OPENCCG_LIB/ant-contrib.jar" ANT_JARS="$ANT_JARS:$OPENCCG_LIB/ant-junit.jar:$OPENCCG_LIB/ant-junit4.jar:$OPENCCG_LIB/junit-4.10.jar" CP="$JAVA_HOME/lib/tools.jar:$OPENCCG_JAR:$ANT_JARS:$XALAN_JARS:$DIRLIBS:." 
;; esac "$JAVA" $JAVA_MEM -classpath "$CP" $PROPS org.apache.tools.ant.launch.Launcher $@ ================================================ FILE: bin/ccg-build.bat ================================================ @echo off call ccg-env set ANT_HOME=%OPENCCG_LIB% set PROPS=-Dant.home=%ANT_HOME% -Dopenccg.home=%OPENCCG_HOME% set XALAN_JARS=%OPENCCG_LIB%\xalan.jar;%OPENCCG_LIB%\xercesImpl.jar;%OPENCCG_LIB%\xml-apis.jar;%OPENCCG_LIB%\xsltc.jar;%OPENCCG_LIB%\serializer.jar set ANT_JARS=%OPENCCG_LIB%\ant.jar;%OPENCCG_LIB%\ant-launcher.jar;%OPENCCG_LIB%\ant-contrib.jar set ANT_JARS=%ANT_JARS%;%OPENCCG_LIB%\ant-junit.jar;%OPENCCG_LIB%\ant-junit4.jar;%OPENCCG_LIB%\junit-4.10.jar set CP="%JAVA_HOME%\lib\tools.jar";%OPENCCG_JAR%;%ANT_JARS%;%XALAN_JARS%;%DIRLIBS%;. %JAVA% %JAVA_MEM% -classpath %CP% %PROPS% org.apache.tools.ant.launch.Launcher %* ================================================ FILE: bin/ccg-cvr ================================================ #!/bin/sh # For usage, do: ccg-cvr -h . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.test.CrossValidateRealizer $@ ================================================ FILE: bin/ccg-cvr.bat ================================================ @echo off rem For usage, do: ccg-cvr -h call ccg-env %JAVA_CMD% opennlp.ccg.test.CrossValidateRealizer %* ================================================ FILE: bin/ccg-draw-graph ================================================ #!/bin/sh python "$OPENCCG_HOME/bin/dlf_parser.py" "$@" ================================================ FILE: bin/ccg-draw-graph.bat ================================================ @echo off python %OPENCCG_HOME%/bin/dlf_parser.py %* ================================================ FILE: bin/ccg-draw-tree ================================================ #!/bin/sh python "$OPENCCG_HOME/bin/ccg_draw_tree.py" "$@" ================================================ FILE: bin/ccg-draw-tree.bat ================================================ @echo off python %OPENCCG_HOME%/bin/ccg_draw_tree.py %* ================================================ FILE: bin/ccg-env ================================================ #!/bin/sh # sets OpenCCG environment variables if [ "$JAVA_HOME" = "" ] ; then echo echo "Error: JAVA_HOME not found in your environment." echo echo "Please set the JAVA_HOME variable in your environment to match the" echo "location of the Java Virtual Machine you want to use." exit 1 fi if [ "$OPENCCG_HOME" = "" ] ; then echo echo "Error: OPENCCG_HOME not found in your environment." echo echo "Please set the OPENCCG_HOME variable in your environment to match the" echo "location of your OpenNLP CCG Library distribution." exit 1 fi case `uname` in CYGWIN* ) OPENCCG_HOME="`cygpath -w $OPENCCG_HOME`" ;; esac OPENCCG_LIB="$OPENCCG_HOME/lib" OPENCCG_SRC="$OPENCCG_HOME/src" OPENCCG_CLASSES="$OPENCCG_HOME/output/classes" OPENCCG_JAR="$OPENCCG_HOME/lib/openccg.jar" case `uname` in CYGWIN* ) DIRLIBS="$OPENCCG_LIB/trove.jar;$OPENCCG_LIB/jdom.jar;$OPENCCG_LIB/jline.jar;$OPENCCG_LIB/jopt-simple.jar" CP="${OPENCCG_JAR};${DIRLIBS};." ;; * ) DIRLIBS="$OPENCCG_LIB/trove.jar:$OPENCCG_LIB/jdom.jar:$OPENCCG_LIB/jline.jar:$OPENCCG_LIB/jopt-simple.jar" CP="${OPENCCG_JAR}:${DIRLIBS}:." 
;; esac # variant for use with 'build compile' option, if desired: #CP="${OPENCCG_CLASSES}:${OPENCCG_SRC}:${DIRLIBS}" JAVA="$JAVA_HOME/bin/java" JAVA_MEM="-Xmx256m" #JAVA_MEM="-Xmx2048m" #JAVA_MEM="-Xmx8g" #JAVA_MEM="-Xmx16g" JAVA_ARGS="$JAVA_MEM -classpath $CP -Dfile.encoding=UTF8" ================================================ FILE: bin/ccg-env.bat ================================================ @echo off rem sets OpenCCG environment variables if not exist "%JAVA_HOME%" goto no_JAVA_HOME if not exist "%OPENCCG_HOME%" goto no_OPENCCG_HOME set OPENCCG_LIB=%OPENCCG_HOME%\lib set DIRLIBS=%OPENCCG_LIB%\trove.jar;%OPENCCG_LIB%\jdom.jar;%OPENCCG_LIB%\jline.jar;%OPENCCG_LIB%\jopt-simple.jar set XMLLIBS=%OPENCCG_LIB%\xml-apis.jar;%OPENCCG_LIB%\xercesImpl.jar;%OPENCCG_LIB%\xalan.jar set OPENCCG_SRC=%OPENCCG_HOME%\src set OPENCCG_CLASSES=%OPENCCG_HOME%\output\classes set OPENCCG_JAR=%OPENCCG_HOME%\lib\openccg.jar rem variant without XMLLIBS rem set CP=%OPENCCG_JAR%;%DIRLIBS%;. rem variant with XMLLIBS set CP=%OPENCCG_JAR%;%DIRLIBS%;%XMLLIBS%;. rem variant for use with 'build compile' option, if desired: rem set CP=%OPENCCG_CLASSES%;%OPENCCG_SRC%;%DIRLIBS% set JAVA="%JAVA_HOME%\bin\java" set JAVA_MEM=-Xmx256m rem set JAVA_MEM=-Xmx2048m set JAVA_CMD=%JAVA% %JAVA_MEM% -classpath %CP% -Dfile.encoding=UTF8 goto end :no_JAVA_HOME echo. echo Error: JAVA_HOME not found in your environment. echo. echo Please set the JAVA_HOME variable in your environment to match the echo location of the Java Virtual Machine you want to use. echo. exit /b 1 :no_OPENCCG_HOME echo. echo Error: OPENCCG_HOME not found in your environment. echo. echo Please set the OPENCCG_HOME variable in your environment to match the echo location of your OpenNLP CCG Library distribution. echo. exit /b 1 :end ================================================ FILE: bin/ccg-grammardoc ================================================ #!/bin/sh # # $Id: ccg-grammardoc,v 1.2 2006/12/03 17:14:23 mwhite14850 Exp $ # Script to run grammardoc from the command line. # Author: Scott Martin (http://www.ling.osu.edu/~scott/) # # Usage: ccg-grammardoc [-s|--source sourceDir] [-d|--dest destDir] # . ccg-env ANT_HOME="$OPENCCG_HOME/lib" case `uname` in CYGWIN* ) CP="$CP;$ANT_HOME/ant.jar" ;; * ) CP="$CP:$ANT_HOME/ant.jar" ;; esac JAVA_ARGS="-Xmx128m -classpath $CP" "$JAVA" $JAVA_ARGS opennlp.ccg.grammardoc.GrammarDoc $@ ================================================ FILE: bin/ccg-grammardoc.bat ================================================ @echo off rem Usage: ccg-grammardoc [-s|--source sourceDir] [-d|--dest destDir] call ccg-env set ANT_HOME=%OPENCCG_HOME%\lib set CP=%CP%;%ANT_HOME%\ant.jar set JAVA_ARGS=-Xmx128m -classpath %CP% %JAVA% %JAVA_ARGS% opennlp.ccg.grammardoc.GrammarDoc %* ================================================ FILE: bin/ccg-gt ================================================ #!/bin/sh # For usage, do: ccg-gt -h . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.test.GenTargets $@ ================================================ FILE: bin/ccg-gt.bat ================================================ @echo off rem For usage, do: ccg-gt -h call ccg-env %JAVA_CMD% opennlp.ccg.test.GenTargets %* ================================================ FILE: bin/ccg-ht-factors ================================================ #!/bin/sh . 
ccg-env #CP=$CP:$OPENCCG_HOME/lib/jopt-simple.jar #echo $JAVA_ARGS "$JAVA" $JAVA_ARGS opennlp.ccg.realize.hypertagger.LMFactorExtractor $@ ================================================ FILE: bin/ccg-hypertagger ================================================ #!/bin/sh . ccg-env #CP=$CP:$OPENCCG_HOME/lib/jopt-simple.jar #echo $JAVA_ARGS "$JAVA" $JAVA_ARGS opennlp.ccg.realize.hypertagger.TagExtract $@ ================================================ FILE: bin/ccg-hypertagger.bat ================================================ @echo off call ccg-env %JAVA_CMD% opennlp.ccg.realize.hypertagger.TagExtract %* ================================================ FILE: bin/ccg-parse ================================================ #!/bin/sh # Usage: ccg-parse -h . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.Parse $@ ================================================ FILE: bin/ccg-parse.bat ================================================ @echo off rem Usage: ccg-parse -h call ccg-env %JAVA_CMD% opennlp.ccg.Parse %1 %2 %3 %4 %5 %6 %7 %8 %9 ================================================ FILE: bin/ccg-postagger ================================================ #!/bin/sh . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.parse.postagger.BasicPOSTagger $@ ================================================ FILE: bin/ccg-postagger.bat ================================================ @echo off call ccg-env %JAVA_CMD% opennlp.ccg.parse.postagger.BasicPOSTagger %* ================================================ FILE: bin/ccg-realize ================================================ #!/bin/sh # Usage: ccg-realize (-g ) () . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.Realize $@ ================================================ FILE: bin/ccg-realize.bat ================================================ @echo off rem Usage: ccg-realize (-g ) () call ccg-env rem set HPROF=-Xrunhprof:cpu=times,file=hmm-prof.txt %JAVA_CMD% opennlp.ccg.Realize %1 %2 %3 %4 %5 %6 %7 %8 %9 ================================================ FILE: bin/ccg-supertagger ================================================ #!/bin/sh . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.parse.supertagger.WordAndPOSDictionaryLabellingStrategy $@ ================================================ FILE: bin/ccg-supertagger.bat ================================================ @echo off call ccg-env %JAVA_CMD% opennlp.ccg.parse.supertagger.WordAndPOSDictionaryLabellingStrategy %* ================================================ FILE: bin/ccg-test ================================================ #!/bin/sh # For usage, do: ccg-test -h . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.test.Regression "$@" ================================================ FILE: bin/ccg-test.bat ================================================ @echo off rem For usage, do: ccg-test -h call ccg-env rem set HPROF=-Xrunhprof:cpu=times,file=hmm-prof.txt %JAVA_CMD% opennlp.ccg.test.Regression %* ================================================ FILE: bin/ccg-update ================================================ #!/bin/sh # For usage, do: ccg-update -h . 
ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.test.UpdateTestbed $@ ================================================ FILE: bin/ccg-update.bat ================================================ @echo off rem For usage, do: ccg-update -h call ccg-env %JAVA_CMD% opennlp.ccg.test.UpdateTestbed %* ================================================ FILE: bin/ccg2xml ================================================ #!/bin/sh python "$OPENCCG_HOME/bin/ccg2xml.py" "$@" ================================================ FILE: bin/ccg2xml.bat ================================================ @echo off python %OPENCCG_HOME%/bin/ccg2xml.py %* ================================================ FILE: bin/ccg_draw_tree.py ================================================ # # ccg_draw_tree uses nltk.Tree to draw a tree from a CCGbank .auto file, # or to draw two trees from two .auto files # import sys if len(sys.argv) == 1 or sys.argv[1] == '-h' or sys.argv[1] == '--help': print 'Usage: ccg_draw_tree () ()' sys.exit(0) autofile = sys.argv[1] deriv_id = sys.argv[2] autofile2 = None deriv_id2 = None if len(sys.argv) >= 4: autofile2 = sys.argv[3] deriv_id2 = deriv_id if len(sys.argv) >= 5: deriv_id2 = sys.argv[4] def get_deriv(autofile, deriv_id): print 'reading ' + deriv_id + ' from ' + autofile found_it = False file = open(autofile, 'rU') for line in file: if found_it == True: return line if line[0:2] == 'ID': if line.split()[0].split('=')[1] == deriv_id: found_it = True raise NameError('could not find ' + deriv_id + '!') deriv = get_deriv(autofile, deriv_id) deriv2 = None if autofile2 != None: deriv2 = get_deriv(autofile2, deriv_id2) print 'importing nltk.Tree' from nltk import Tree from nltk.draw.tree import draw_trees ccgbank_node_pattern = r'' ccgbank_leaf_pattern = r'' # nb: the parens around leaves ends up creating blank nodes above leaves def parse_ccgbank_node(s): if s =='': return '' return s.split(' ')[1] def parse_ccgbank_leaf(s): tokens = s.split(' ') return Tree(tokens[1], [tokens[4]]) def excise_empty_nodes(t): if not isinstance(t,Tree): return t if t.node == '': return excise_empty_nodes(t[0]) return Tree(t.node, [excise_empty_nodes(st) for st in t]) # nb: returns tree with blank nodes excised def parse_ccgbank_tree(s): t = Tree.parse(s, parse_node=parse_ccgbank_node, parse_leaf=parse_ccgbank_leaf, node_pattern=ccgbank_node_pattern, leaf_pattern=ccgbank_leaf_pattern) return excise_empty_nodes(t) print print 'parsing: ' + deriv t = parse_ccgbank_tree(deriv) print t t2 = None if deriv2 != None: print print 'parsing: ' + deriv2 t2 = parse_ccgbank_tree(deriv2) print t2 print if t2 == None: print 'drawing tree' draw_trees(t) else: print 'drawing trees' draw_trees(t,t2) ================================================ FILE: bin/dlf_parser.py ================================================ # # dlf_parser.py (invoked by ccg-draw-graph) uses graphviz's dot to visualize (D)LF graphs # # author: Jonathan Barker (with minor contributions by Michael White) # license: LGPL # from xml.etree.ElementTree import ElementTree import optparse, sys, codecs, xml, os from collections import defaultdict # Parse arguments op = optparse.OptionParser() op.add_option("-i", "--input", type="string", help="input source: file or (default)", default=sys.stdin) op.add_option("-m", "--moses", type="string", help="file/directory prefix for moses output", default=None) op.add_option("-v", "--visualize", type="string", help="file/directory prefix for .pdf output", default=None) op.add_option("-w", "--wordindices", action="store_true", 
help="include word indices", default=False) op.add_option("-c", "--classnames", action="store_true", help="include semantic class names", default=False) (ops, args) = op.parse_args(sys.argv) # Parse input file input_source = ops.input if ops.input is sys.stdin else open(ops.input, "rt") raw = xml.etree.ElementTree.XML(input_source.read()) snum = "None" att_id = 0 # Get word number def wordNum(wid): if wid.startswith("x"): return -1 else: return int(wid[1:].strip("f")) # Get node span def span(nid, graph, w): if wordNum(nid) in w: return [] w.append(wordNum(nid)) for n, e in graph[nid]: if wordNum(n) not in w: w.append(wordNum(n)) w.extend(span(n, graph, w)) return w # findall wrapper def findAll(elem, match): return max(elem.findall(match), []) # Class for representing predicates and attributes class Pred: def __init__(self): self.attrib = [] self.one_of = [] self.opt = [] # Class for representing nodes, contains predicate and attribute information class Node: def __init__(self): self.id = "" self.className = "" self.preds = defaultdict(Pred) def addPred(self, pred, attrib, one_of, opt): self.preds[pred].attrib.extend(attrib) self.preds[pred].one_of.extend(one_of) self.preds[pred].opt.extend(opt) def moses(self, graph): tree = " " return tree def dot(self): dot_node = self.id+" [label=<" withClassName = ops.classnames and len(self.className) > 0 if ops.wordindices: dot_node += self.id if withClassName: dot_node += ":" if withClassName: dot_node += self.className if len(self.preds) > 0: if ops.wordindices or withClassName: dot_node += ":" labels = [] for pname, p in self.preds.items(): label = "" # pred label += ""+pname+"" # att atts = [] if len(p.attrib) > 0: atts.append(",".join(["<"+k.upper()+">"+v for (k, v) in p.attrib])) if len(p.one_of) > 0: atts.append("|".join(["<"+k.upper()+">"+v for (k, v) in p.one_of])) if len(p.opt) > 0: atts.append("("+",".join(["<"+k.upper()+">"+v for (k, v) in p.opt])+")?") if len(atts) > 0: label += ""+",".join(atts)+"" labels.append(label) dot_node += " | ".join(labels) dot_node += ">];\n" return dot_node def info(self): print "Node id:",self.id for pname, p in self.preds: print "\tPred:",self.pred print "\t\tAttrib:",p.attrib print "\t\tOne_of:",p.one_of print "\t\tOpt:",p.opt print "----------------" # Returns just the id, stripping the class (if any) def parseId(str): colonIndex = str.find(":") if colonIndex > 0: return str[:colonIndex] else: return str # Returns the class from the id, or the empty string if none def parseClass(str): colonIndex = str.find(":") if colonIndex > 0: return str[colonIndex+1:] else: return "" # Method for parsing def parseNode(node, graph, nodes): n = nodes[node.get("id")] n.id = parseId(node.get("id")) n.className = parseClass(node.get("id")) attrib = [(k, v) for (k, v) in node.items() if k not in ["id", "pred"]] if node.get("pred") is not None: n.addPred(node.get("pred"), attrib, [], []) nodes[n.id] = n for elem in list(node): if elem.tag == "rel": parseRel(elem, n.id, graph, nodes, "") elif elem.tag == "one-of": parseOneOf(elem, n, attrib, node.get("pred"), graph, nodes) elif elem.tag == "opt": parseOpt(elem, n, graph, nodes) elif elem.tag == "node": parseNode(elem, graph, nodes) else: print snum+": Unexpected tag <"+elem.tag+"> after " quit() # Method for parsing def parseOpt(opt, node, graph, nodes): for elem in list(opt): if elem.tag == "atts": for pname, p in node.preds.items(): node.addPred(pname, [], [], [(k, v) for (k, v) in elem.items() if k not in ["id", "pred"]]) elif elem.tag == "rel": parseRel(elem, node.id, 
graph, nodes, "style=dotted, ") else: print snum+": Unexpected tag <"+elem.tag+"> after " quit() # Method for parsing def parseOneOf(oneof, node, attrib, pred, graph, nodes): global att_id num_att = 0 for elem in list(oneof): if elem.tag == "atts": if pred is not None: node.addPred(pred, [], [(k, v) for (k, v) in elem.items() if k not in ["id", "pred"]], []) else: node.addPred(elem.get("pred"), [], [(k, v) for (k, v) in elem.items() if k not in ["id", "pred"]], []) if len(list(elem)) > 0: num_att += 1 new_att = Node() new_att.id = "att"+str(att_id) att_id += 1 new_att.addPred(str(num_att), [], [], []) nodes[new_att.id] = new_att graph[node.id].append((new_att.id, " [style=dashed];\n")) for rel in list(elem): parseRel(rel, new_att.id, graph, nodes, "") elif elem.tag == "rel": num_att += 1 new_att = Node() new_att.id = "att"+str(att_id) att_id += 1 new_att.addPred(str(num_att), [], [], []) nodes[new_att.id] = new_att graph[node.id].append((new_att.id, " [style=dashed];\n")) parseRel(elem, new_att.id, graph, nodes, "") else: print snum+": Unexpected tag <"+elem.tag+"> after " quit() # Method for parsing def parseRel(rel, nid, graph, nodes, style): # for subnode in list(rel): if subnode.tag == "node": edge_label = " ["+style+"label = \""+rel.get("name")+"\"];\n" if subnode.get("id") is None: graph[nid].append((parseId(subnode.get("idref")), edge_label)) else: graph[nid].append((parseId(subnode.get("id")), edge_label)) parseNode(subnode, graph, nodes) elif subnode.tag == "one-of": subnode.set("name", rel.get("name")) parseRel(subnode, nid, graph, nodes, "style=dashed, ") else: print snum+": Unexpected tag <"+subnode.tag+"> after " quit() # item_no = 0 for item in findAll(raw, "item"): item_no += 1 if item.get("numOfParses") == "0": print "Removing "+item.get("info") else: snum = item.get("info") # lf_num = 0 for lf in findAll(item, "lf"): graph = defaultdict(list) nodes = defaultdict(Node) # for node in list(lf): if node.tag == "node": parseNode(node, graph, nodes) else: print snum+": Unexpected tag <"+node.tag+"> after " quit() # Plot the graph with GraphViz if ops.visualize != None: viz_name = "" if type(item.get("info")) != type("string"): viz_name = ops.visualize+".item"+str(item_no)+"."+str(lf_num) else: viz_name = ops.visualize+"."+item.get("info")+"."+str(lf_num) viz = codecs.open(viz_name+".dot", "w", "utf-8") viz.write("digraph lf {\n") for (k, v) in nodes.items(): viz.write(v.dot()) for (left, rights) in graph.items(): for right in rights: viz.write(left+"->"+right[0]+right[1]) viz.write("}\n") viz.close() os.system("dot -Tpdf "+viz_name+".dot -o "+viz_name+".pdf") os.system("rm "+viz_name+".dot") lf_num += 1 ================================================ FILE: bin/tccg ================================================ #!/bin/sh # For usage, do: tccg -h . 
ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.TextCCG "$@" ================================================ FILE: bin/tccg.bat ================================================ @echo off rem For usage, do: tccg -h call ccg-env %JAVA_CMD% opennlp.ccg.TextCCG %* ================================================ FILE: bin/visccg ================================================ #!/bin/sh python "$OPENCCG_HOME/bin/ccg_editor.py" "$@" ================================================ FILE: bin/visccg.bat ================================================ @echo off python %OPENCCG_HOME%/bin/ccg_editor.py %* ================================================ FILE: bin/wccg ================================================ #!/bin/sh # For usage, do: tccg -h . ccg-env "$JAVA" $JAVA_ARGS opennlp.ccg.WebCCG "$@" ================================================ FILE: build.xml ================================================ ================================================ FILE: ccg-format-grammars/arabic/arabic.ccg ================================================ ############################################################# # # # arabic.ccg # # # ############################################################# # Author: Ben Wing # Date: April 2006 # This is a grammar for a fragment of Arabic. It's particularly # useful for demonstrating the extended use of macros to handle # complicated morphological inflections. # See the `tiny' grammar (tiny.ccg) for more info about the format # of this file. feature { CASE<2>: nom, acc, gen; NUM<2>: sg, du, pl; GEND<2>: m, f; STATE<2>: cons, non-cons {indef, def}; ANIM<2>: hum, nonhum; PERS<2>: 1st, 2nd, 3rd; RESUMPTIVE<2>: nonres, res; SEM-NUM: sg-X, du-X, pl-X; SEM-PERS: 1st-X, 2nd-X, 3rd-X; TENSE: past, pres; MOOD: indic, subj, juss; # Here's a more complicated hierarchy, from the original tiny grammar. ontology: sem-obj { phys-obj { animate-being { person }, thing }, situation { change { action }, state } }; } rule { no typeraise; typeraise +: n => s; typeraise - $: n => s; typeraise - $: pp => s; typeraise - $: pp/n => s; typechange: s$1 | n[nom] => s$1 ; typechange: n<~2>[cons] => n<2>[3rd,def] /* n[gen,def] ; typechange: n<~2>[cons] => n<2>[3rd,indef] /* n[gen,indef] ; } ########################################################################## # Morphological entries # # (morph.xml) # ########################################################################## word wa:Conj; # "and" word anna:Comp; # "that", introducing sentential complements word inna:Comp; # same, but only after the verb qaal "say" word maa:InterrogPro(thing): 3rd; # "what" word man:InterrogPro(person): 3rd; # "who" word li:Prep; # "what" word fii:Prep; # "who" # This word means "this". word haadhaa { *: sg, m; haadhihi: sg, f; ha_ulaahi: pl; # Bizarrely, this word declines for case only in the dual. haadhaani: du, m, nom; haadhayni: du, m, acc; haadhayni: du, m, gen; haataani: du, f, nom; haatayni: du, f, acc; haatayni: du, f, gen; } # This word means "that". word dhaalik { *: sg, m; tilka: sg, f; ulaa_ika: pl; # Bizarrely, this word declines for case only in the dual. dhaanika: du, m, nom; dhaynika: du, m, acc; dhaynika: du, m, gen; taanika: du, f, nom; taynika: du, f, acc; taynika: du, f, gen; } # This is the relative pronoun. word al-ladhii { *: sg, m; al-latii: sg, f; al-ladhiina: pl, m; al-laati: pl, f; # Bizarrely, this word declines for case only in the dual. 
al-ladhaani: du, m, nom; al-ladhayni: du, m, acc; al-ladhayni: du, m, gen; al-lataani: du, f, nom; al-latayni: du, f, acc; al-latayni: du, f, gen; } word pro:Pro { ana: 1st, 1st-X, sg, sg-X; anta: 2nd, 2nd-X, sg, sg-X, m; anti: 2nd, 2nd-X, sg, sg-X, f; huwa: 3rd, 3rd-X, sg, sg-X, m; hiya: 3rd, 3rd-X, sg, sg-X, f; naHnu: 1st, 1st-X, pl, pl-X; antun: 2nd, 2nd-X, pl, pl-X, m; antunna: 2nd, 2nd-X, pl, pl-X, f; hum: 3rd, 3rd-X, pl, pl-X, m; hunna: 3rd, 3rd-X, pl, pl-X, f; } word ii:: 1st, 1st-X, sg, sg-X; word nii:: 1st, 1st-X, sg, sg-X; word ka:: 2nd, 2nd-X, sg, sg-X, m; word ki:: 2nd, 2nd-X, sg, sg-X, f; word hu:: 3rd, 3rd-X, sg, sg-X, m; word haa:: 3rd, 3rd-X, sg, sg-X, f; word naa:: 1st, 1st-X, pl, pl-X; word kum:: 2nd, 2nd-X, pl, pl-X, m; word kunna:: 2nd, 2nd-X, pl, pl-X, f; word hum:: 3rd, 3rd-X, pl, pl-X, m; word hunna:: 3rd, 3rd-X, pl, pl-X, f; ############################################# # Nouns # ############################################# # This shows how a reasonably complicated morphology can be accommodated. # It is certainly possible that some of this may (and probably should) # be offloaded into a separate morphology-processing engine. However, # even in that case there is often a good deal more to the lexicon. # We show a couple examples of complete paradigms, in order to make it # easier to understand what's going on below. # Here is a typical noun (kitaab "book") with a broken plural (kutub "books"). # For nouns with broken plurals, the plural is typically declined like # the singular. Note that Arabic nouns are conjugated for three numbers # (singular, dual, plural), three cases (nominative, accusative, dative), # and three states (indefinite, definite, construct). (The construct state # is used for nouns that are modified by other nouns -- e.g. "book" in # "the book of Mary".) # Form Nominative Accusative Dative # --------------------------------------------------------------- # sg.indef kitaabun kitaaban kitaabin # sg.def al-kitaabu al-kitaaba al-kitaabi # sg.cons kitaabu kitaaba kitaabi # # du.indef kitaabaani kitaabayni kitaabayni # du.def al-kitaabaani al-kitaabayni al-kitaabayni # du.cons kitaabaa kitaabay kitaabay # # pl.indef kutubun kutuban kutubin # pl.def al-kutubu al-kutuba al-kutubi # pl.cons kutubu kutuba kutubi # Here is a typical noun (mudarris "teacher") with a different kind of # plural, a so-called "strong masculine plural", which has its own declension. # Form Nominative Accusative Dative # --------------------------------------------------------------- # sg.indef mudarrisun mudarrisan mudarrisin # sg.def al-mudarrisu al-mudarrisa al-mudarrisi # sg.cons mudarrisu mudarrisa mudarrisi # # du.indef mudarrisaani mudarrisayni mudarrisayni # du.def al-mudarrisaani al-mudarrisayni al-mudarrisayni # du.cons mudarrisaa mudarrisay mudarrisay # # pl.indef mudarrisuuna mudarrisiina mudarrisiina # pl.def al-mudarrisuuna al-mudarrisiina al-mudarrisiina # pl.cons mudarrisuu mudarrisii mudarrisii # Here, we make heavy use of macros. # This macro says: Every time an expression of the form # three-form-decl(...) occurs, replace it with the text that comes after. # The parameters will be substituted into the text. The braces that # denote the macro's text do *NOT* form part of the text that is substituted. # Note that macro substitutions are processed recursively: If the text # of a macro substitution contains calls to other macros, they will also # be processed. This makes "inheritance" very easy to implement. 
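# To make the mechanism concrete, here is an illustrative, hand-expanded sketch
# (not itself part of the grammar) of what a single macro call turns into, using
# the triptote() and three-form-decl() macros defined just below. A call such as
#
#   triptote(kitaab, sg, sg-X, m)
#
# first rewrites to three-form-decl(kitaab, un, an, in, u, a, i, u, a, i, sg, sg-X, m),
# and after all substitutions it yields morphological entries along the lines of
#
#   kitaab.un: sg, sg-X, m, nom, indef;       # kitaabun
#   kitaab.an: sg, sg-X, m, acc, indef;       # kitaaban
#   add-al(kitaab.u): sg, sg-X, m, nom, def;  # al-kitaabu, once add-al() is expanded in turn
#   kitaab.u: sg, sg-X, m, nom, cons;         # kitaabu
#   ...
#
# which matches the "kitaab" paradigm table shown above.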
# This macro is used for a particular paradigm corresponding to a # particular number of a word. def three-different-form-decl(indef-form, def-form, cons-form, indef-nom, indef-acc, indef-gen, def-nom, def-acc, def-gen, cons-nom, cons-acc, cons-gen, morph-num, sem-num, gend) { indef-form.indef-nom: morph-num, sem-num, gend, nom, indef; indef-form.indef-acc: morph-num, sem-num, gend, acc, indef; indef-form.indef-gen: morph-num, sem-num, gend, gen, indef; add-al(def-form.def-nom): morph-num, sem-num, gend, nom, def; add-al(def-form.def-acc): morph-num, sem-num, gend, acc, def; add-al(def-form.def-gen): morph-num, sem-num, gend, gen, def; cons-form.cons-nom: morph-num, sem-num, gend, nom, cons; cons-form.cons-acc: morph-num, sem-num, gend, acc, cons; cons-form.cons-gen: morph-num, sem-num, gend, gen, cons; } # It's questionable whether we should do this. This assimilates al- # to a following coronal consonant, e.g. ar-rajul, as-sigaara, # ath-thalj, an-nuur, aDH-DHuhr, etc. def add-al(form) regsub('^al-([std]h|DH|[tdszrnTDSZL])', 'a\1-\1', al-.form) def three-form-decl(form, indef-nom, indef-acc, indef-gen, def-nom, def-acc, def-gen, cons-nom, cons-acc, cons-gen, morph-num, sem-num, gend) { three-different-form-decl(form, form, form, indef-nom, indef-acc, indef-gen, def-nom, def-acc, def-gen, cons-nom, cons-acc, cons-gen, morph-num, sem-num, gend) } # Using the above macro, we create two more macros to handle two common # paradigm types: Accusative and genitive are the same, and the # definite is either the same as the construct (two-form-decl-1) or # the same as the indefinite (two-form-decl-2). def two-form-decl-1(form, non-cons-nom, non-cons-obl, cons-nom, cons-obl, morph-num, sem-num, gend) { three-form-decl(form, non-cons-nom, non-cons-obl, non-cons-obl, non-cons-nom, non-cons-obl, non-cons-obl, cons-nom, cons-obl, cons-obl, morph-num, sem-num, gend) } def two-form-decl-2(form, indef-nom, indef-obl, non-indef-nom, non-indef-obl, morph-num, sem-num, gend) { three-form-decl(form, indef-nom, indef-obl, indef-obl, non-indef-nom, non-indef-obl, non-indef-obl, non-indef-nom, non-indef-obl, non-indef-obl, morph-num, sem-num, gend) } # In turn we create macros for particular paradigms: strong masculine ("uun"), # strong feminine ("aat"), dual, and basic triptote (the paradigm for # "kitaab" above and, in general, most singulars). # Note that an alternative to using braces is to put the macro text on # the same line as the `def' part of the macro (backslashes can be used # to join multiple lines together). def uun-plural(form) two-form-decl-1(form, uuna, iina, uu, ii, pl, pl-X, m) def aat-plural(form) \ two-form-decl-2(form, aatun, aatin, aatu, aati, pl, pl-X, f) def dual(form, gend) \ two-form-decl-1(form, aani, ayni, aa, ay, du, du-X, gend) def triptote(form, morph-num, sem-num, gend) \ three-form-decl(form, un, an, in, u, a, i, u, a, i, morph-num, sem-num, gend) # Here we define macros for full paradigms for words. Note how semicolons # are not used, because they are supplied by the macro text itself. # (Consult the text for three-form-decl() above, and remember that the # braces denoting the macro text are not actually part of the text. This # means that if you really want braces as the outermost thing in some # macro text, you'll need to supply two levels of braces.) 
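# (A hypothetical illustration of that last point: with
#
#   def braced(x) {{ x }}
#
# the macro text is "{ x }", so braced(a) expands to "{ a }" -- only the
# outermost pair of braces is stripped.)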
def thing(sing, plur) { word sing:N(thing) { triptote(sing, sg, sg-X, m) dual(sing, m) triptote(plur, sg, pl-X, f) } } def fem-thing(sing, plur) { word sing:N(thing) { triptote(sing.t, sg, sg-X, f) dual(sing.t, f) triptote(plur, sg, pl-X, f) } } def person(sing, plur, gend) { word sing:N(person) { triptote(sing, sg, sg-X, gend) dual(sing, gend) triptote(plur, pl, pl-X, gend) } } def male(sing, plur) person(sing, plur, m) def female(sing, plur) person(sing, plur, f) def strong-male(sing) { word sing:N(person) { triptote(sing, sg, sg-X, m) dual(sing, m) uun-plural(sing) } } # Here we define the actual words. Note how short these definitions are, # specifying only what's unpredictable. thing(kitaab, kutub) thing(waqt, _awqaat) thing(Harf, Huruuf) thing(dars, duruus) thing(waqt, _awqaat) fem-thing(sigaara, sagaayir) fem-thing(madiina, mudun) male(rajul, rijaal) male(walad, _awlaad) male(Taalib, Tullaab) female(bint, _abnaat) strong-male(mudarris) word imra_a:N(person) { three-different-form-decl(imra_at, mar_at, imra_at, un, an, in, u, a, i, u, a, i, sg, sg-X, f) three-different-form-decl(imra_at, mar_at, imra_at, aani, ayni, ayni, aani, ayni, ayni, aa, ay, ay, du, du-X, f) triptote(nisaa_, pl, pl-X, f) } def extended_construct_word(stem, plur) { word stem:N(person) { three-form-decl(stem, un, an, in, u, a, i, uu, aa, ii, sg, sg-X, m) dual(stem, m) triptote(plur, pl, pl-X, m) } } extended_construct_word(_ax, _ixwaan) extended_construct_word(_ab, _abnaa_) # Typical paradigms: # 1sg 'aktaa 'aktaa 'akta | # 2sg.m taktaa taktaa takta | # 2sg.f taktayna taktay taktay | # 3sg.m yaktaa yaktaa yakta | # 3sg.f taktaa taktaa takta | # 2du taktayaani taktayaa taktayaa | # 3du.m yaktayaani yaktayaa yaktayaa | # 3du.f taktayaani taktayaa taktayaa | # 1pl naktaa naktaa nakta | # 2pl.m taktawna taktaw taktaw | # 2pl.f taktayna taktayna taktayna | # 3pl.m yaktawna yaktaw yaktaw | # 3pl.f yaktayna yaktayna yaktayna | # # 1sg 'aktuu 'aktuwa 'aktu | 'aktii 'aktiya 'akti # 2sg.m taktuu taktuwa taktu | taktii taktiya takti # 2sg.f taktiina taktii taktii | taktiina taktii taktii # 3sg.m yaktuu yaktuwa yaktu | yaktii yaktiya yakti # 3sg.f taktuu taktuwa taktu | taktii taktiya takti # 2du taktuwaani taktuwaa taktuwaa | taktiyaani taktiyaa taktiyaa # 3du.m yaktuwaani yaktuwaa yaktuwaa | yaktiyaani yaktiyaa yaktiyaa # 3du.f taktuwaani taktuwaa taktuwaa | taktiyaani taktiyaa taktiyaa # 1pl naktuu naktuwa naktu | naktii naktiya nakti # 2pl.m taktuuna taktuu taktuu | taktuuna taktuu taktuu # 2pl.f taktuuna taktuuna taktuuna | taktiina taktiina taktiina # 3pl.m yaktuuna yaktuu yaktuu | yaktuuna yaktuu yaktuu # 3pl.f yaktuuna yaktuuna yaktuuna | yaktiina yaktiina yaktiina def two-form-past(formv, formc) { formc.tu: past, 1st, sg; formc.ta: past, 2nd, m, sg; formc.ti: past, 2nd, f, sg; formv.a: past, 3rd, m, sg; formv.at: past, 3rd, f, sg; formc.tumaa: past, 2nd, du; formv.aa: past, 3rd, m, du; formv.ataa: past, 3rd, f, du; formc.naa: past, 1st, pl; formc.tum: past, 2nd, m, pl; formc.tunna: past, 2nd, f, pl; formv.uu: past, 3rd, m, pl; formc.na: past, 3rd, f, pl; } def 3rd-weak-past-ay(form) { form.ay.tu: past, 1st, sg; form.ay.ta: past, 2nd, m, sg; form.ay.ti: past, 2nd, f, sg; form.aa: past, 3rd, m, sg; form.at: past, 3rd, f, sg; form.ay.tumaa: past, 2nd, du; form.ay.aa: past, 3rd, m, du; form.ataa: past, 3rd, f, du; form.ay.naa: past, 1st, pl; form.ay.tum: past, 2nd, m, pl; form.ay.tunna: past, 2nd, f, pl; form.aw: past, 3rd, m, pl; form.ay.na: past, 3rd, f, pl; } def 3rd-weak-past-aw(form) { form.aw.tu: past, 1st, sg; 
form.aw.ta: past, 2nd, m, sg; form.aw.ti: past, 2nd, f, sg; form.aa: past, 3rd, m, sg; form.at: past, 3rd, f, sg; form.aw.tumaa: past, 2nd, du; form.aw.aa: past, 3rd, m, du; form.ataa: past, 3rd, f, du; form.aw.naa: past, 1st, pl; form.aw.tum: past, 2nd, m, pl; form.aw.tunna: past, 2nd, f, pl; form.aw: past, 3rd, m, pl; form.aw.na: past, 3rd, f, pl; } def 3rd-weak-past-ii(form) { form.ii.tu: past, 1st, sg; form.ii.ta: past, 2nd, m, sg; form.ii.ti: past, 2nd, f, sg; form.iya: past, 3rd, m, sg; form.iyat: past, 3rd, f, sg; form.ii.tumaa: past, 2nd, du; form.iy.aa: past, 3rd, m, du; form.iy.ataa: past, 3rd, f, du; form.ii.naa: past, 1st, pl; form.ii.tum: past, 2nd, m, pl; form.ii.tunna: past, 2nd, f, pl; form.uu: past, 3rd, m, pl; form.ii.na: past, 3rd, f, pl; } def strong-past(form) two-form-past(form, form) # In general, almost all Arabic present-tense verbs of a particular mood # can be defined using five forms. Verbs with a hamza in the first radical # have a problem in the first-singular; ideally this should be handled # automatically using a regexp or something of that sort, but we don't have # such support currently, so we use an optional param. def gen-pres(mood, fsing, fsing-fem, fdual, fplur-masc, fplur-fem) { # This shows how you can use regular expressions if need be. # regsub(string, regex, repl) is a special built-in that does regular- # expression substitution on STRING, replacing all occurrences of # REGEX with REPL. Regular-expression syntax is as in Python. # In this case, Arabic verbs have a phonetic rule that eliminates # two glottal stops occurring near each other at the beginning of a # word. For example, _a_kulu -> _aakulu, and _u_kalu -> _uukalu. # (That is, the vowel is lengthened.) # _ . regsub(foo, bar, fsing): pres, mood, 1st, sg; _ . regsub('^([aiu])_', '\1\1', fsing): pres, mood, 1st, sg; # _.fsing: pres, mood, 1st, sg; t.fsing: pres, mood, 2nd, m, sg; t.fsing-fem: pres, mood, 2nd, f, sg; y.fsing: pres, mood, 3rd, m, sg; t.fsing: pres, mood, 3rd, f, sg; t.fdual: pres, mood, 2nd, du; y.fdual: pres, mood, 3rd, m, du; t.fdual: pres, mood, 3rd, f, du; n.fsing: pres, mood, 1st, pl; t.fplur-masc: pres, mood, 2nd, m, pl; t.fplur-fem: pres, mood, 2nd, f, pl; y.fplur-masc: pres, mood, 3rd, m, pl; y.fplur-fem: pres, mood, 3rd, f, pl; } # The "two-form" present uses normal (non-3rd-weak) endings but may # have two forms of the root, one form vocalic endings (almost all of them) # and one for consonant endings (only the feminine plural). This # encompasses 2nd-weak verbs and doubled verbs, and (trivially) strong verbs. def two-form-pres-indic(formv, formc) { gen-pres(indic, formv.u, formv.iina, formv.aani, formv.uuna, formc.na) } def two-form-pres-subj(formv, formc) { gen-pres(subj, formv.a, formv.ii, formv.aa, formv.uu, formc.na) } # The jussive is different because the base form (fsing) has no ending. # This means that it may assume the consonant form instead of the vowel # form, or may have a number of variants (in particular, for doubled # verbs). So the base form needs to be given explicitly. 
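# (Reading off the definitions below: 2nd-weak-pres() passes the
# consonant-form stem as the jussive base, so for kaan -- present stems
# akuun/akun -- the 3sg.m comes out as indicative yakuunu but jussive yakun,
# with no ending on the base form.)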
def two-form-pres-juss(base, formv, formc) { gen-pres(juss, base, formv.ii, formv.aa, formv.uu, formc.na) } def strong-pres(form) { two-form-pres-indic(form, form) two-form-pres-subj(form, form) two-form-pres-juss(form, form, form) } def 2nd-weak-pres(formv, formc) { two-form-pres-indic(formv, formc) two-form-pres-subj(formv, formc) two-form-pres-juss(formc, formv, formc) } def doubled-pres(formv, formc) { two-form-pres-indic(formv, formc) two-form-pres-subj(formv, formc) two-form-pres-juss(formc, formv, formc) two-form-pres-juss(formv.a, formv, formc) two-form-pres-juss(formv.i, formv, formc) } # Verbs whose third radical is a /w/ or a /y/ have all manner of exceptional # forms; easiest just to list them. In general, there are three types, # depending on whether the base singular forms end in -aa, -ii, or -uu. def 3rd-weak-pres-aa(form) { gen-pres(indic, form.aa, form.ayna, form.ayaani, form.awna, form.ayna) gen-pres(subj, form.aa, form.ay, form.ayaa, form.aw, form.ayna) # Note the shortened vowel here. gen-pres(juss, form.a, form.ay, form.ayaa, form.aw, form.ayna) } def 3rd-weak-pres-ii(form) { gen-pres(indic, form.ii, form.iina, form.iyaani, form.uuna, form.iina) gen-pres(subj, form.iya, form.ii, form.iyaa, form.uu, form.iina) # Note the shortened vowel here. gen-pres(juss, form.i, form.ii, form.iyaa, form.uu, form.iina) } def 3rd-weak-pres-uu(form) { gen-pres(indic, form.uu, form.iina, form.uwaani, form.uuna, form.uuna) gen-pres(subj, form.uwa, form.ii, form.uwaa, form.uu, form.uuna) # Note the shortened vowel here. gen-pres(juss, form.u, form.ii, form.uwaa, form.uu, form.uuna) } def 2nd-weak-verb(pastv, props, pastc, presv, presc) { word pastv: props { two-form-past(pastv, pastc) 2nd-weak-pres(presv, presc) } } # Note the way that macro calls can be constructed as well. Here, the # value of PAST_TYPE is the suffix at the end of the macro name. def 3rd-weak-verb(past_stem, props, past_type, pres_stem, pres_type) { word past_stem . past_type: props { 3rd-weak-past- . past_type(past_stem) 3rd-weak-pres- . pres_type(pres_stem) } } def strong-verb(past, props, pres) { word past: props { strong-past(past) strong-pres(pres) } } 2nd-weak-verb(kaan, TransV(pred=be), kun, akuun, akun) 2nd-weak-verb(naam, IntransV(pred=sleep), nim, anaam, anam) 2nd-weak-verb(qaal, SayV(pred=say), qul, aquul, aqul) strong-verb(katab, TransV(pred=write), aktub) strong-verb(dhahab, IntransV(pred=go), adhhab) # Note that the following verb, which begins with a glottal stop, # will have a modification made to it in the first-person singular present. # (See above.) 
strong-verb(_akal, IntransV TransV (pred=eat), a_kul) 3rd-weak-verb(ra_, TransV(pred=see), ay, ar, aa) 3rd-weak-verb(_a9T, DitransV(pred=give), ay, u9T, ii) 3rd-weak-verb(laq, TransV(pred=find), ii, alq, aa) strong-verb(9araf, ThinkV(pred=know), a9rif) # see also 9alam strong-verb(tafakkar, ThinkV(pred=think), atafakkar) strong-verb(ta9allam, ThinkV(pred=learn), ata9allam) family N { entry: n<2>[X, 3rd, nonres]: X:sem-obj(*); } family InterrogPro(Pro) { entry: s/*(s/n<2>[res]); entry: s/*(s|n<2>[nonres,nom]); entry: s/*(s/n<2>[nonres,acc]); member: maa, man; } family Pro { entry: n<2>[X, nom, def, nonres]: X:sem-obj(*); member: pro; } family Rel { entry: (n<~2>[CASE,nonres]\n<~2>[nonres])/*(s/n<2>[res]); entry: (n<~2>[CASE,nonres]\n<~2>[nonres])/*(s|n<2>[nonres,nom]); entry: (n<~2>[CASE,nonres]\n<~2>[nonres])/*(s/n<2>[nonres,acc]); member: al-ladhii; } family AndConj(Conj) { entry: n[pl, CASE, STATE] \* n[CASE, STATE] /* n[CASE, STATE]; entry: s$1 \* s$1 /* s$1; member: wa; } family Det(indexRel=det) { entry: n<2>[X, def, nonres] /^ n<2>[X]: X:sem-obj(*); member: haadhaa, dhaalik; } # good luck on this one! construct even more complicated ones! # #ar-rajulu al-ladhii kataba al-kutuba ra_aa wa _a9Taa li binti hu al-kilaaba al-latii akalat sagaayira mudarrisii al-waladi #"the man that wrote the books saw and gave to his daughter the dogs that ate the cigarettes of the boy's teachers." # Works, correctly: #ar-rajulu al-ladhii kataba al-kutuba ra_aa wa _a9Taa li binti hu as-sagaayira al-latii qultu inna al-waladu tafakkara anna al-mar_atu _a9Tat haa li ha_ulaahi al-mudarrisiina #Bad #ar-rajula al-ladhii katabat al-kutuba ra_aa wa _a9Taa li binti hu as-sagaayira al-latii qultu inna al-waladu tafakkara anna al-mar_atu _a9Tat haa li ha_ulaahi al-mudarrisiina #"the man that wrote the books saw and gave to his daughter the cigarettes that I said that the boy thought that the woman gave them to those teachers" # Fails, correctly: #ar-rajulu al-ladhii kataba al-kutuba ra_aa wa _a9Taa li binti hu as-sagaayira al-latii qultu inna al-waladu tafakkara anna al-mar_atu _a9Tat hu li ha-ulaahi al-mudarrisiina # Works: # ar-rajulu al-ladhii al-waladu _a9Taa as-sagaayira li binti hu dhahaba # ar-rajulu _a9Taa li binti hu as-sagaayira # ar-rajulu al-ladhii waladu hu _a9Taa as-sagaayira li al-binti dhahaba # Won't work: # ar-rajulu al-ladhii al-waladu _a9Taa li binti hu as-sagaayira dhahaba # ar-rajulu al-ladhii waladu hu _a9Taa li al-binti as-sagaayira dhahaba family PossClitic(Cli, indexRel=poss) { entry: n<~1>[X, def, nonres] \* n<1>[X, cons]: X:sem-obj(*); entry: (n<~1>[X, def, nonres] / n<2>[res]) \* n<1>[X, cons]: X:sem-obj(*); member: ii, ka, ki, hu, haa, naa, kum, kunna, hum, hunna; } family ObjClitic(Cli, indexRel=poss) { entry: (s$1 | n<3>[nonres]) \ (s$1 / n<2>[nonres] | n<3>[nom,nonres]); entry: (s$1 / n<~2>[res] | n<3>[nonres]) \ (s$1 / n<2>[nonres] | n<3>[nom,nonres]); member: nii, ka, ki, hu, haa, naa, kum, kunna, hum, hunna; } family Adj(indexRel=adj) { entry: n<2>[X, NUM, GEND, CASE, STATE] \ n<2>[X]: X:sem-obj(*); } family Prep-Nom(Prep, indexRel="*NoSem*") { # The pp<~3> notation generates an 'inheritsFrom' tag rather than # an 'id' tag for the feature structure. 
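# (As explained at more length in tiny.ccg, [lex=*] attaches a 'lex' feature
# whose value is the word stem; this is what lets DitransV below select a
# pp[lex=li] argument, and SayV/ThinkV select sbar[lex=inna] and
# sbar[lex=anna].)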
entry: pp<~3>[lex=*] /< n<3>[gen,nonres]; member: li, fii; } family Comp(indexRel="*NoSem*") { entry: sbar<~1>[lex=*] / s<1>; member: anna, inna; } family IntransV(V) { entry: s[E] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being); } family SayV(V) { entry: s[E] / sbar[Z, lex=inna] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Z); } family ThinkV(V) { entry: s[E] / sbar[Z, lex=anna] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Z); } family TransV(V) { entry: s[E] / n[Y,acc,nonres] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Y:sem-obj); } family DitransV(V) { # The first slash (on the pp) is marked with a mode allowing backward xcomp. entry: s[E] /< pp[Z,lex=li] / n[Y,acc,nonres] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Y:sem-obj ^ Z:animate-being) ; } testbed { # different states of subject rajulun dhahaba: 1; ar-rajulu dhahaba: 1; rajulu dhahaba: 0; # subject-verb agreement rajulun dhahabuu: 0; ar-rajulu dhahabuu: 0; rajulu dhahabuu: 0; # gender agreement ar-rajulu dhahaba: 1; ar-rajulu dhahabat: 0; al-bintu dhahaba: 0; al-bintu dhahabat: 1; # possession rajulun hu dhahaba: 0; ar-rajulu hu dhahaba: 0; rajulu hu dhahaba: 1; # subject case ar-rajula dhahaba: 0; ar-rajuli dhahaba: 0; # case in construct phrase _axuu ar-rajuli dhahaba: 1; _axuu ar-rajula dhahaba: 0; _axuu ar-rajulu dhahaba: 0; # construct state in construct phrase _axun ar-rajuli dhahaba: 0; al-_axu ar-rajuli dhahaba: 0; # object case ar-rajulu ra_aa al-kitaaba: 1; ar-rajulu ra_aa al-kitaabi: 0; ar-rajulu ra_aa al-kitaabu: 0; # preposition case ar-rajulu _a9Taa al-kitaaba li al-waladi: 1; ar-rajulu _a9Taa al-kitaaba li al-waladu: 0; ar-rajulu _a9Taa al-kitaaba li al-walada: 0; # subcategorization ar-rajulu ra_aa al-kitaaba li al-waladi: 0; # backward xcomp ar-rajulu _a9Taa li al-waladi al-kitaaba: 1; _a9Taa ar-rajulu li al-waladi al-kitaaba: 1; # object clitics ana ra_aytu hu: 1; ra_aytu hu ana: 1; ra_aytu ana hu: 0; hu ra_aytu ana: 0; hu ana ra_aytu: 0; huwa ra_aa nii: 1; huwa ra_aa ii: 0; huwa ra_aa ana: 0; ar-rajulu _a9Taa haa li al-waladi: 1; _a9Taa haa ar-rajulu li al-waladi: 1; # relative clauses # "I gave it to the man that the girl saw him" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at hu: 3; # "I gave it to the man that the girl saw her" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at haa: 0; # "I gave it to the man that the girl saw" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at: 0; # "I gave it to the man that the girl saw the boy" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at al-walada: 0; } ================================================ FILE: ccg-format-grammars/inherit/inherit.ccg ================================================ # A minimal grammar which shows inheritance. 
# Jason Baldridge, September 2007 ################## Features ################# feature { case: nom acc; } ################## Words ################# word John:NP (pred=john); word Fido:NP (pred=fido); word food:NP (pred=food); word to:PP; word sleeps:IntransV (pred=sleep); word saw:TransV (pred=see); word gave:DitransV (pred=give); ################## Categories ################# family NP { entry: np[X]:X(*); } family PP { entry: pp[X]/np[X]:X(*); } def iv_cat (PostSyn, MoreSem) { s[E] \ np[X nom] PostSyn: E(* X MoreSem) } def tv_cat (PreSyn, PostSyn, MoreSem) { iv_cat(PreSyn / np[Y acc] PostSyn, Y MoreSem) } family IntransV(V) { entry: iv_cat(,); } family TransV(V) { entry: tv_cat(,,); } family DitransV(V) { entry: tv_cat( , / np[Z acc] , Z); entry: tv_cat(/ pp[Z acc] , , Z); } ################## Test sentences ################# testbed { John sleeps: 1; John saw Fido: 1; John gave Fido food: 1; John gave food to Fido: 1; } ================================================ FILE: ccg-format-grammars/tiny/tiny.ccg ================================================ ############################################################# # # # tiny.ccg # # # ############################################################# # Author: Ben Wing # Date: April 2006 # This is derived from Geert-Jan M. Kruijff's 'coordination' grammar, # which was simplified, revised and extended. # # David Reitter, dreitter at inf.ed.ac dot uk, 01/2005 # Michael White, mwhite at inf dot ed.ac.uk, 01/2005 # This is the `tiny' grammar, in .ccg format. There are a bunch # of comments below to explain the format. See also the `arabic' # grammar for an example of extended macro use to handle a complex # morphology. # To generate a grammar for use with OpenCCG, use `genccg' (or whatever # it's currently called). This generates the various XML files needed for # OpenCCG (grammar.xml, lexicon.xml, morph.xml, types.xml, and rules.xml). # After doing this, you can run `tccg' to load the grammar and use it for # parsing. # Some general notes about this format: # -- The general feel of the syntax is like C, Java or Perl. Indentation # and whitespace is unimportant. (The only exception is in macro # definitions, where the text of a macro must either be on the same line # as the definition -- possibly extended with backslash # line-continuation markers -- or be enclosed in braces.) # # -- The syntax tries to be very forgiving of the usage of commas and # semicolons, for the benefit of macro definitions. In most lists, in # fact, commas are optional -- arguments can be separated by no commas, # one comma, or many commas, and extra commas can occur at the end of # the list. All of this makes macro definitions and macro calls much # easier. We usually write such lists below without commas. # # The main exception where commas matter is in macro calls. You can # still put an extra comma at the end of a macro call, but otherwise you # must have exactly one comma (no more, no less) between arguments. The # reason is that macro arguments can contain pretty much any text # whatsoever (including no text at all), so commas are needed to # indicate where one argument stops and the next one starts. # # NOTE: The parser pays attention to parens, brackets, and braces in the # text of a macro call argument, and will not get confused by commas # inside of matched delimiters. 
Thus, a macro call `foo(a, bar(b, c))' # is correctly interpreted as a call to foo() with two arguments, `a' # and `bar(b, c)', and *NOT* a call to foo() with three arguments `a', # `bar(b', and `c)'.) # # Note also how the text of macro definitions and calls can optionally # be surrounded by braces, to clearly delimit the text boundaries (see # below). It's important to note that the braces do *NOT* form part of # the text itself; if you really want your text surrounded by braces, # you need to add a second set around the text. # # -- You can use the `-t' option to see what things look like after macro # substitutions have been applied. # # -- The order of declarations in this file is not significant except # that macros and features must be defined before they can be used. # # -- You can surround literal text (word stems, inflections, etc.) with # single or double quotes in order to insert characters into the text # that would otherwise be interpreted by the parser, such as spaces, # colons, etc. Without such quotes, the only characters that can occur # in a word are letters, numbers, '_', '-', '+' and '%'. Note, however, # that there is no such thing as "reserved words" in this format; words # like 'feature', 'word', 'entry', etc. that have a special significance # in the right place in the syntax can otherwise be freely used as word # stems and inflections, macro variable names, part of speech tags, etc. ############################################################# # # # FEATURES # # # ############################################################# # Declaration of features. This is used as follows: # # (1) So that you can just specify a feature value inside brackets below, # and the appropriate feature is substituted. That is, a declaration # like n[nom] is equivalent to n[CASE=nom]. # # (2) To specify the types listed in types.xml. # # (3) To specify the feature values used in word declarations. These # compile out into declarations in morph.xml (unrelated to # our own use of macros). In order for this to happen, however, you # *must* specify a "macro-tie", inside of parentheses after the feature # type. This is either a number (for a syntactic macro) or a letter # (for a semantic macro). See below for exactly how these are used. # # (A "strict feature" mode may be implemented at some point, where all # features that are used in category definitions must be declared.) feature { # A number following the feature type, as follows, is used for syntactic # features that can be attached to a word. The number is a feature- # structure ID; when an atomic category in a lexical category definition # has the corresponding ID, these features will be inserted into that # category. Hence, the <2> here corresponds to the <2> that occurs # below in the definition of Noun, Det, verbcat(), etc. CASE<2>: nom acc; NUM<2>: sg pl; # You can specify more than one number if you want. # NUM<2,3>: sg pl; # A word in place of a number causes semantic macros to be created; if # used in a word {} declaration, the relevant info will be inserted into # the hybrid logic part of a lexical category declaration, attached to # the argument of the given name. A declaration like (X:NUM) means that # 'NUM' is used in the XML declaration in place of # the feature's actual name. (FIXME: I don't know what the significance # of this is.) sem-num: sg-X pl-X; TENSE: past pres; # You can create hierarchical values as shown. 
PERS<2>: non-3rd {1st 2nd} 3rd; # Alternatively, you can explicitly list the parent(s) of a feature -- # e.g. to create multiple-inheritance hierarchies. #alternate-pers<2>: non-3rd 1st[non-3rd] 2nd[non-3rd] 3rd; # Here's a more complicated hierarchy, from the original tiny grammar. ontology: sem-obj { phys-obj { animate-being { person } thing } situation { change { action } state } }; # Here we show how you can mix the two ways of declaring hierarchies, # if you have a primarily single-inheritance hierarchy but with certain # multiply-inherited values. In this hierarchy, 'werewolf' gets both # 'man' and 'wolf' as its parents. 'intersex' will have only 'man' and # 'woman' as parents -- 'rational-being' will not be specified as it's # redundant. entity: being { # We would call this `thing' but that is used above in # the `ontology' hierarchy, and causes a warning to be issued thing2 animate { irrational-being { fish mammal { dog-like {dog wolf} cat horse } } rational-being { man { centaur[horse] werewolf[wolf] } woman { mermaid[fish] } intersex[man woman] } } } nothingness; } # You can also specify that a feature is "distributive" and/or give # licensing information for the realizer. # # Here's the equivalent of the stuff in flights/lexicon.xml and # flights/types.xml. # # A ! before a feature makes it distributive. You can specify # licensing-related attributes on either a feature or a feature-value, in # the standard way of specifying attributes in .ccg format. feature { !owner(location=args-only, instantiate=false); !info; form: dcl-base {dcl, fronted}, q-base {q(license-marked-cats=true, also-licensed-by=q-base), wh(license-marked-cats=true, also-licensed-by=q-base)}, base, emb(location=target-only), inf(location=target-only), adj(location=target-only), ng; } # If you really want the feature value as used in the word {} declarations # to be different from the feature value elsewhere, you can do that. # This allows you to generate the following XML: # # # # # # # # # # # # # # #feature { # case<0>: acc0:p-case; # case<1>: acc1:p-case; # case<2>: acc2:p-case; # case<3>: acc3:p-case; #} # NOTE: (1) This doesn't quite work in ccg2xml yet, since only `acc3' gets # added to the hierarchy in types.xml. (2) The entire motivation for doing # this kind of thing is kludgy, and will be eliminated by allowing features # to be explicitly specified for the result of a lexical insertion rule, # much like the way that type-raising rules currently work. # Here's how you specify a relation-sorting order, in case you care. relation-sorting: foo * bar baz; ############################################################# # # # WORDS # # # ############################################################# # ################## Functional words ################# # # Some examples of words. # # The format of word declarations is # # word STEM:FAMILY ...(ATTRS): FEATURES; # # or # # word STEM:FAMILY ...(ATTRS) { INFLECTED-FORM: FEATURES; ...} # # where STEM is the word's stem, FAMILY is a list of the families that a # word is part of, and ATTRS specifies any other attributes associated # with the word. FEATURES gives the word's features; these come from the # feature {} declarations above. (NOTE: Only feature values whose features # specify a "macro-tie" value -- something in <> following the # feature's name -- can be used. See above.) # # ATTRS is a list; each attribute is either a specification ATTRIBUTE=VALUE # or a single VALUE (equivalent to class=VALUE). 
The useful attributes are # # class Semantic class of a word. # pred Semantic predicate of a word, used in the logical form; # if omitted, defaults to the word's stem. # excluded List of excluded lexical categories. # coart Boolean indicating that this entry is a coarticulation, # eg a pitch accent, gesture, or other word-associated element. # # Any of FAMILY, ATTRS and/or FEATURES can be omitted. # # The second form above, with braces, is used for words with different # inflections. Instead of specifying the features directly after the word, # you list the features for each inflection separately. Note that * is # shorthand for the stem itself. # # Note that there can be more than one word {} declaration for a single stem. # # The families in FAMILY can be either a family name, from a family {} # block, or a part of speech. (`genccg' will derive the appropriate parts # of speech from any families given when creating the XML file.) Note that # the words associated with a particular family can be specified either by # tagging each word with its family, by listing a family's words explicitly # using the `member' declaration inside of a family {} block, or by a # combination of the two. word the:Det; word some:Det; word a:Det: sg; word for; word pro1:Pro(animate-being) { I: 1st sg nom sg-X; me: 1st sg acc sg-X; we: 1st pl nom pl-X; us: 1st pl acc pl-X; } word pro2:Pro(animate-being) { you: 2nd; } word pro3f:Pro(animate-being) { she: 3rd sg nom sg-X; her: 3rd sg acc sg-X; } word pro3m:Pro(animate-being) { he: 3rd sg nom sg-X; him: 3rd sg acc sg-X; } word pro3n:Pro(thing) { it: 3rd sg sg-X; } # If we want the CLASS associated with only some of the inflections, # we can use the same stem in more than one decl. (Assigning neuter to # "they" is strange but that's how the original morph.xml did it!) word pro3n:Pro { they: 3rd pl nom pl-X; them: 3rd pl acc pl-X; } # ################## Nouns ################# # # Here, we make use of macros. The basic idea behind macros is simple: # They simply do string substitution. However, parameters can be given, # making them very powerful. # This macro says: Every time an expression of the form basic-noun(...) # occurs, replace it with the text that comes after. The parameters will # be substituted into the text. The braces that denote the macro's text do # *NOT* form part of the text that is substituted. (Alternatively, you # can put the entire replacement text on the same line as the macro # definition, using a backslash at the end of the line if needed in order # to continue the definition on the next line.) Likewise, braces can be # used to surround text in an argument to a macro call and will not form of # the argument's text. This is useful when the argument's text contains # commas. In either case, if you really want the text itself to have # braces around it, you need to put two sets of braces around it. # Note that macro substitutions are processed recursively: If the # text of a macro substitution contains calls to other macros, they will # also be processed. This makes "inheritance" very easy to implement. # Inside of a macro definition's text, the . operator can be used; this # concatenates two words together into a single word. See the definition # of normal-noun() below for a simple example. # Substitution of arguments does not occur inside of quoted text. def basic-noun(sing, plur, class) { word sing:N(class) { *: sg sg-X; plur: pl pl-X; } } #def normal-noun(stem, class) { # word stem:N(class) { # *: sg sg-X; # stem . 
s: pl pl-X; # } #} # or equivalently, using our definition of basic-noun(): # def normal-noun(stem, class) basic-noun(stem, stem . s, class) # But in fact, we do something more clever to handle pluralization. # Here we make use of some built-in macros(): # # regsub(PATTERN, REPLACEMENT, TEXT): # Replace all occurrences of regexp PATTERN with REPLACEMENT in TEXT. # This follows normal Python conventions for regular expression # substitution. # # ifmatch(PATTERN, TEXT, IF-TEXT, ELSE-TEXT): # If TEXT matches PATTERN at its beginning, substitute IF-TEXT; else, # substitute ELSE-TEXT. # # ifmatch-nocase(PATTERN, TEXT, IF-TEXT, ELSE-TEXT): # Just like ifmatch() but its pattern-matching is case-insensitive. def pluralize(word) { # This shows a complicated expression involving the built-ins # 'ifmatch' and 'regsub'. If the word ends in a vowel + o or y, # we add s. Else, if the word ends in (consonant) + o or y, or s, sh, ch, # or x, we change y to i and add es. Finally, in all other cases, # just add s. So buy -> buys, boy-> boys, but try -> tries, lady -> ladies. # Similarly, go -> goes but goo -> goos. For words like volcano -> volcanos # you have to put the forms in manually (or change the rule here, and put # forms in -o + es in manually, e.g. does, goes, tomatoes, potatoes). # ifmatch('^.*[aeiou][oy]$', word, word . s, ifmatch('^.*([sxoy]|sh|ch)$', word, regsub('^(.*)y$', '\1i', word) . es, word . s)) } def noun(sing, class) basic-noun(sing, pluralize(sing), class) noun(book, thing) noun(DVD, thing) noun(glass, thing) noun(church, thing) noun(flower, thing) noun(bath, thing) noun(teacher, person) noun(lady, person) # Pluralized (correctly) to 'ladies' noun(boy, person) # Pluralized (correctly) to 'boys' # An irregular noun. basic-noun(policeman, policemen, person) # ################## Verbs ################# # # The "props" parameter specifies families or attributes. def basic-verb(stem, props, 3sing, pasttense) { word stem:props { *: pres non-3rd sg; 3sing: pres 3rd sg; *: pres pl; pasttense: past; } } def verb(stem, props, pasttense) \ basic-verb(stem, props, pluralize(stem), pasttense) verb(buy, TransV DitransBenV, bought) verb(rent, TransV DitransBenV, rented) verb(go, IntransV, went) verb(sleep, IntransV, slept) # Here we show how you can specify a predicate or other attribute. # Admittedly this is not too useful here, but it can be much more so in # the case of a foreign language, where we want the semantic predicates # to be in English. (See arabic.ccg.) verb(eat, TransV IntransV (pred=manjar), ate) ############################################################# # # # RULES # # # ############################################################# # This declaration specifies the contents of rules.xml. Each statement # specifies a single rule; it is also possible for statements to cancel # some or all rules. # # Note that some rules are enabled by default; this includes application, # composition and crossed composition (forward and backward in each case), # as well as forward type-raising from np to s/(s\np) and backward # type-raising from np to s$1\(s$1/np). rule { # turn off forward cross-composition no xcomp +; # this is how we could turn off all type-raising rules. # no typeraise; # Declare a backward type-raising rule from pp to s$1\(s$1/pp). # The $ causes a dollar-sign raise category to be created, as shown; # without it, we'd just get s\(s/pp). typeraise - $: pp => s; # Declare a type-changing rule to enable pro-drop (not useful in English!) 
# typechange: s[finite]\np[nom]$1 => s[finite]$1 ; } # This shows how you can turn off all defaults and specify your own # properties from scratch, if you want. # rule { # no; # remove all defaults # app +-; # comp +-; # +- means both forward and backward # xcomp -; # sub +-; # xsub +-; # # Defaults for typeraising are np => s, if omitted. # typeraise +; # typeraise - $; # } # ############################################################# # # # CATEGORIES # # # ############################################################# # Specify lexical families and the lexical insertion rules for each # family. Properties of the family can be given in parens after the # family name. The family name should either be the same as a part # of speech, or the part of speech should be given in parens after # the family name. # Categories can be specified in an expected form, e.g. s\np[nom]/np[acc]. # The notation np<3>[acc] corresponds to XML code like this: # # # # # # i.e. the <3> specifies the feature-structure ID, and the [acc] # specifies a constraint. In this case, a constraint of the form # [CASE=acc] is generated because of the feature {} declaration above. # If an unknown value is given, e.g. [foo], it's assumed to be a feature, # rather than a feature value, and you get code like # # # You can also write [foo=bar] to explicitly give a feature and value. # And an entry like [X] corresponds to # # Note also that slashes can be followed by a slash mode, e.g. /<, or # the mode can be omitted; in this case, a default mode is generated: # \<, />, |. family Det(indexRel=det) { entry: np<2>[X PERS=3rd] /^ n<2>[X]: X:sem-obj(*); } family Prep-Nom(Prep, indexRel="*NoSem*") { # You can name your entries, as shown, although it's not clear there's # much point. # # The pp<~3> notation generates an 'inheritsFrom' tag rather than an 'id' # tag for the feature structure. This unifies only the properties not # explicitly given in the category, i.e. everything but 'lex' will unify. # # The entry [lex=*] corresponds to # and means that a feature 'lex' will be attached, whose value is the # word stem. entry Nominal: pp<~3>[lex=*] /< np<3>[acc]; # If members are specified, the family defaults to 'closed'. member: for; } family Conj { entry: np[NUM, X0] \* np[X1] / np[X2]: X0(and ^ (L1 ^ elem ^ X1 ^ (L2 ^ elem ^ X2))); } family DollarTest { entry: s\np$1\*(s\np); entry: s\(np$1)\*(s\np); } family N { entry: n<2>[X NUM]: X:sem-obj(*); } family Pro { entry: np<2>[X NUM PERS CASE]: X:sem-obj(*); } # Here we create a macro to describe the category for a verb, with transitive # arguments ARGS (possibly empty) and corresponding semantics SEM. # Don't call this macro just "verb" since that's used above already! def verbcat(args, sem) { # A * here corresponds to . # Similar use of * appears above in [lex=*] in prepositions, # and * in determiners. s<1>[E] \ np<2>[X NUM PERS nom] args: E:action(* X:animate-being sem) } family IntransV(V) { entry: verbcat(,); } # This shows how we could extend verbcat() to handle transitive verbs. # Since the arguments for ditransitive verbs could potentially either # before or after the transitive argument, we need two different macros. # Or, we could create one macro, with two different parameters for "before" # and "after" arguments. def before_transverbcat(args, sem) { verbcat(args / np<3>[Y acc], Y:sem-obj sem) } def after_transverbcat(args, sem) { verbcat(/ np<3>[Y acc] args, Y:sem-obj sem) } family TransV(V) { entry: after_transverbcat(,); } family DitransBenV(V) { # Careful here! 
Remember that the arguments in a CCG category will be # listed in backwards order compared to how they appear in the surface # syntax. Hence SUBJ VERB X Y Z corresponds to s\np/z/y/x. entry DTV: after_transverbcat(/< np[Z acc], Z:animate-being); entry NP-PPfor: before_transverbcat(/ pp[Z lex=for], Z:animate-being); # This shows how we could define the previous entry directly in # terms of verbcat(). #entry NP-PPfor: verbcat(/ pp<4>[Z lex=for] /< np<3>[Y acc], # Y:sem-obj Z:animate-being); } ############################################################# # # # TESTBED # # # ############################################################# # Statements to put in testbed.xml. If you omit the number, it will omit # the corresponding numOfParses item in the XML; I think this currently # is equivalent to specifying 1 as the number. If you put a ! before a # line, this indicates a "known failure" (known="true" in the XML). testbed { the teacher buys the policeman a book: 1; the teacher buys the policemen some flowers: 1; the teachers buy a book: 3; she buys it: 3; she buy it: 0; # Possible example of a known failure, in case we know we don't correctly # reject "she buy buy": ! she buy buy: 0; they buys it: 0; she buys the flower for him: 1; she buys the flower for he: 0; her buys the flower for him: 0; he rented her a DVD: 1; he rented a DVD for her: 1; he rented a DVD her: 0; } ================================================ FILE: ccg-format-grammars/tinytiny/tinytiny.ccg ================================================ # A truly minimal grammar for CCG. # Ben Wing, May 2006 ################## Features ################# feature { CASE<2>: nom acc; NUM<2>: sg pl; PERS<2>: non-3rd {1st 2nd} 3rd; TENSE: past pres; SEM-NUM: sg-X pl-X; # Some alternate code that appeared from somewhere, I'm not sure anymore. # Enable the following three statements and disable the statement above # beginning `NUM<2>: ...'. It looks like it tries to implement something to # do with number agreement. (Note that number agreement is already taken care # of in the normal system. I'm not sure what the code is trying to do.) # num<2,X:num>: sg pl; # num<1,2>: sg-agr:sg pl-agr:pl; # num<2>: sg-2:sg pl-2:pl; ontology: sem-obj { phys-obj { animate-being { person } thing } situation { change { action } state } }; } ################## Words ################# # Example of how to have punctuation and other non-word characters in a # lexical item. word '.':Punc; word ',':Punc; word '?':Punc; word the:Det; word a:Det: sg; word pro1:Pro(animate-being) { I: 1st sg nom sg-X; me: 1st sg acc sg-X; we: 1st pl nom pl-X; us: 1st pl acc pl-X; } def noun(sing, plur, class) { word sing:N(class) { *: sg sg-X; plur: pl pl-X; } } noun(book, books, thing) noun(peach, peaches, thing) noun(boy, boys, person) noun(policeman, policemen, person) def verb(stem, props, 3sing, pasttense) { word stem:props { *: pres non-3rd sg; 3sing: pres 3rd sg; *: pres pl; pasttense: past; } } verb(go, IntransV, goes, went) verb(sleep, IntransV, sleeps, slept) verb(eat, TransV IntransV, eats, ate) verb(see, TransV, sees, saw) # This is a test of set arguments. Not clear it's useful or should be here; # maybe move to tiny.ccg? 
verb(setverb, SetargV, setverbs, setverbed) ################## Categories ################# family Punc { entry: punc; } family Det(indexRel=det) { entry: np<2>[X PERS=3rd]/^ n<2>[X]: X:sem-obj(*); } family N { entry: n<2>[X]: X:sem-obj(*); } family Pro { entry: np<2>[X]: X:sem-obj(*); } family IntransV(V) { entry: s<1>[E] \ np<2>[X nom]: E:action(* X:animate-being); } family TransV(V) { entry: s<1>[E] \ np<2>[X nom] / np<3>[Y acc]: E:action(* X:animate-being Y:sem-obj); } family SetargV(V) { entry: s<1>[E] {\np<2>[X nom], / np<3>[Y acc]}: E:action(* X:animate-being Y:sem-obj); } ################## Test sentences ################# testbed { the policemen eat: 2; the boys eat: 2; the boys eat the peaches: 1; the policeman sleeps: 1; the policemen sleeps: 0; the policemen sleep: 1; the policeman sleeps the peach: 0; the policeman saw me: 1; the policeman saw I: 0; I see the book: 1; I sees the book: 0; I see a book: 1; I see a books: 0; } ================================================ FILE: ccgbank/bin/american-to-logical-quotes.py ================================================ """ Copryright (c) 2011 Dennis N. Mehay Assumes tokenized, PTB3-normalized UTF-8 text, one sentence per line. => Turns 'American'-style quotations into 'British'/'logical'-style quotations. So, e.g., `` Hello , '' said John . becomes: `` Hello '' , said John . [Insert LGPL here] """ import sys, codecs, os streamReader = codecs.lookup("utf-8")[2] streamWriter = codecs.lookup("utf-8")[-1] sys.stdin = streamReader(sys.stdin) sys.stdout = streamWriter(sys.stdout) for ln in sys.stdin: # trim off extra whitespace and replace double spaces with single spaces. ln = ln.strip().replace(u" ", u" ") # now replace # , '' # with # '' # and # . '' # with # '' . ln = ln.replace(u" , ''", " '' ,").replace(u" . ''", " '' .") # now fix any double-punct messes this might have created. ln = ln.replace(u" '' . ?", u" . '' ?").replace(u" '' . !", u" . '' !") sys.stdout.write(ln + '\n') ================================================ FILE: ccgbank/bin/convert-mtc-systems.py ================================================ """ This program takes a set of documents (all streamed from stdin at once) and formats them in a way suitable for use with the NIST-distributed mteval script. The output is in UTF-8. Usage: cat [MTC_DIR_FOR_SYSTEM] | python convert-mtc.systems.py [doctype-string {'source', 'target', 'ref'} (default='target')] | [NEW_XML_DOC_TO_STDOUT] """ import sys, os, re, codecs, xml.sax.saxutils, my_unicode try: import chardet except: chardet = None from xml.etree.ElementTree import * from collections import defaultdict def tokenize(t): """very simple text tokenization: n't => n't 's => 's ' => ' where '' is not whitespace. 
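    e.g., reading off the replace calls below (assuming plain ASCII
    apostrophes): "John's dog doesn't bark ." becomes
    "John 's dog does n't bark ."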
""" t = t.replace("n't", " n't").replace("'s", " 's").replace("' ", " ' ") if t[-1] == "'": t = t[:-1] + " " + t[-1] return t.replace(" ", " ") def decode_line(ln, encoding): res = None try: res = ln.decode(encoding) except: try: res = ln.decode("iso-8859-2") except: try: res = ln.decode("iso-8859-1") except: try: res = ln.decode("utf-8") except: try: res = ln.decode("GB2312") except: try: res = ln.decode("Big5") except: try: res = ln.decode("EUC-TW") except: res = ln return res doc_pattern = re.compile(u"<[Dd][Oo][Cc] docid=\"(.*)\" sysid=\"(.*)\">(.*)") doc_pattern_source = re.compile(u"<[Dd][Oo][Cc] docid=\"(.*)\">(.*)") seg_pattern = re.compile(u"(.*)$") doc_type = "target" if len(sys.argv) >= 2: doc_type = sys.argv[1].lower() if not doc_type in ["target", "source", "reference"]: doc_type = "target" mtc_in = sys.stdin.readlines() mtc_all = (os.linesep).join(mtc_in) if not chardet is None: encoding = chardet.detect(mtc_all)['encoding'] else: encoding = "ISO-8859-2" # turn stdout into a UTF-8 converting writer. streamWriter = codecs.lookup("UTF-8")[-1] sys.stdout = streamWriter(sys.stdout) output = sys.stdout # map from auto-assigned ID to MTC ID. autoid2mtcid = {} mtc = defaultdict(lambda: []) sys = None for l in mtc_in: l = decode_line(l, encoding).strip() if l.startswith("" + \ os.linesep + "" + os.linesep +\ "" + os.linesep) if doc_type == "target": output.write(" " % sys + os.linesep) elif doc_type == "source": output.write(" " + os.linesep) else: output.write(" " + os.linesep) docs = mtc.keys() docs.sort() for (sy,doc) in docs: output.write(" " % doc + os.linesep) segids_and_texts = mtc[(sy,doc)] segids_and_texts.sort(lambda a,b: cmp(int(a[0]),int(b[0]))) for (segid,text) in segids_and_texts: output.write(" %s " % (segid,xml.sax.saxutils.escape(my_unicode.removeInvalidChars(tokenize(text)))) + os.linesep) output.write(" " + os.linesep) if doc_type == "target": output.write(" " + os.linesep + "") elif doc_type == "source": output.write(" " + os.linesep + "") else: output.write(" " + os.linesep + "") ================================================ FILE: ccgbank/bin/convert-spaces-to-newlines.py ================================================ # # converts spaces to newlines, and newlines to special chars, # from stdin to stdout # import sys, re; [sys.stdout.write(re.sub(' ','\n',re.sub('\n','',line))) for line in sys.stdin] ================================================ FILE: ccgbank/bin/convert_all ================================================ #!/bin/bash for i in 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 do echo "Starting conversion for section $i" nice ccg-build -Dsect=$i -Dfile=* convert-puncts-aux-bbn >& logs/log.convert.$i & done ================================================ FILE: ccgbank/bin/correlate-to-judgments.py ================================================ """ Correlate human judgments (streamed into sys.stdin -- e.g., from the MTC) to the BLEU/NIST scores in the given directories (sys.argv[1] and sys.argv[2]). Depends on rpy2 having been installed. """ import sys, os try: import rpy2 import rpy2.robjects as robjs except ImportError: print >> sys.stderr, "please install rpy2. exiting..." sys.exit(-1) def num2string(n): """ E.g., 0 => '00', 1 => '01' and 10 => '10'. """ try: numm = int(n) except: numm = n if numm < 10: return str(numm) else: return str(numm) human_judgments = [] human_judgments_fluency = [] # fluency. human_judgments_acc = [] # accuracy. human_judgments_ave = [] # average of both. 
for l in sys.stdin: l = l.strip() if l.startswith("#"): continue lparts = l.split(",") # appending (sys,doc,judge,ref_sys,segment,fluency,accuracy) human_judgments.append(tuple(lparts[0:7])) human_judgments_fluency.append(int(lparts[5])) human_judgments_acc.append(int(lparts[6])) human_judgments_ave.append((float(lparts[5]) + float(lparts[6]))/2.0) human_judgments_fluency = robjs.FloatVector([float(i) for i in human_judgments_fluency]) human_judgments_acc = robjs.FloatVector([float(i) for i in human_judgments_acc]) human_judgments_ave = robjs.FloatVector([float(i) for i in human_judgments_ave]) bleu_nist_dir1 = sys.argv[1] bleu_nist_dir2 = sys.argv[2] # the next two are maps from: (sys,doc,ref,segID) => score bleu_scores = {} nist_scores = {} for f in [fl for fl in os.listdir(bleu_nist_dir1) if ("BLEU" in fl or "NIST" in fl)]: for l in open(bleu_nist_dir1 + os.sep + f, "rb").readlines(): if l.strip() == "": continue (sys,ref_sys,doc,seg,bleu_or_nist_score) = l.split(",") if "BLEU" in f: bleu_scores[(sys,doc,ref_sys,"S"+num2string(seg))] = bleu_or_nist_score else: nist_scores[(sys,doc,ref_sys,"S"+num2string(seg))] = bleu_or_nist_score for f in [fl for fl in os.listdir(bleu_nist_dir2) if ("BLEU" in fl or "NIST" in fl)]: for l in open(bleu_nist_dir2 + os.sep + f, "rb").readlines(): if l.strip() == "": continue (sys,ref_sys,doc,seg,bleu_or_nist_score) = l.split(",") if "BLEU" in f: bleu_scores[(sys,doc,ref_sys,"S"+num2string(seg))] = float(bleu_or_nist_score) else: nist_scores[(sys,doc,ref_sys,"S"+num2string(seg))] = float(bleu_or_nist_score) # for both BLEU and NIST, compute rpy2 vectors that parallel the seqeuence # of human judgments. # step through the (sys,doc,judge,ref_sys,segment,fluency,accuracy) tuples. bleu_lst = [] nist_lst = [] for (s,d,j,rs,sg,f,a) in human_judgments: if (s,d,rs,sg) in bleu_scores: bleu_lst.append(bleu_scores.get((s,d,rs,sg))) else: print "nope", (s,d,rs,sg) if (s,d,rs,sg) in nist_scores: nist_lst.append(nist_scores.get((s,d,rs,sg))) else: print "nope", (s,d,rs,sg) bleu_vec = robjs.FloatVector(bleu_lst) nist_vec = robjs.FloatVector(nist_lst) # compute correlations b_fluency = robjs.r['cor'](bleu_vec, human_judgments_fluency) b_accuracy = robjs.r['cor'](bleu_vec, human_judgments_acc) b_average = robjs.r['cor'](bleu_vec, human_judgments_ave) print "BLEU's Pearson correlation wrt fluency:", b_fluency print "BLEU's Pearson correlation wrt accuracy:", b_accuracy print "BLEU's Pearson correlation wrt the average of fluency and accuracy", b_average nist_fluency = robjs.r['cor'](nist_vec, human_judgments_fluency) nist_accuracy = robjs.r['cor'](nist_vec, human_judgments_acc) nist_average = robjs.r['cor'](nist_vec, human_judgments_ave) print "NIST's Pearson correlation wrt fluency:", nist_fluency print "NIST's Pearson correlation wrt accuracy:", nist_accuracy print "NIST's Pearson correlation wrt the average of fluency and accuracy", nist_average ================================================ FILE: ccgbank/bin/filter_feats.py ================================================ #!/usr/bin/env python """ (c) 2008 Dennis N. Mehay Use this file any way you want, just please give the author credit if it makes it into any research in any meaningful way. I make no claims whatsoever about the fitness or merchantability of this code. Use at your own risk. 
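As read off the parsing code below, each input line is one instance: either
boolean features, "TAG feat1 feat2 ...", or real-valued features,
"TAG feat1:val1 feat2:val2 ...". Instances whose TAG occurs fewer than -n
times are dropped, and features occurring fewer than -f times are dropped.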
""" import sys, math from optparse import OptionParser as OP pr = OP() pr.add_option("-i","--input",type="string",help="input source [default=].",\ default=sys.stdin) pr.add_option("-o","--output",type="string",help="output location [default=].",\ default=sys.stdout) pr.add_option("-n","--number",type="int",\ help="number of times a category must have been seen to retain [default=5]",\ default=5) pr.add_option("-f","--feat_freq",type="int",help="feature frequency cutoff\n"+\ "(how frequent must a feature be to retain it? [default=1])",default=1) (opts,args) = pr.parse_args(sys.argv) inf = opts.input if not inf is sys.stdin: inf = open(inf,'r') outf = opts.output if not outf is sys.stdout: outf = open(outf,'w') try: tag_cnt = {} ft_cnt = {} lines = {} ln_cnt = -1 for l in inf: l = l.strip() ln_cnt += 1 lines[ln_cnt] = l l = l.split() tag = l[0] tag_cnt[tag] = tag_cnt.get(tag,0) + 1 feats = l[1:] if ':' in feats[0] and not(feats[-1]==':'): # real-valued features feats = map(lambda a: (a[0:a.rfind(':')],a[a.rfind(':')+1:]), l[1:]) for (f,act) in feats: ft_cnt[f] = ft_cnt.get(f,0) + 1 #math.fabs(float(act)) else: # boolean for f in feats: ft_cnt[f] = ft_cnt.get(f,0) + 1 for i in range(ln_cnt + 1): l = lines.get(i) l = l.strip().split() tag = l[0] feats = l[1:] if tag_cnt.get(tag) >= opts.number: tag_printed = False if ':' in feats[0] and not(feats[-1]==':'): # real-valued features feats = map(lambda a: (a[0:a.rfind(':')],a[a.rfind(':')+1:]), l[1:]) for (f,act) in feats: if ft_cnt.get(f) >= opts.feat_freq: if not tag_printed: print >> outf, tag, tag_printed = True print >> outf, f+':'+act, else: # boolean for f in feats: if not tag_printed: print >> outf, tag, tag_printed = True print >> outf, f, print >> outf, '' if i%100==0: outf.flush() finally: outf.flush() if not inf is sys.stdin: inf.close() if not outf is sys.stdout: outf.close() ================================================ FILE: ccgbank/bin/find-betas-no-gold.py ================================================ """ Given: (1) a file supertagged words (OpenCCC file output format as produced by, e.g., WordAndPOSDictionaryLabellingStrategy), (2) a list (as a string) of tagging ambiguity levels (e.g., "1.4 1.6 1.8...") that represent the desired tag/word levels (rounded off at the hundredths place to <=1.41, <=1.61, etc.), (3) possibly tagging dictionaries (if needed), and (4) the corresponding 'K' parameters (e.g., "20 150" as in Clark and Curran (2007)), produce the list of betas that would produce those ambiguity levels. """ import optparse import sys import decimal import math """ A little on-the-fly class creation for iterating through multi-stag format files. """ class STIterator: def __init__(self, f): self.f = f def next(self): l = self.f.readline() while l.strip()=='' and l: l = self.f.readline() if l: lines = [] # not at eof. if l.strip() != "": print >> sys.stderr, "line=", l.strip(), "ill-formed st output file." raise Exception else: l = self.f.readline() while l.strip() != "" and l: lines.append(l.strip()) l = self.f.readline() if lines == []: print >> sys.stderr, "line=", l.strip(), "ill-formed st output file." raise Exception else: res = [] for ln in lines: # each line is: word POS1 ... POSN ST1 ... STM # we just want the word, first pos and supertags. 
parts = ln.split() wd = parts[0] pos = parts[2] how_many_poss = int(parts[1]) stgs = zip(parts[2+(how_many_poss*2)+1::2],map(lambda n: float(n), parts[2+(how_many_poss*2)+2::2])) res.append((wd,pos,stgs)) return res else: raise StopIteration def __iter__(self): return self p = optparse.OptionParser() p.add_option("-i", "--inputf", type="string", help="input source [default=]", default=sys.stdin) p.add_option("-o", "--outputf", type="string", help="output destination [default=]", default=sys.stdout) p.add_option("-a", "--ambiguities", type="string", help="a space delimited string of tagging ambiguity levels [default=\"1.2 1.4 1.6 1.8 2.0 2.5 3.0 3.5\"]", default="1.2 1.4 1.6 1.8 2.0 2.5 3.0 3.5") p.add_option("-K", "--Ks", type="string", help="a space delimited string of K values (only two) [default=\"20 150\", optional]", \ default="20 150") p.add_option("-w", "--wordkeyeddict", type="string", help="word-keyed tagging dict [no default, optional]",\ default=None) p.add_option("-p", "--poskeyeddict", type="string", help="POS-keyed tagging dict [no default, optional]",\ default=None) (ops,args) = p.parse_args() try: # POS-keyed dict must be there if word-keyed one is. assert (not (not (ops.wordkeyeddict is None) and (ops.poskeyeddict is None))) except: print >> sys.stderr, "need POS-keyed dict if using word-keyed dict." sys.exit(-1) inf = ops.inputf if not inf is sys.stdin: inf = open(inf, 'r') outf = ops.outputf if not outf is sys.stdout: outf = open(outf, 'w') input_sents = [s for s in STIterator(inf)] wdict = {} if not ops.wordkeyeddict is None: entries = map(lambda l: l.split(), open(ops.wordkeyeddict, 'r').readlines()) wdict[entries[0]] = (int(entries[1]), set(entries[2:])) pdict = {} if not ops.poskeyeddict is None: entries = map(lambda l: l.split(), open(ops.poskeyeddict, 'r').readlines()) posdict[entries[0]] = set(entries[1:]) try: ambs = map(lambda a: float(a), ops.ambiguities.split()) betas = [] current_beta = 1.0 last_beta_above = None last_beta_below = 0.0 total_tags = 0.0 total_words = 0.0 total_right = 0.0 ks = map(lambda kay: int(kay), ops.Ks.split()) for a in ambs: current_beta = 1.0 last_beta_above = None last_beta_below = 0.0 k = ks[0] if a!=ambs[-1] else ks[1] found = False while not found: total_tags = 0.0 total_words = 0.0 total_right = 0.0 for insent in input_sents: for lex in insent: total_words += 1 w = lex[0] pos = lex[1] stags = lex[2] if len(wdict)>0: # filter with appropriate dictionary. (freq,tags) = wdict.get(w,(0,set([]))) if freq >= k: tags = tags else: tags = pdict.get(pos,set([])) if len(tags)>0: stags = filter(lambda st: st[0] in tags, stags) best = stags[0][1] # how many tags are there that made the beta cut-off? total_tags += len(filter(lambda st: st[1] >= (current_beta * best), stags)) # round to the nearest hundredth tags_per_word = (float(total_tags)/total_words) decimal.getcontext().prec = 4 as_string = str(decimal.Decimal(str(tags_per_word))) # we're looking for 1.40..., or 1.60..., etc. (as the case may be) found = tags_per_word == a or ((as_string[:3]==str(a)[:3]) and (as_string[3]=='0')) if found: betas.append(current_beta) current_beta = 1.0 last_beta = None else: # decide which direction to loosen the beta. if tags_per_word > a: # get more restrictive (i.e., larger beta). 
if last_beta_above is None: print >> sys.stderr, "error" sys.exit(-1) else: tempbeta = current_beta current_beta += math.fabs(last_beta_above - current_beta)/2.0 last_beta_below = tempbeta else: # get less restrictive (i.e., smaller beta) tempbeta = current_beta current_beta -= math.fabs(current_beta - last_beta_below)/2.0 last_beta_above = tempbeta print >> outf, "betas", ' '.join(map(lambda b: str(b), betas)) except: print "Unexpected error:", sys.exc_info()[0] raise finally: # clean up, clean up... if not inf is sys.stdin: inf.close() if not outf is sys.stdout: outf.close() ================================================ FILE: ccgbank/bin/gen_parser_events_a ================================================ #!/bin/bash for i in 02 03 04 05 do ccg-build -Dsect=$i -f build-ps.xml gen-parser-events &> logs/log.gen.parser.events.$i done ================================================ FILE: ccgbank/bin/gen_parser_events_b ================================================ #!/bin/bash for i in 06 07 08 09 do ccg-build -Dsect=$i -f build-ps.xml gen-parser-events &> logs/log.gen.parser.events.$i done ================================================ FILE: ccgbank/bin/gen_parser_events_c ================================================ #!/bin/bash for i in 10 11 12 13 do ccg-build -Dsect=$i -f build-ps.xml gen-parser-events &> logs/log.gen.parser.events.$i done ================================================ FILE: ccgbank/bin/gen_parser_events_d ================================================ #!/bin/bash for i in 14 15 16 17 do ccg-build -Dsect=$i -f build-ps.xml gen-parser-events &> logs/log.gen.parser.events.$i done ================================================ FILE: ccgbank/bin/gen_parser_events_e ================================================ #!/bin/bash for i in 18 19 20 21 do ccg-build -Dsect=$i -f build-ps.xml gen-parser-events &> logs/log.gen.parser.events.$i done ================================================ FILE: ccgbank/bin/gen_realizer_events_a ================================================ #!/bin/bash for i in 02 03 04 05 do ccg-build -Dsect=$i -f build-rz.xml gen-realizer-events &> logs/log.gen.realizer.events.$i done ================================================ FILE: ccgbank/bin/gen_realizer_events_b ================================================ #!/bin/bash for i in 06 07 08 09 do ccg-build -Dsect=$i -f build-rz.xml gen-realizer-events &> logs/log.gen.realizer.events.$i done ================================================ FILE: ccgbank/bin/gen_realizer_events_c ================================================ #!/bin/bash for i in 10 11 12 13 do ccg-build -Dsect=$i -f build-rz.xml gen-realizer-events &> logs/log.gen.realizer.events.$i done ================================================ FILE: ccgbank/bin/gen_realizer_events_d ================================================ #!/bin/bash for i in 14 15 16 17 do ccg-build -Dsect=$i -f build-rz.xml gen-realizer-events &> logs/log.gen.realizer.events.$i done ================================================ FILE: ccgbank/bin/gen_realizer_events_e ================================================ #!/bin/bash for i in 18 19 20 21 do ccg-build -Dsect=$i -f build-rz.xml gen-realizer-events &> logs/log.gen.realizer.events.$i done ================================================ FILE: ccgbank/bin/get-text-from-mtc-style.py ================================================ """ Gets the text from a MTC-style corpus. Just looks for ' ' segments. 
=> (c) 2011 Dennis Nolan Mehay [Insert LGPL here] """ import re, sys, codecs, os pattern = re.compile(u"\\s*(.*)\\s*") input = sys.stdin.read() try: import chardet encoding = chardet.detect(input)['encoding'] except: # this is what the original MTC corpus is encoded in. encoding = "iso-8859-2" input = input.decode(encoding) streamWriter = codecs.lookup(encoding)[-1] sys.stdout = streamWriter(sys.stdout) for seg in pattern.findall(input): sys.stdout.write(seg.strip() + os.linesep) ================================================ FILE: ccgbank/bin/get-truecase-list.py ================================================ """ Requires Python >= 2.6x+ but < 3.0. Takes in a stream (from stdin) or file of *tokenized* plain text (utf-8), returns a list of words that occurred more than twice and were in upper-case more frequently than not. """ import optparse, sys, codecs, os from collections import defaultdict def isAllUpper(st): return ( st.upper() == st and st.lower() != st ) op = optparse.OptionParser() op.add_option("-i", "--input", type="string", help="input file or stream (default = )", default=sys.stdin) op.add_option("-o", "--output", type="string", help="output file or stream (default = )", default=sys.stdout) op.add_option("-f", "--use_first", action="store_true", help="whether to use the first word of each "+\ "sentence for counting uppercase vs. lowercase (default = False)", default=False) (ops, args) = op.parse_args() inf = ops.input if not inf is sys.stdin: inf = open(inf, "rb") use_first_word = ops.use_first outf = ops.output if not outf is sys.stdout: outf = codecs.open(outf, "wb", "utf-8") else: # make stdout code utf-8 streamWriter = codecs.lookup("UTF-8")[-1] outf = streamWriter(outf) # map from: lowercased_word_key => specific_cased_form => count wds2ulcounts = defaultdict(lambda: defaultdict(lambda: 0)) try: l = inf.readline() while l: l = l.strip().decode("utf-8") if l == u"": continue words = l.split() if not use_first_word: words = words[1:] for w in words: key = w.lower() wds2ulcounts[key][w] += 1 l = inf.readline() for (wdkey,frms) in wds2ulcounts.items(): wdforms = frms.items() # sum all counts. if more than 2, then write out the most frequent, else don't. sum_all = sum([cnt for (wf,cnt) in wdforms]) # if there is only one form, seen more than once and it is a cased form, print it (this last will avoid printing punctuation and # always-lowercase words like 'the'). if len(wdforms) == 1: most_freq = wdforms[0] if sum_all > 2 and most_freq[0].lower() != most_freq[0]: #outf.write("wd %s only has one form, seen %d times" % (wdforms[0][0], wdforms[0][1]) + os.linesep) outf.write(most_freq[0] + os.linesep) else: continue else: if sum_all == 2: #outf.write("wd %s only occurred twice. cannot decide which is most frequent." % (wdkey) + os.linesep) continue else: wdforms.sort(lambda a,b: -cmp(a[1],b[1])) most_freq = wdforms[0] second_most_freq = wdforms[1] # see whether there is a tie. if so, no dice. if most_freq[1] == second_most_freq[1]: #outf.write("wd %s occurred more than twice, but there was a tie btw forms %s and %s (perhaps others)." % \ # (wdkey, wdforms[0][0], wdforms[1][0]) + os.linesep) continue else: # only mention it if the most freq form is uppercased somewhere. if most_freq[0][0].lower() != most_freq[0][0]: #outf.write("wd %s occurred most with form %s." 
% (wdkey, wdforms[0][0]) + os.linesep) outf.write(most_freq[0] + os.linesep) finally: try: outf.close() except: pass try: inf.close() except: pass ================================================ FILE: ccgbank/bin/get-uniq-nbest.py ================================================ from BeautifulSoup import BeautifulStoneSoup as BSS import codecs import sys, os streamWriter = codecs.lookup('utf-8')[-1] sys.stdout = streamWriter(sys.stdout) inf = open(sys.argv[1], "rb").read() try: beta = float(sys.argv[2]) except: beta = 0.1 soup = BSS(inf) segs = soup.findAll(lambda t: t.name == u'seg') tot_paraphrases = 0.0 tot_segs = 0.0 for seg in segs: tot_segs += 1 if seg.get('complete') == 'true': best = seg.find(lambda p: p.name == 'best') ref = seg.ref.find(text=True) eye_dee = seg.get(u'id') paraphrases = set([p.find(text=True) for p in seg.findAll(lambda e: e.name in [u'best', u'next'])]) tot_paraphrases += len(paraphrases) sys.stdout.write(ref + u' ||| ' + u' <-> '.join(paraphrases)) sys.stdout.write(os.linesep) print "ave paraphrases/seg", tot_paraphrases/tot_segs ================================================ FILE: ccgbank/bin/get_factors_from_parse.py ================================================ #!/usr/bin/env python """ (c) 2008 Dennis N. Mehay Use this file any way you want, just please give the author credit if it makes it into any research in any meaningful way. I make no claims whatsoever about the fitness or merchantability of this code. Use at your own risk. Take a file of CCGbank-style parses and get the words, POSs and lexical cat's from them. We also insert the word as the 'lemma', just as a placeholder. So we have the following output form (for each parse in the input file): ||| ... ||| Print out parse IDs (if there) as they are. """ import sys, re import optparse p = optparse.OptionParser() p.add_option("-i", "--inputf", type="string", \ help="Input file to be postprocessed (one parse per line with IDs preceding them) [defaults to stdin]",\ default=None) p.add_option("-o", "--outputf", type="string", \ help="The output location [defaults to stdout]",\ default=None) (opts, args) = p.parse_args(sys.argv) inf = None outf = None if opts.inputf is None: inf = sys.stdin else: inf = open(opts.inputf,'r') if opts.outputf is None: outf = sys.stdout else: outf = open(opts.outputf,'w') try: global lexNodePattern lexNodePattern = re.compile(r'()+?') """ A procedure that returns a list of all lexical nodes in a CCGbank-style parse tree (in string representation). """ def getLexicalNodes(tree): matches = re.findall(lexNodePattern, tree) return matches for l in inf: if "ID=" in l: print >> outf, l.strip() elif l.strip()!='': nodes = getLexicalNodes(l.strip()) ans = '' for n in nodes: parts = n.split() (w,pos,st) = (parts[4],parts[2],parts[1]) ans += w+'|'+w+'|'+pos+'|'+st+ ' ' print >> outf, ans.strip() finally: if not opts.inputf is None: inf.close() if not opts.outputf is None: outf.close() ================================================ FILE: ccgbank/bin/get_just_words_from_ner_text.py ================================================ """ Takes NE tagged text from stdin (assuming utf-8) and does just what it says: prints to stdout only the words. 
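For example (hypothetical input/output), a line such as

    George_Bush_PERSON visited Crawford_LOCATION yesterday .

would be printed as

    George_Bush visited Crawford yesterday .

(only the trailing _LABEL is dropped; multi-word NE tokens stay fused).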
""" import codecs, sys, os from optparse import OptionParser as OP pr = OP() pr.add_option("-o","--output",type="string",help="output location [default=].",\ default=sys.stdout) (opts,args) = pr.parse_args(sys.argv) outf = opts.output if not outf is sys.stdout: outf = open(outf,'w') streamWriter = codecs.lookup("utf-8")[-1] outw = streamWriter(outf) for l in sys.stdin: l = l.decode("utf-8") parts = l.split() snt = [] for p in parts: if u"_" in p: subparts = p.split(u"_") if len(subparts) > 1: w = u"_".join(subparts[:-1]) else: w = p snt.append(w) else: snt.append(p) outw.write(u" ".join(snt) + '\n') outf.flush() if not outf is sys.stdout: outf.close() ================================================ FILE: ccgbank/bin/lowercase_tagged_text.py ================================================ #!/usr/bin/env python """ Takes a POS-tagged file and writes out the text with tokens lowercased except for proper nouns. A file with the list of word-tag pairs can also be written out. (c) 2010 Michael White [insert LGPL here] """ import sys from optparse import OptionParser as OP pr = OP() pr.add_option("-i","--input",type="string",help="input source [default=]",\ default=sys.stdin) pr.add_option("-o","--output",type="string",help="output location [default=]",\ default=sys.stdout) pr.add_option("-p","--pairs",type="string",help="output file for word-tag pairs",\ default=None) (opts,args) = pr.parse_args(sys.argv) inf = opts.input if not inf is sys.stdin: inf = open(inf,'r') outf = opts.output if not outf is sys.stdout: outf = open(outf,'w') pairsf = opts.pairs if not pairsf is None: pairsf = open(pairsf,'w') try: sent = [] pairs = [] for l in inf: l = l.strip() l = l.split() if l[0] == '': sent = [] pairs = [] elif l[0] == '': print >> outf, ' '.join(sent) if not pairsf is None: for (token,tag) in pairs: print >> pairsf, token, tag else: token,tag = l[0],l[1] if tag[:3] != 'NNP' and (len(token) <= 1 or not token[1].isupper()): token = token.lower() sent.append(token) pairs.append((token,tag)) if len(l) >= 4: tag2 = l[3] pairs.append((token,tag2)) finally: if not inf is sys.stdin: inf.close() if not outf is sys.stdout: outf.close() if not pairsf is None: pairsf.close() ================================================ FILE: ccgbank/bin/merge-mtc-ids.py ================================================ """ This program re-inserts the MTC unique IDs (sys+DOC+segment) into an auto-number-ID'ed parse of said MTC (or similar) document produced by OpenCCG's 'ccg-parse'. Usage: python merge-mtc-ids.py [output-of-OpenCCG-parser] [MTC-like-input-file] > [output-of-OpenCCG-parser-with-original-MTC-ids] """ import sys, os, re, codecs try: import chardet except: chardet = None from xml.etree.ElementTree import * doc_pattern = re.compile(u"<[Dd][Oo][Cc] docid=\"(.*)\" sysid=\"(.*)\">(.*)") seg_pattern = re.compile(u"(.*)$") openccg_in = sys.argv[1] #mtc_in = codecs.open(sys.argv[2], "rb", "utf-8").read() if not chardet is None: encoding = chardet.detect(open(sys.argv[2], "rb").read())['encoding'] else: encoding = "ISO-8859-2" mtc_in = codecs.open(sys.argv[2], "rb", encoding).readlines() # turn stdout into a UTF-8 converting writer. streamWriter = codecs.lookup(encoding)[-1] sys.stdout = streamWriter(sys.stdout) output = sys.stdout # map from auto-assigned ID to MTC ID. 
autoid2mtcid = {} mtc_ids = [] for l in mtc_in: l = l.strip() if l.startswith("" + os.linesep + "" + os.linesep) for event, elem in iterparse(openccg_in): if elem.tag.lower() == "item": next_mtc_id = mtc_ids.pop(0) elem.set("info", u",".join(next_mtc_id[:-1])) output.write(u"\t" + tostring(elem).strip() + os.linesep) output.write("" + os.linesep) ================================================ FILE: ccgbank/bin/merge-stanford-morpha-with-pos.py ================================================ """ Given two files: (1) the output of Stanford's re-implementation of 'morpha' and (2) the 'pairs' file of (), merge them into a morph.xml file. """ import sys, codecs, os from optparse import OptionParser as OP from xml.sax import saxutils pr = OP() pr.add_option("-m","--morpha_input",type="string",help="morpha input file (required arg)",\ default=None) pr.add_option("-p","--pairs_input",type="string",help="pairs input file (required arg)",\ default=None) pr.add_option("-o","--output",type="string",help="output location [default=]",\ default=sys.stdout) (opts,args) = pr.parse_args(sys.argv) # we do not check that you passed in the files (this is intended for internal use only, not as a # user-friendly app). pinf = codecs.open(opts.pairs_input,'rb','utf-8') minf = codecs.open(opts.morpha_input,'rb','utf-8') outf = opts.output if not outf is sys.stdout: outf = codecs.open(outf,'wb','utf-8') else: streamWriter = codecs.lookup("utf-8")[-1] outf = streamWriter(sys.stdout) pl = pinf.readline() ml = minf.readline() outf.write('' + '\n') outf.write('' + '\n') entries = [] try: while pl and ml: pl = pl.strip() ml = ml.strip() pl = pl.split() ml = ml.split() # skip blank lines from line ending differences if len(pl) < 2: pl = pinf.readline() ml = minf.readline() continue s = [' 2: s.append(' class="') s.append(saxutils.escape(pl[2])) s.append('"') if ml[0].lower() != pl[0].lower() and not ("^" in ml[0] or "*****" in ml[0]): # add stem only if distinct. s.append(' stem="') s.append(saxutils.escape(ml[0])) s.append('"') s.append('/>') entries.append(s[:]) pl = pinf.readline() ml = minf.readline() # sort/uniq entries.sort() last_one = None for e in entries: if last_one is None or e != last_one: outf.write(u''.join(e) + '\n') last_one = e outf.write('' + '\n') finally: pinf.close() minf.close() if not outf is sys.stdout: outf.close() ================================================ FILE: ccgbank/bin/merge_pos_ne.py ================================================ #!/usr/bin/env python """ Takes a POS-tagged file and a file of the original, NE-tagged text and writes out a file of: ... [word][POS][SEM_CLASS](if any)[stem](if any) ... (c) 2011 Dennis N. 
Mehay [insert LGPL here] """ class POSOutputIter: def __init__(self, filelikeobj): self.f = filelikeobj def __iter__(self): return self def next(self): try: nxtLn = self.f.readline().strip() nxtSent = [] if nxtLn != "": raise StopIteration nxtLn = self.f.readline().strip() while nxtLn != "": nxtSent.append(nxtLn) nxtLn = self.f.readline().strip() return nxtSent except: raise StopIteration import sys, codecs, os from optparse import OptionParser as OP pr = OP() pr.add_option("-p","--pos_in",type="string",help="POS-tagged input",\ default=None) pr.add_option("-n","--ner_tagged_in",type="string",help="NE-tagged input (no POS tags yet)",\ default=None) pr.add_option("-o","--output",type="string",help="output location [default=]",\ default=sys.stdout) (opts,args) = pr.parse_args(sys.argv) pinf = opts.pos_in if not pinf is sys.stdin: pinf = codecs.open(pinf,'r', 'utf-8') ninf = opts.ner_tagged_in if not ninf is sys.stdin: ninf = codecs.open(ninf,'r', 'utf-8') outf = opts.output if not outf is sys.stdout: outf = codecs.open(outf,'wb', 'utf-8') else: streamWriter = codecs.lookup("utf-8")[-1] outf = streamWriter(sys.stdout) try: for posSent in POSOutputIter(pinf): origSent = ninf.readline() for (posTW,NETagW) in zip(posSent, origSent.split()): NETagWParts = NETagW.split(u"_") if len(NETagWParts) > 1: NETag = u"\t" + NETagWParts[-1] else: NETag = "" posTW = posTW.split() w = posTW[0] tgs = posTW[1:][::2][:2] for t in tgs: outf.write(w + u"\t" + t + NETag + '\n') finally: pinf.close() ninf.close() outf.close() ================================================ FILE: ccgbank/bin/my_unicode.py ================================================ """ Useful functions for dealing with Unicode messiness that arises from dealing with messy input (e.g., gibberish from the Multiple Translation Chinese corpus). """ import re, doctest eval(r'u"[\u0080-\uffff]+"') RE_XML_ILLEGAL = u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' + \ u'|' + \ u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' % \ (unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff), unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff), unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff)) invalid_char_re = re.compile("[^\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD]") def removeInvalidChars(text): """ Text is a unicode string. All characters that are not valid XML characters are removed. """ return re.sub(RE_XML_ILLEGAL, "?", text) if __name__=="__main__": doctest.testmod() ================================================ FILE: ccgbank/bin/nbest-mtc-to-bleu-nist.py ================================================ """ This program takes the n-best realizer output as one file (with sys+DOC+segment IDs -- 'info' attributes), the tb.xml parser output of OpenCCG (for grabbing the strings of those things that did not parse), and creates an XML form suitable for use as a reference in the NIST-distributed BLEU script. Usage: python nbest-mtc-to-bleu-nist.py [nbest-from-realizer] [parser-output(tb.xml)] [max-n] | [NIST/BLEU-compatible-multiref-file] """ import sys, os, re, codecs, tempfile, xml.sax.saxutils try: import chardet except: chardet = None from xml.etree.ElementTree import * from collections import defaultdict # hack procedure. remove later. 
def remove_ne(txt): # remove: Time, Location, Organization, Person, Money, Percent, Date txt = txt.replace(" LOCATION", "").replace(" PERSON", "").replace(" MONEY", "").replace(" PERCENT", "").replace(" DATE", "").replace(" TIME", "").replace(" ORGANIZATION", "") return txt doc_pattern = re.compile(u"<[Dd][Oo][Cc] docid=\"(.*)\" sysid=\"(.*)\">(.*)") seg_pattern = re.compile(u"(.*)$") openccg_all = open(sys.argv[1], "rb").read().replace("_&", "_&").replace(" & ", "& ") parser_all = open(sys.argv[2], "rb").read().replace("_&", "_&").replace(" & ", "& ") if not chardet is None: encoding1 = chardet.detect(openccg_all)['encoding'] else: encoding1 = "utf-8" if not chardet is None: encoding2 = chardet.detect(parser_all)['encoding'] else: encoding2 = "utf-8" openccg_src = tempfile.NamedTemporaryFile() openccg_src.write(openccg_all) openccg_src.flush() openccg_all = None parser_src = tempfile.NamedTemporaryFile() parser_src.write(parser_all) parser_src.flush() parser_all = None try: n_size = int(sys.argv[3]) except: n_size = 4 # turn stdout into a UTF-8 converting writer. streamWriter = codecs.lookup("utf-8")[-1] sys.stdout = streamWriter(sys.stdout) output = sys.stdout # list of list of (ID/ref pairs) refs = [] # max number of unique refs in any (there may not 'n_size' in any of them) max_num_refs = 0 # map from docID -> [(segID, [text])] doc_to_segs = defaultdict(lambda: []) # get unparsed strings. for event, elem in iterparse(parser_src.name): if elem.tag.lower() == "item" and elem.get("numOfParses") == "0": txt = elem.get("string").strip() (sys,doc,seg) = elem.get("info").split(",") doc_to_segs[doc].append((seg, [txt])) for event, elem in iterparse(openccg_src.name): if elem.tag.lower() == "seg": (sys,doc,seg) = elem.get("id").split(",") is_complete = True if elem.get("complete") else False nbest_realizations = [] if not is_complete: # just get the original input. for child in list(elem): if child.tag.lower() == "ref": nbest_realizations.append(child.text.strip()) else: # get the n-best (only keeping unique strings), so, e.g., 4-best might turn into # 1-best if they're all the same. how_many = 0 for child in list(elem): if child.tag.lower() in ["ref", "best", "next"] and how_many < n_size: txt = child.text.strip() if not txt in nbest_realizations: nbest_realizations.append(txt) how_many += 1 elif how_many >= n_size: break if len(nbest_realizations) > max_num_refs: max_num_refs = len(nbest_realizations) doc_to_segs[doc].append((seg, nbest_realizations)) output.write("" + \ os.linesep + "" + os.linesep +\ "" + os.linesep) docs = doc_to_segs.keys() docs.sort() for i in range(max_num_refs): output.write("" % (i+1) + os.linesep) for doc in docs: output.write(" " % doc + os.linesep) segs = doc_to_segs[doc] segs.sort(lambda a,b: cmp(int(a[0]), int(b[0]))) for (seg, paraphrases) in segs: this_one = i if i >= len(paraphrases): # there aren't as many paraphrases here as there are in the maximum length ref, so we just re-duplicate # the last one of this ref. 
this_one = len(paraphrases)-1 output.write(" %s " % (int(seg), xml.sax.saxutils.escape(remove_ne(paraphrases[this_one].replace("_"," "))) + os.linesep)) output.write(" " + os.linesep) output.write("" + os.linesep) output.write("") ================================================ FILE: ccgbank/bin/ner/NERApp/src/nerapp/NERApp.java ================================================ package nerapp; import java.io.*; //import edu.stanford.nlp.ie.crf.*; import edu.stanford.nlp.ie.AbstractSequenceClassifier; import edu.stanford.nlp.ie.NERClassifierCombiner; import edu.stanford.nlp.io.IOUtils; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.ling.HasWord; import edu.stanford.nlp.sequences.DocumentReaderAndWriter; import edu.stanford.nlp.sequences.PlainTextDocumentReaderAndWriter; import edu.stanford.nlp.util.CoreMap; import java.io.IOException; import java.util.ArrayList; import java.util.List; /** This tags text using the Stanford NE tagger API. *

* Usage: java -cp "stanford-core-nlp.jar:." NERApp [inputFileName] [outputFileName] [classifierModelFile1] (...[classifierModelFile10]) *

* There are no default arguments. * (Created by modifying Jenny Finkel and Chris Manning's example "NERDemo.java".) *

* @author Dennis N. Mehay */ public class NERApp { @SuppressWarnings("unchecked") public static void main(String[] args) throws IOException { DocumentReaderAndWriter readerAndWriter = new PlainTextDocumentReaderAndWriter(); String usageStr = System.getProperty("line.separator") + "java -cp \"stanford-core-nlp.jar:.\" NERApp [inputFileName] [outputFileName] [classifierModelFile1] (...[classifierModelFile10])"+ System.getProperty("line.separator") + System.getProperty("line.separator") + "(I.e., you can specify between one and ten classifiers whose predictions will be combined.\n"+ "Specify the best model first -- it will have precedence in the model combination.)" + System.getProperty("line.separator"); if (args.length < 3) { System.out.println(usageStr); System.exit(-1); } String[] classifierMods = new String[10]; for(int j = 2; j < args.length; j++) { classifierMods[j-2] = args[j].trim(); } int numClassifiers = 0; AbstractSequenceClassifier classifier = null; //CRFClassifier.getClassifierNoExceptions(serializedClassifier); for(String classMod : classifierMods) { if(classMod != null) { numClassifiers++; } } switch (numClassifiers) { case 1: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0]); break; case 2: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1]); break; case 3: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2]); break; case 4: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2], classifierMods[3]); break; case 5: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2], classifierMods[3], classifierMods[4]); break; case 6: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2], classifierMods[3], classifierMods[4], classifierMods[5]); break; case 7: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2], classifierMods[3], classifierMods[4], classifierMods[5], classifierMods[6]); break; case 8: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2], classifierMods[3], classifierMods[4], classifierMods[5], classifierMods[6], classifierMods[7]); break; case 9: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2], classifierMods[3], classifierMods[4], classifierMods[5], classifierMods[6], classifierMods[7], classifierMods[8]); break; case 10: classifier = new NERClassifierCombiner(true, false, false, classifierMods[0], classifierMods[1], classifierMods[2], classifierMods[3], classifierMods[4], classifierMods[5], classifierMods[6], classifierMods[7], classifierMods[8], classifierMods[9]); break; default: System.out.println(usageStr); System.exit(-1); } Iterable sents = IOUtils.readLines(args[0]); BufferedWriter outf = new BufferedWriter(new FileWriter(new File(args[1]))); for (String sent : sents) { String[] parts = sent.split("\\s+"); List wdList = new ArrayList(parts.length); for(String w : parts) { wdList.add(new MyWord(w)); } List tagging = classifier.classifySentence(wdList); String currNE = null; StringBuilder res = new StringBuilder(); String wd, annot; int cursor = -1; for(CoreMap item : tagging) { cursor += 1; wd = item.get(CoreAnnotations.TextAnnotation.class); annot = 
item.get(CoreAnnotations.AnswerAnnotation.class); if(annot != null && annot.equals("O") || annot.equals("MISC")) { annot = null; } if(currNE != null && !currNE.equals(annot)) { res.append(""); currNE = null; } if(annot != null && !annot.equals(currNE)) { currNE = annot; res.append(" "); res.append("<"); res.append(currNE); res.append(">"); res.append(wd); } else { res.append(" "); res.append(wd); } } if(null != currNE) { res.append(""); } outf.write(res.toString() + System.getProperty("line.separator")); outf.flush(); } outf.close(); } public static String classifyToString(List sentence, DocumentReaderAndWriter readerAndWriter, AbstractSequenceClassifier classif) { PlainTextDocumentReaderAndWriter.OutputStyle outFormat = PlainTextDocumentReaderAndWriter.OutputStyle.fromShortName("inlineXML"); DocumentReaderAndWriter tmp = readerAndWriter; readerAndWriter = new PlainTextDocumentReaderAndWriter(); readerAndWriter.init(classif.flags); StringBuilder sb = new StringBuilder(); sb.append(((PlainTextDocumentReaderAndWriter) readerAndWriter).getAnswers(sentence, outFormat, true)); return sb.toString(); } } class MyWord implements HasWord { private String wd = null; public MyWord(String wd) { this.wd = wd; } public String word() { return wd; } public void setWord(String string) { this.wd = string; } } ================================================ FILE: ccgbank/bin/ner/build-ner-api.properties ================================================ # For compiling against the Stanford API. stanford.core.nlp=../../stanford-nlp/stanford-core-nlp.jar ================================================ FILE: ccgbank/bin/ner/build-ner-api.xml ================================================ ================================================ FILE: ccgbank/bin/ner/ner-tag.sh ================================================ #!/bin/bash # # Tag a file $1 using Stanford NER located in base directory $2 (first arg, e.g., "/home/me/stanford-ner-2.1.0") # with model $3 (second arg, e.g., "all.3class.distsim.crf.ser.gz"). # # Output is placed in the file whose path is given in argument $4. # java -mx700m -cp "$1/stanford-ner.jar:`dirname $0`/NERApp.jar" nerapp.NERApp $2/classifiers/$3 $1 2> /dev/null | python `dirname $0`/post-process-stanford-ner.py > $4 ================================================ FILE: ccgbank/bin/ner/ner_word.py ================================================ class NERWord: """ A simple wrapper for NER-labelled words. """ def __init__(self, wd, label=None, delim="/"): self.wd = wd self.label = label self.delim = delim @staticmethod def parseLineOfWords(ln): """ Parses a line of words labelled with NE labels (e.g., "John Smith entered the United States"). """ res = [] for w in ln: if ""): parts = w.partition("" in wd: parts = wd.partition(">") wd = parts[2] lb = lb[2:-1] elif w.startswith("<") and ">" in w: parts = w.partition(">") (wd,lb) = (parts[2], parts[0]+">") else: (wd,lb) = (w,None) res.append((wd,lb)) # now distribute the labels to words between within the ... labels. final_res = [] i = len(res) - 1 while i >= 0: (wd,lb) = res[i] if lb is None: final_res.append(NERWord(wd,lb)) elif not "US Dept of Defense example ." | python post-process-stanford-ner.py This is a US_Dept_of_Defense_ORGANIZATION example . Type: $ python post-process-stanford-ner.py -h for help on the command-line options. 
""" def fuseNERWords(list_of_ner_words): return "_".join([nerWrd.getWord() for nerWrd in list_of_ner_words] + [list_of_ner_words[0].getLabel()]) op = optparse.OptionParser() op.add_option("--known_verbs", type="string", \ help="file containing known verbs (for split-at-verb-boundary heuristic) [defaults to an empty list of known verbs]",\ default=None) (ops, args) = op.parse_args() known_verbs = set([v.strip() for v in open(ops.known_verbs, "rb").readlines()]) if not ops.known_verbs is None else set() puncts = set([',', "'", '"', ".", "?", "!"]) heuristic_splitters = (known_verbs | puncts) for l in sys.stdin: l = l.strip() if l == "": continue ner_parts = l.split() ner_parts = NERWord.parseLineOfWords(ner_parts) i = 0 current_NE = None current_group = None res = [] while i < len(ner_parts): prt = ner_parts[i] (wd,ne) = (prt.getWord(), prt.getLabel()) if not ne is None: # we have a NE label. is it a continuation of what came before (if anything)? if ne == current_NE: current_group.append(prt) else: if current_NE is None: current_NE = ne current_group = [prt] else: res.append(fuseNERWords(current_group)) current_NE = ne current_group = [prt] else: if not current_NE is None: res.append(fuseNERWords(current_group)) current_NE = None current_group = None res.append(wd) i += 1 if not current_NE is None: res.append(fuseNERWords(current_group)) print " ".join(res) ================================================ FILE: ccgbank/bin/normalize_text.py ================================================ """ Assuming UTF-8 input (defaults to stdin, otherwise, supply a file), normalize plain text in various ways -- e.g., normalize quotation marks. Copyright Dennis N. Mehay (2011) [Insert LGPL here] """ from optparse import OptionParser as OP import codecs, sys, os op = OP() op.add_option("-i", "--input", type="string", help="input stream of text (file, or default=sys.stdin)", default=sys.stdin) op.add_option("-o", "--output", type="string", help="output stream (file, or default=sys.stdout)", default=sys.stdout) (ops,args) = op.parse_args() if not ( ops.input is sys.stdin or ops.input == "-" ): inf = codecs.open(ops.input, "rb", "utf-8") else: streamReader = codecs.lookup("utf-8")[2] inf = streamReader(sys.stdin) if not ( ops.output is sys.stdout or ops.output == "-" ): outf = codecs.open(ops.output, "wb", "utf-8") else: streamWriter = codecs.lookup("utf-8")[-1] outf = streamWriter(sys.stdout) try: l = inf.readline().strip() while l: transformed_line = [] opening_quotes = True for c in l: if c == u'"' and opening_quotes: transformed_line.append(u"``") opening_quotes = False # next double quotes will be closing quotes. elif c == u'"' and not opening_quotes: transformed_line.append(u"''") opening_quotes = True # reset the open-close tracker. else: transformed_line.append(c) outf.write(u"".join(transformed_line) + os.linesep) l = inf.readline().strip() finally: inf.close() outf.close() ================================================ FILE: ccgbank/bin/post-process-metricsmatr.py ================================================ """ Pipe in a MetricsMATR-style mteval script output and, given (as sys.argv[1]) the name of the reference system, produce a file that has lines of the form: ,,,, ... 
,,,, """ import sys refid = sys.argv[1].strip() for l in sys.stdin: # e.g., "multiple_translation_set E09 XIN20020316.0014 1 0.0715856157727753" (setid,sysid,docid,segid,score) = l.strip().split() print ",".join([sysid, refid, docid, segid, score]) ================================================ FILE: ccgbank/bin/prepare-for-stanford-morpha.py ================================================ """ Take in a space-delimited file of ...... and turn it into a form that the Stanford NE recognizer can accept (and from which we can extract all the information we need later). => """ import sys, codecs, os streamWriter = codecs.lookup("utf-8")[-1] sys.stdout = streamWriter(sys.stdout) streamReader = codecs.lookup("utf-8")[2] sys.stdin = streamReader(sys.stdin) for l in sys.stdin: l = l.strip() parts = l.split() if len(parts) > 2: # has NE label. wordform = u"*****".join([parts[0],parts[-1]]).replace("_","^") else: wordform = parts[0] pos = parts[1] joined = u"_".join([wordform, pos]) sys.stdout.write(joined + '\n') ================================================ FILE: ccgbank/bin/reverse-spaces-to-newlines.py ================================================ # # reverses the conversion from spaces to newlines, and newlines to special chars, # from stdin to stdout # import sys, re; [sys.stdout.write(re.sub('','\n',re.sub('\n',' ',line))) for line in sys.stdin] ================================================ FILE: ccgbank/bin/run-all-bleu.sh ================================================ #!/bin/bash # run Bleu/NIST on all systems sys directory (passed in) with all references in the ref directory # (also passed in as an arg). # $1 is the mteval script. # $2 is the location of the system directory (where all the system files are stored) # $3 is the location of the reference directory (where all the reference files are stored). # $4 is the location of the source *file*. # $5 is the location where the scores will go. curr_dir=`dirname $0` for sys in `ls $2/E*` do for ref in `ls $3/*` do echo "Command line: $1 --metricsMATR -t $sys -r $ref -s $4" sys_shortname=`basename ${sys} | sed "s/^\(E[0-9][0-9]\).*/\1/g"` ref_shortname=`basename ${ref} | sed "s/^\(E[0-9][0-9]\).*/\1/g"` $1 --metricsMATR -t $sys -r $ref -s $4 cat BLEU-seg.scr | python $curr_dir/post-process-metricsmatr.py ${ref_shortname} > $5/BLEU.${sys_shortname}-${ref_shortname}.scr cat NIST-seg.scr | python $curr_dir/post-process-metricsmatr.py ${ref_shortname} > $5/NIST.${sys_shortname}-${ref_shortname}.scr rm BLEU-*; rm NIST-*; done done ================================================ FILE: ccgbank/bin/stem_nns_vbx ================================================ #/bin/bash # # produces a file .dir/morph from .dir/pairs # that lists the words, pos tags and, for plural nouns and verbs, # also stems, derived using morpha # # write plural nouns and verbs to .dir/nns-vbx cat $1.dir/pairs | sort | uniq | grep -E "NNS|VB." > $1.dir/nns-vbx # get rest cat $1.dir/pairs | sort | uniq | grep -E -v "NNS|VB." | grep -E -v "s>" > $1.dir/non-nns-vbx # get stems cat $1.dir/nns-vbx | tr [:blank:] _ | morpha > $1.dir/nns-vbx.stems # merge stems paste $1.dir/nns-vbx $1.dir/nns-vbx.stems > $1.dir/nns-vbx-stems # merge files cat $1.dir/non-nns-vbx $1.dir/nns-vbx-stems | sort | uniq > $1.dir/morph # cleanup rm -f $1.dir/nns-vbx* $1.dir/non-nns-vbx ================================================ FILE: ccgbank/bin/toUTF-8.py ================================================ """ Copryright (c) 2011 Dennis N. Mehay Assumes that 'chardet' is installed. 
Re-encodes most known Unicode encodings as UTF-8. (Provided that there is enough text for chardet to correctly determine the encoding of the input file.) If any file exists by the output file name, it will be overwritten. [Insert LGPL here] """ import sys, codecs, optparse try: import chardet except ImportError, ie: print >> sys.stderr, "'chardet' must be installed for this script to work. Exiting..." sys.exit(-1) op = optparse.OptionParser() op.add_option("-i", "--inputf", type="string", help="input file [required]", default=None) op.add_option("-o", "--outputf", type="string", help="output file [required, will be overwritten]", default=None) (ops,args) = op.parse_args() try: assert(not (ops.inputf is None or ops.outputf is None)) except AssertionError, ae: print >> sys.stderr, "provide input and output files (type: 'python toUTF-8.py -h' for help)" sys.exit(-1) # get input file's content and convert to utf-8 inf = open(ops.inputf, "rb") input = inf.read() outf = None try: encoding = chardet.detect(input).get('encoding') input = input.decode(encoding) outf = codecs.open(ops.outputf, "wb", "utf-8") outf.write(input) except Exception, e: print >> sys.stderr, "Something went wrong. Perhaps your input format is too obscure" finally: outf.close() inf.close() ================================================ FILE: ccgbank/bin/write_morph.py ================================================ #!/usr/bin/env python """ Takes a file of word-tag pairs or word-tag-stem triples and writes an xml morph file. (c) 2010 Michael White (modifed by D.N. Mehay 2011) [insert LGPL here] """ import sys from optparse import OptionParser as OP from xml.sax import saxutils global sem_classes sem_classes = set(["PERSON", "ORGANIZATION", "LOCATION", "MONEY", "PERCENT", "TIME", "DATE"]) pr = OP() pr.add_option("-i","--input",type="string",help="input source [default=]",\ default=sys.stdin) pr.add_option("-o","--output",type="string",help="output location [default=]",\ default=sys.stdout) (opts,args) = pr.parse_args(sys.argv) inf = opts.input if not inf is sys.stdin: inf = open(inf,'r') outf = opts.output if not outf is sys.stdout: outf = open(outf,'w') try: print >> outf, '' for l in inf: l = l.strip() l = l.split() s = ['= 3: if l[2] in sem_classes: s.append(' class="') s.append(saxutils.escape(l[2])) s.append('"') else: s.append(' stem="') s.append(saxutils.escape(l[2])) s.append('"') if len(l) >= 4: s.append(' stem="') s.append(saxutils.escape(l[3])) s.append('"') finally: if not inf is sys.stdin: inf.close() if not outf is sys.stdout: outf.close() ================================================ FILE: ccgbank/build-ht.properties ================================================ # nb: info.dir is specified in build.properties preds.train=${info.dir}/preds-train preds.dev=${info.dir}/preds-00-all ht.factors.train=${info.dir}/ht.factors.train ht.factors.dev=${info.dir}/ht.factors.dev grammar.dir=${extract.dir} ht.corpus.dir=${extract.dir}/test ht.feats.dir=${feats.dir}/hypertagger ht.models.dir=${models.dir}/hypertagger pos.lm=${ht.models.dir}/pos.lm st.lm=${ht.models.dir}/st.lm vocab.pos=${ht.models.dir}/vocab.pos vocab.st=${ht.models.dir}/vocab.st vocab.train=${ht.models.dir}/vocab.train posprior.lm=${ht.models.dir}/p_w0.lm pos.feats=${ht.feats.dir}/pos.feats pos.dev.feats=${ht.feats.dir}/pos.feats.00 pos.mod=${ht.models.dir}/pos.mod htprior.lm=${ht.models.dir}/t_p0w0.lm pos.dict=${ht.models.dir}/pos.dict.min10 word.dict=${ht.models.dir}/word.dict.min10 ht.feats=${ht.feats.dir}/ht.feats 
ht2.feats=${ht.feats.dir}/ht2.feats ht.dev.feats=${ht.feats.dir}/ht.dev.feats ht.dev.log=${log.dir}/ht.dev.log ht.mod=${ht.models.dir}/ht.mod ht2.mod=${ht.models.dir}/ht2.mod argnames=Arg0:A0 Arg1:A1 Arg1a:A1a Arg1b:A1b Arg2:A2 Arg2a:A2a Arg2b:A2b Arg3:A3 Arg4:A4 Arg5:A5 ================================================ FILE: ccgbank/build-ht.xml ================================================ ================================================ FILE: ccgbank/build-models.properties ================================================ # nb: info.dir is specified in build.properties factors.train=${info.dir}/factors-train factors.dev=${info.dir}/factors-00-all text.train=${info.dir}/text-train text.sc.train=${info.dir}/textsc-train text.dev=${info.dir}/text-00-all catfreq.cutoff=10 feats.dir=./feats models.dir=./models plugins.dir=./plugins original.postagger.models.dir=./original/models/postagger novel.dir=${data.dir}/novel novel.file=${novel.dir}/two-sents ================================================ FILE: ccgbank/build-models.xml ================================================ Invoking supertagger all Done Invoking parser all Done Invoking hypertagger all Done Invoking realizer all Done ================================================ FILE: ccgbank/build-original.properties ================================================ # nb: original.ccgbank.dir and data.dir are specified in build.properties original.dir=./original corpus.dir=${original.dir}/corpus feats.dir=${original.dir}/feats models.dir=${original.dir}/models original.log.dir=${original.dir}/logs train.leaves.srilm=${corpus.dir}/train.leaves.srilm dev.leaves.srilm=${corpus.dir}/dev.leaves.srilm postagger.feats.dir=${feats.dir}/postagger postagger.models.dir=${models.dir}/postagger supertagger.feats.dir=${feats.dir}/supertagger supertagger.models.dir=${models.dir}/supertagger train.pos.seq=${postagger.feats.dir}/train.pos.seq pos.lm=${postagger.models.dir}/pos.lm train.st.seq=${supertagger.feats.dir}/train.st.seq st.lm=${supertagger.models.dir}/st.lm vocab.pos=${postagger.models.dir}/vocab.pos vocab.st=${supertagger.models.dir}/vocab.st vocab.train=${supertagger.models.dir}/vocab.train posprior.lm=${postagger.models.dir}/p_w0.lm pos.feats=${postagger.feats.dir}/pos.feats pos.mod=${postagger.models.dir}/pos.mod stprior.lm=${supertagger.models.dir}/t_p0w0.lm pos.dict=${supertagger.models.dir}/pos.dict.min10 word.dict=${supertagger.models.dir}/word.dict.min10 st.feats=${supertagger.feats.dir}/st.feats st.mod=${supertagger.models.dir}/st.mod novel.dir=${data.dir}/novel novel.file=${novel.dir}/two-sents ================================================ FILE: ccgbank/build-original.xml ================================================ ================================================ FILE: ccgbank/build-ps.properties ================================================ # nb: feats.dir and models.dir are specified in build-models.properties parser.feats.dir=${feats.dir}/parser parser.models.dir=${models.dir}/parser deriv.factors.train=${parser.feats.dir}/deriv.factors.train vocab.parser.train=${parser.models.dir}/vocab.parser.train nbest.list.size=5 original.postagger.models.dir=./original/models/postagger # what percentage of first 10 words being cased makes the heuristic say a sentence is in # title-case? 
(change if desired) titlecase.threshold=0.5 # you will need to download and extract ccgbank-data.tgz or english-models.tgz to get the truecase list truecase.list=./aux/aux-files/truecase-list.gz # we assume you have the stanford core nlp jar file and NE tagging models # see docs/ccgbank-README for installation instructions # you can also modify the properties below to point to the jar and model files elsewhere stanford.core.nlp.dir=./stanford-nlp stanford.core.nlp.jar=${stanford.core.nlp.dir}/stanford-core-nlp.jar # by default, we assume you have placed the Stanford core NLP jar file in ./stanford-nlp, and that # there are the following models in a subdirectory there called 'classifiers' # (you can get Stanford NE recognition models from: http://nlp.stanford.edu/software/CRF-NER.shtml # or from the core NLP download, per docs/ccgbank-README) ner.model1=${stanford.core.nlp.dir}/classifiers/english.all.3class.distsim.crf.ser.gz ner.model2=${stanford.core.nlp.dir}/classifiers/english.muc.7class.distsim.crf.ser.gz ner.model3=${stanford.core.nlp.dir}/classifiers/english.conll.4class.distsim.crf.ser.gz # if you're using an older version of the Stanford NLP tools, it may # be convenient to comment in the older names below #ner.model1=${stanford.core.nlp.dir}/classifiers/all.3class.distsim.crf.ser.gz #ner.model2=${stanford.core.nlp.dir}/classifiers/muc.distsim.crf.ser.gz #ner.model3=${stanford.core.nlp.dir}/classifiers/conll.distsim.crf.ser.gz ================================================ FILE: ccgbank/build-ps.xml ================================================ Extracting derivation factors for section @{sect} Concatenating training derivation factors to ${deriv.factors.train} Copying train grammar with cutoffs to ${extract.dir} Copying dev morph to ${extract.dir} Loading parse.prefs Parsing dev section to ${log.dir}/parse.dev.log Concatenating training derivation factors excluding sect @{sect} to ${deriv.factors.train}.excl@{sect} Copying flm files to ${parser.models.dir}/excl@{sect} Generating parser training events for sect @{sect} Copying train grammar with to ${extract.dir} Loading gen-events.prefs Generating events to ${parser.feats.dir}/events-@{sect}.gz Concatenating event files to ${parser.feats.dir}/events-train.gz Concatenating ${parser.feats.dir}/events-@{sect}.gz Calculating feature alphabet as ${parser.feats.dir}/alph.gz Training perceptron model to ${parser.models.dir}/model.gz Loading parse.prefs Parsing dev section to ${log.dir}/parse.perceptron.dev.log Copying training grammar with cutoffs and other miscellanea to ${novel.file}.dir/extract/ Merging training morph (with cutoffs) and novel morph to ${novel.file}.dir/extract/morph.xml Loading parse.prefs Parsing ${novel.file}.dir/nertext-nolabs to ${novel.file}.dir/tb.xml Loading parse.prefs Parsing ${novel.file}.dir/nertext-nolabs to ${novel.file}.dir/tb.xml ================================================ FILE: ccgbank/build-release.xml ================================================ ================================================ FILE: ccgbank/build-rz.properties ================================================ # nb: feats.dir and models.dir is specified in build-models.properties realizer.feats.dir=${feats.dir}/realizer realizer.models.dir=${models.dir}/realizer realizer.model.global=model.global realizer.alph.init=alph.init realizer.alph.name=alph.gz realizer.model.name=model.gz big.words.lm=${realizer.models.dir}/gigaword4.5g.kenlm.bin words.lm=${realizer.models.dir}/train.3bo 
words.sc.lm=${realizer.models.dir}/train-sc.3bo stpos.flm=${realizer.models.dir}/stp3.flm # nb: vocab.train is copied from build-st.properties (which is otherwise not needed) supertagger.models.dir=${models.dir}/supertagger vocab.train=${supertagger.models.dir}/vocab.train # todo: import this property instead hypertagger.models.dir=${models.dir}/hypertagger # nb: parser.models.dir is copied from build-ps.properties (which is otherwise not needed) parser.models.dir=${models.dir}/parser ================================================ FILE: ccgbank/build-rz.xml ================================================ Copying train grammar to ${extract.dir} Copying dev morph to ${extract.dir} Copying test morph to ${extract.dir} Loading gen-events.prefs Loading rz-test.prefs Concatenating training text excluding sect @{sect} to ${text.train}.excl@{sect} Concatenating sem class replaced training text excluding sect @{sect} to ${text.sc.train}.excl@{sect} Building trigram language model to ${realizer.models.dir}/excl/train.excl@{sect}.3bo Building trigram language model to ${realizer.models.dir}/excl/train-sc.excl@{sect}.3bo Deleting concatenated text files Invoking parser target for generative models excluding each section Done Generating realizer training events for sect @{sect} Copying train grammar to ${extract.dir} Generating events to ${realizer.feats.dir}/events-@{sect}.gz Concatenating event files to ${realizer.feats.dir}/events-train.gz Concatenating ${realizer.feats.dir}/events-@{sect}.gz Calculating feature alphabet as ${realizer.feats.dir}/alph.gz Training perceptron model with global feats to ${realizer.models.dir}/${realizer.model.global} Training perceptron model to ${realizer.models.dir}/${realizer.model.name} from ${realizer.feats.dir}/${realizer.alph.name} Realizing dev section to ${log.dir}/realize.dev.log Realizing dev section to ${log.dir}/realize.perceptron.dev.log Realizing test section to ${log.dir}/realize.perceptron.test.log Realizing ${novel.file}.dir/tb.xml to ${novel.file}.dir/realize.nbest ================================================ FILE: ccgbank/build-st.properties ================================================ # nb: feats.dir and models.dir are specified in build-models.properties supertagger.feats.dir=${feats.dir}/supertagger supertagger.models.dir=${models.dir}/supertagger train.pos.seq=${supertagger.feats.dir}/train.pos.seq pos.lm=${supertagger.models.dir}/pos.lm train.st.seq=${supertagger.feats.dir}/train.st.seq st.lm=${supertagger.models.dir}/st.lm vocab.pos=${supertagger.models.dir}/vocab.pos vocab.st=${supertagger.models.dir}/vocab.st vocab.train=${supertagger.models.dir}/vocab.train posprior.lm=${supertagger.models.dir}/p_w0.lm pos.feats=${supertagger.feats.dir}/pos.feats pos.mod=${supertagger.models.dir}/pos.mod pos.dict=${supertagger.models.dir}/pos.dict.min10 word.dict=${supertagger.models.dir}/word.dict.min10 st.feats=${supertagger.feats.dir}/st.feats st.mod=${supertagger.models.dir}/st.mod ================================================ FILE: ccgbank/build-st.xml ================================================ ================================================ FILE: ccgbank/build.properties ================================================ data.dir=./data templates.dir=./templates convert.dir=./convert extract.dir=./extract info.dir=${extract.dir}/info test.dir=${extract.dir}/test grams.dir=${extract.dir}/grammars log.dir=./logs tmp.dir=./tmp training.dirs=02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21 # nb: the "propccgbank" and aux files 
should be unpacked from # the ccgbank-data.tgz (see docs/ccgbank-README for details); # the original ccgbank dir property below can be changed to # point to your copy of the CCGBank, or you can just create a # symbolic link from ./ccgbank1.1 to your copy of it ccgbank.dir=./propccgbank/pp_head_verbsAnnotated/AUTO original.ccgbank.dir=./ccgbank1.1/data/AUTO #sect=* sect=00 #file=* file=wsj_0001 aux.top.dir=./aux aux.dir=${aux.top.dir}/aux-files words=${data.dir}/wsj-nns-vb stems=${data.dir}/wsj-nns-vb-stems ================================================ FILE: ccgbank/build.xml ================================================ Extracting grammar with no cutoffs and making LFs Section: ${sect} File: ${file} Concatenating @{type}-train Starting extract task for section @{sect} Concatenating training section files in ${info.dir} Extracting grammar with no cutoffs and making LFs with debug derivs true Section: ${sect} File: ${file} ================================================ FILE: ccgbank/data/README ================================================ The files wsj-nns-vb and wsj-nns-vb-stems contain the plural nouns & verbs and their stems, respectively, from the WSJ part of the Penn Treebank, as computed by the morpha tool. Sample commands for creating these files (with directories on the OSU Linguistics system) are in get_wsj_nns_vb and stem_wsj_nns_vb. There are also some made-up sentences for testing the parser and realizer in novel/two-sents, and an example CCGbank file with Propbank roles and corrections in the sample dir. ================================================ FILE: ccgbank/data/get_wsj_nns_vb ================================================ cat /home/corpora/EN/penn_treebank_3/ims-cwb/wsj/wsj.cqp | egrep "NNS|VB" | sort | uniq > wsj-nns-vb ================================================ FILE: ccgbank/data/novel/two-sents ================================================ Google announced today that it would offer free texting on its Google Voice app for the iPhone. The press release was greeted rapturously. 
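A hedged sketch (not a file in the repository): one way to pair the line-parallel wsj-nns-vb and wsj-nns-vb-stems files described in data/README above into a (word, tag) -> stem lookup. The helper name, default paths, and the assumption that morpha may leave a trailing _TAG on each stem are illustrative only.

def load_stem_table(words_path="wsj-nns-vb", stems_path="wsj-nns-vb-stems"):
    # Assumed layout: wsj-nns-vb holds "word TAG" per line and wsj-nns-vb-stems
    # holds the corresponding morpha output on the same line number
    # (per data/get_wsj_nns_vb and data/stem_wsj_nns_vb).
    table = {}
    with open(words_path) as wf, open(stems_path) as sf:
        for word_line, stem_line in zip(wf, sf):
            parts = word_line.split()              # e.g. "abilities NNS"
            if len(parts) < 2 or not stem_line.strip():
                continue
            word, tag = parts[0], parts[1]
            stem = stem_line.split()[0].split("_")[0]   # drop any trailing _TAG
            table[(word, tag)] = stem
    return table

# e.g. load_stem_table().get(("abilities", "NNS")) should yield the morpha stem,
# presumably "ability".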
================================================ FILE: ccgbank/data/sample/AUTO/00/wsj_0001.auto ================================================ ID=wsj_0001.1 PARSER=GOLD NUMPARSE=1 ( ( ( ( ( ( ( () () ) ) () ) ( ( ( ( () () ) ) () ) ) ) () ) ( () ( ( ( () ( () () ) ) ( () ( () ( () () ) ) ) ) ( () () ) ) ) ) () ) ID=wsj_0001.2 PARSER=GOLD NUMPARSE=1 ( ( ( ( () () ) ) ( () ( ( () ) ( () ( ( ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) () ) ================================================ FILE: ccgbank/data/stem_wsj_nns_vb ================================================ cat wsj-nns-vb | tr [:blank:] _ | ./morpha.ix86_linux > wsj-nns-vb-stems ================================================ FILE: ccgbank/data/wsj-nns-vb ================================================ 12-inches NNS 15 VBN 16-year-olds NNS 1850s NNS 1890s NNS 18-year-olds NNS 1900s NNS 1920s NNS 1930s NNS 1940s NNS 1950s NNS 1960s NNS 1970s NNS 1980s NNS 1990s NNS 19-year-olds NNS 204s NNS 20s NNS 20-stocks NNS 3090s NNS '30s NNS 30s NNS 323s NNS '40s NNS '50s NNS '60s NNS '70s NNS 747-400s NNS 757-200s NNS 75-cents-an-hour NNS '80s NNS 80s NNS 8300s NNS '90s NNS A330-300s NNS abacuses NNS abandoned VBD abandoned VBN abandoning VBG Abandoning VBG abandons VBZ abandon VB abandon VBP abated VBD abated VBN abates VBZ abate VB abating VBG abdicate VBP abetted VBN abetting VBG abide VB abilities NNS abolished VBD abolished VBN abolishing VBG abolish VB aborted VBN abortion-rights NNS Abortion-rights NNS abortions NNS abounded VBD abounding VBG abounds VBZ abound VBP abrasives NNS abridging VBG absences NNS absent VB Absent VB absolve VBP absolving VBG absorbed VBD absorbed VBN Absorbed VBN absorbers NNS absorbing VBG absorbs VBZ absorb VB absorb VBP abstained VBD abstentions NNS abstracts NNS abused VBD abused VBN abuses NNS abuse VB abusing VBG academics NNS acceded VBD accede VB accelerated VBD accelerated VBN accelerates VBZ accelerate VB accelerate VBP accelerating VBG accents NNS acceptances NNS ACCEPTANCES NNS accepted VBD accepted VBN accepting VBG accepts VBZ accept VB accept VBP accessories NNS Accessories NNS access VB accidents NNS acclaim VB accolades NNS accommodated VBN accommodate VB accommodate VBP accommodating VBG accommodations NNS accomodate VB accompanied VBD accompanied VBN accompanies VBZ accompanying VBG accompany VB accomplished VBN accomplishes VBZ Accomplishing VBG accomplishments NNS accomplish VB accorded VBD accorded VBN according VBG According VBG accords NNS accord VB accountants NNS accounted VBD accounted VBN accounting VBG accounts NNS Accounts NNS accounts VBZ account VB account VBP accreted VBN accruals NNS accrued VBN Accrued VBN accrues VBZ accrue VB accruing VBG accumulated VBN accumulate VBP accumulating VBG accusations NNS accused VBD accused VBN accusers NNS accuses VBZ accuse VB accuse VBP accusing VBG accustomed VBN aces VBZ achieved VBD achieved VBN achievements NNS achieves VBZ achieve VB achieve VBP achieving VBG aching VBG acidified VBN acids NNS acknowledged VBD acknowledged VBN acknowledges VBZ acknowledge VB acknowledge VBP acknowledging VBG acorns NNS acquainted VBN acquiesced VBD acquiesce VB acquired VBD acquired VBN Acquired VBN acquirers NNS Acquirers NNS acquires VBZ acquire VB acquiring VBG acquisitions NNS Acquisitions NNS acquitted VBN acquit VB acres NNS across-the-board-cuts NNS acted VBD acted VBN acting VBG Acting VBG actions NNS activated VBN activate VBP actives NNS activists NNS Activists NNS activities NNS actors NNS acts NNS acts VBZ actuaries NNS act 
VB act VBP adapted VBD adapted VBN Adapted VBN adapting VBG adapt VB adapt VBP added VBD Added VBD added VBN Added VBN addicted VBN addicts NNS adding VBG Adding VBG additions NNS additives NNS addressed VBD addressed VBN addresses NNS addresses VBZ addressing VBG Addressing VBG address VB address VBP adds VBZ Adds VBZ add VB Add VB add VBP adepts NNS adhered VBD adhere VB adhesives NNS adjourned VBN adjudicators NNS adjusted VBD adjusted VBN adjusters NNS Adjusters NNS adjusting VBG adjustments NNS adjusts VBZ adjust VB Adjust VB adjust VBP admen NNS administered VBN administers VBZ administer VB administrations NNS administrators NNS admired VBD admired VBN Admirers NNS admires VBZ admissions NNS admits NNS admits VBZ admitted VBD ADMITTED VBD admitted VBN admitting VBG admit VB admit VBP admonishing VBG adolescents NNS adopted VBD ADOPTED VBD adopted VBN adopting VBG Adopting VBG adopts VBZ adopt VB adopt VBP adorned VBD adorned VBN adorn VB ADRs NNS ads NNS Ads NNS ADS NNS adults NNS Adults NNS advanced-ceramics NNS advanced VBD advanced VBN advancements NNS advancers NNS Advancers NNS advances NNS Advances NNS advance VB advance VBP advancing VBG Advancing VBG advantages NNS advantage VB adventures NNS adversaries NNS advertised VBD advertised VBN advertisements NNS advertisers NNS Advertisers NNS advertises VBZ advertise VB advertise VBP advertising VBG Advertising VBG advertising VBG|NN advised VBD advised VBN advisers NNS advises VBZ Advises VBZ advise VB advise VBP advising VBG advisories NNS advocated VBD advocated VBN advocates NNS Advocates NNS advocates VBZ advocate VB advocate VBP advocating VBG aerobics NNS affairs NNS affected JJ|VBN affected VBD affected VBN affecting VBG affections NNS affects VBZ affect VB affect VBP affidavits NNS affiliated VBN Affiliated VBN affiliates NNS affiliating VBG affinities NNS affirmed VBD affirming VBG afflicted VBN afflicts VBZ afflict VB afforded VBN afford VB afford VBP aftereffects NNS Aftereffects NNS after-hours NNS afternoons NNS aftershocks NNS Aftershocks NNS AFTERSHOCKS NNS aged VBN agencies NNS agendas NNS agents NNS Agents NNS ages NNS age VBP aggravated VBD aggravated VBN aggravates VBZ aggravate VB aggravate VBP aggravating VBG aggregates NNS aging VBG agitated VBN agonize VB agreed VBD Agreed VBD agreed VBN agreeing VBG agreements NNS agrees VBZ AGREES VBZ agree VB agree VBP agriproducts NNS Aichi NNS aided VBD aided VBN Aided VBN aides NNS Aides NNS aiding VBG aids NNS aids VBZ aid VB ailing VBG ailments NNS ails NNS aimed VBD aimed VBN Aimed VBN aiming VBG aims NNS aims VBZ Aims VBZ aim VB Aim VB aim VBP aircraft NNS aired VBD aired VBN airfields NNS airing VBG airlifted VBN airlifting VBG airliners NNS airlines NNS Airlines NNS airmen NNS airplanes NNS airports NNS airs NNS airs VBZ air VB air VBP airwaves NNS aisles NNS ai VBP ai VBZ alarmed VBN Alarmed VBN alarming VBG alarms NNS alarms VBZ Albanians NNS albums NNS alchemists NNS alcoholics NNS alerted VBD alerting VBG alerts VBZ alert VB alienated VBN alienates VBZ alienate VB alienating VBG aliens NNS aligned VBN align VBP allayed VBN allay VB allegations NNS alleged VBD alleged VBN alleges NNS alleges VBZ allege VB allege VBP alleging VBG allergies NNS alleviate VB alleviating VBG alleys NNS alliances NNS allied VBD allies NNS Allies NNS alligators NNS allocated VBD allocated VBN allocate VB allocate VBP allocating VBG allocations NNS allotments NNS allowances NNS allowed VB allowed VBD allowed VBN allowing VBG Allowing VBG allows VBZ allow VB Allow VB allow VBP 
alloys NNS allrightniks NNS alluded VBD allusions NNS allying VBG ally VB altered VBD altered VBN altering VBG alternates NNS alternates VBZ alternating VBG alternatives NNS alter VB Alter VB alter VBP aluminum-makers NNS alumni NNS amahs NNS amalgamate VB amalgamations NNS amassed VBD amassed VBN amasses VBZ amass VB amass VBP amateurs NNS amazed VBN amaze VB ambassadors NNS ambiguities NNS ambitions NNS amble VB ambushed VBD amended VBD amended VBN amending VBG Amending VBG amendments NNS amend VB amenities NNS Amenities NNS A-men NNS Americana NNS Americanized VBD Americans NNS Americans VBP amortize VB amounted VBD amounted VBN amounting VBG amounts NNS amounts VBZ amount VB amount VBP amphobiles NNS amplified VBN amplifiers NNS amplifying VBG amplify VB amps NNS amused VBN amusements NNS amuse VBP am VBP Am VBP AM VBP anachronisms NNS analyses NNS Analyses NNS analysts NNS Analysts NNS analyzed VBD analyzed VBN analyze VB analyze VBP analyzing VBG anchored VBN anchors NNS anchor VB anchor VBP and VBP anemias NNS anemics NNS angels NNS angered VBD angered VBN angering VBG angles NNS animals NNS Animals NNS animated VBN animosities NNS ankles NNS Anne VB anniversaries NNS announced VBD ANNOUNCED VBD announced VBN announcements NNS announces VBZ announce VB announce VBP announcing VBG annoyed VBD annoyed VBN annualized VBN Annualized VBN annuities NNS Annuities NNS ANNUITIES NNS Anointing VBG anomalies NNS answered VBD answered VBN answering VBG answers NNS Answers NNS answers VBZ answer VB answer VBP antagonists NNS antagonize VB anteaters NNS antecedents NNS antelope NNS anthers NNS anti-abortionists NNS antibodies NNS anticipated VBD anticipated VBN anticipates VBZ anticipate VB anticipate VBP anticipating VBG Anticipating VBG antics NNS anti-heroes NNS anti-infectives NNS antipathies NNS anti-programmers NNS antiquities NNS ants NNS anxieties NNS apartments NNS apologies NNS apologists NNS apologized VBD apologizes VBZ apologize VB apologizing VBG appalled VBN apparitions NNS appealed VBD appealed VBN appealing VBG Appealing VBG appeals NNS appeal VB appeal VBP appearances NNS appeared VBD appeared VBN appearing VBG appears NNS appears VBZ APPEARS VBZ appear VB appear VBP appeased VBD appease VB appendages NNS appended VBN append VB applauded VBD applauding VBG applauds VBZ applaud VB applaud VBP apples NNS appliances NNS applicants NNS applications NNS Applications NNS applied VBD applied VBN Applied VBN applies VBZ applying VBG Applying VBG apply VB apply VBP appointed VBD appointed VBN Appointed VBN appointees NNS appointments NNS appoint VB appraisals NNS appraised VBN appraisers NNS appraise VB appreciated VBD appreciated VBN appreciates VBZ appreciate VB appreciate VBP appreciating VBG apprehensions NNS apprised VBN approached VBD approached VBN approaches NNS approaches VBZ approaching VBG approach VB approach VBP appropriated VBD appropriated VBN appropriations NNS Appropriations NNS appropriators NNS approvals NNS approved VBD APPROVED VBD approved VBN approves VBZ approve VB approve VBP approving VBG approximates VBZ Arabs NNS arbitragers NNS Arbitragers NNS arbitrageurs NNS arbitraging VBG Arbitraging VBG arbitrates VBZ arbitrating VBG arborists NNS arbs NNS Arbs NNS arcades NNS arched VBD architects NNS Architects NNS arch VBP areas NNS Areas NNS arenas NNS are VBP Are VBP ARE VBP argued VBD argued VBN argues VBZ argue VB argue VBP arguing VBG Arguing VBG arguments NNS arisen VBN arises VBZ arise VB arise VBP ARISE VBP arising VBG Arkansas NNS armadillos NNS armed VBN 
Armed VBN armies NNS armpits NNS arms NNS Arms NNS arm VB aromas NNS arose VBD aroused VBD aroused VBN arouses VBZ arouse VB arousing VBG arpeggios NNS Arraignments NNS arranged VBD arranged VBN arrangements NNS Arrangements NNS arranges VBZ arrange VB arranging VBG arrears NNS arrested VBD arrested VBN Arrested VBN arrests NNS arrest VB arrivals NNS arrived VBD ARRIVED VBD arrived VBN arrives VBZ arrive VB arrive VBP arriving VBG arrows NNS arsenals NNS arteries NNS articles NNS articulate VB artifacts NNS artillerists NNS artists NNS Artists NNS arts NNS Arts NNS artworks NNS Asahi NNS ascending VBG ascertain VB ascribed VBN ascribe VBP ashes NNS Ashland VBP ashtrays NNS asked VBD Asked VBD asked VBN Asked VBN asking VBG asks VBZ ask VB Ask VB ask VBP aspects NNS aspens NNS aspersions NNS aspirations NNS aspired VBD aspires VBZ aspire VBP aspiring VBG assailed VBD assailed VBN assassinated VBD assassinated VBN assassinate VB assassinating VBG assassinations NNS assaults NNS assault VBP assemblages NNS assembled VBD assembled VBN assemble VB assemble VBP assemblies NNS assembling VBG asserted VBD asserted VBN asserting VBG assertions NNS asserts VBZ assert VB assert VBP asses NNS assessed VBD assessed VBN assessing VBG assessments NNS assess VB assess VBP assets NNS Assets NNS ASSETS NNS assigned VBN assignments NNS assigns VBZ assign VB assign VBP assimilate VB assistants NNS assisted VBN assisting VBG assists VBZ assist VB assist VBP associated VBN associates NNS Associates NNS associate VBP associating VBG associations NNS assuage VB assumed VBD assumed VBN assumes VBZ assume VB Assume VB assume VBP assuming VBG Assuming VBG assumptions NNS assurances NNS assured VBD assured VBN assures VBZ assure VB assuring VBG asteroids NNS astounds VBZ astronauts NNS ate VBD Ate VBD athletes NNS athletics NNS Atlantis NNS atolls NNS atoms NNS Atone VB atrocities NNS attached VBD attached VBN attaches VBZ attaching VBG attach VB attach VBP attacked VBD attacked VBN attackers NNS attacking VBG attacks NNS Attacks NNS attacks VBZ attack VB attack VBP attained VBN attain VB attarcks NNS attempted VBD attempted VBN attempting VBG attempts NNS Attempts NNS attempts VBZ attempt VB attempt VBP attendants NNS Attendants NNS attended VBD attended VBN attendees NNS attending VBG attends VBZ attend VB attend VBP Attention VB attests VBZ attest VB attics NNS attitudes NNS Attitudes NNS attorneys NNS Attorneys NNS attracted VBD attracted VBN Attracted VBN attracting VBG attractions NNS attracts VBZ attract VB attract VBP attributed VBD attributed VBN attributes NNS attributes VBZ attribute VB attribute VBP attributing VBG auctioned VBD auctioned VBN auctions NNS Auctions NNS auction VB audiences NNS audiocassettes NNS audiophiles NNS audited VBD audited VBN auditing VBG audition VB auditors NNS Auditors NNS audits NNS AUDITS NNS audit VB augment VB auspices NNS authored VBN authorities NNS Authorities NNS authorizations NNS authorized VBD authorized VBN authorizes VBZ authorize VB authorizing VBG authors NNS autions NNS auto-emissions NNS autographed VBN autographs NNS autograph VB auto\/homeowners NNS automakers NNS automated VBN Automated VBN automates VBZ automating VBG automobiles NNS Automobiles NNS auto-sales NNS autos NNS autumns NNS avenge VB avenues NNS averaged VBD averaged VBN averages NNS averages VBZ average VB average VBP averaging VBG averred VBD averted VBN averting VBG averts VBZ avert VB aviators NNS avoided VBD AVOIDED VBD avoided VBN avoiding VBG Avoiding VBG avoids VBZ avoid VB avoid VBP 
awaited VBD awaiting VBG awaits VBZ await VB await VBP awakened VBN awarded VBD awarded VBN awarding VBG awards NNS awards VBZ award VB awoke VBD axioms NNS axles NNS B-2s NNS Babelists NNS babies NNS Babies NNS backdated VBD back-dating VBG backed VBD BACKED VBD backed VBN backers NNS backfired VBD backfired VBN backfires VBZ backfire VB backfire VBP backflips VBZ backgrounds NNS backing VBG backlogs NNS backlots NNS backpackers NNS backpacks NNS backpedaling VBG backslapping VBG backs NNS backs VBZ Backs VBZ back-ups NNS back VB back VBP bacteria NNS badges NNS bags NNS bailed VBD Bailiffs NNS bailing VBG bailouts NNS bail VB bakeries NNS bakers NNS balanced VBN balance-of-payments NNS balances NNS BALANCES NNS balances VBZ balance VB balancing VBG balconies NNS bales NNS balked VBD balking VBG BALKS VBZ balk VB balk VBP ballerinas NNS ballets NNS ballooned VBD ballooned VBN ballooning VBG balloonists NNS balloons NNS balloons VBZ balloon VB ballots NNS BALLOTS NNS ballparks NNS ballplayers NNS balls NNS ballyhooed VBN bamboozled VBN bananas NNS bandages NNS banded VBN bandied VBN bands NNS band VB bangs VBZ bang VB banished VBN banish VB bankers NNS Bankers NNS BANKERS NNS banking VBG bankrolling VBG bankroll VB bankroll VBP bankruptcies NNS bankrupt VB bankrupt VBP banks NNS Banks NNS bank VB Bank VB bank VBP banned VBD banned VBN banners NNS banning VBG banshees NNS bans NNS bans VBZ ban VB ban VBP barbed VBN barbers NNS barbs NNS bargained VBD bargained VBN bargain-hunters NNS bargain-hunt VB bargaining VBG bargains NNS bargain VB barges NNS barges VBZ baring VBG barking VBG barnacles NNS barns NNS barons NNS barred VBD barred VBN Barred VBN barreling VBG barrels NNS Barrels NNS barricades NNS barriers NNS barring VBG Barring VBG bars NNS bars VBZ bartenders NNS bartered VBN bar VB Bar VB baseballs NNS based VBD based VBN Based VBN basements NNS bases NNS bases VBZ base VB base VBP bashing VBG Bashing VBG bash VB basics NNS basing VBG baskets NNS Baskets NNS bastions NNS batches NNS bathed VBN bathrooms NNS baths NNS bats NNS battalions NNS batted VBD batted VBN battered VBD battered VBN batteries NNS battering VBG batter VB batting VBG battled VBD BATTLED VBD battled VBN battlegroups NNS battlements NNS battles NNS battles VBZ battle VB battle VBP battling VBG bays NNS beaches NNS beads NNS beamed VBN beaming VBG beams VBZ beanballs NNS beans NNS bearings NNS bearing VBG bears NNS Bears NNS bears VBZ bear VB bear VBP beasties NNS beasts NNS beaten VBN beating VBG beats NNS beats VBZ beat VB beat VBD beat VBP became VBD becomes VBZ become VB Become VB BECOME VB become VBD become VBN become VBP becoming VBG bedeviled VBN bedevil VB bedfellows NNS Bedfellows NNS bedpans NNS bedrooms NNS beds NNS beefed VBD beefed VBN beefing VBG beef VB been VBN Been VBN been VBP beeping VBG beeps NNS beers NNS bees NNS befallen VBN befall VBP befell VBD befriended VBN befuddled VBD began VBD beggars NNS begged VBN begging VBG beginnings NNS beginning VBG Beginning VBG begins VBZ Begins VBZ begin VB begin VBP begot VBD begs VBZ begun VBN beg VB behaved VBD behaved VBN behaves VBZ behave VB behaving VBG behaviors NNS Beheading VBG behemoths NNS beings NNS being VBG Being VBG BEING VBG being VBG|JJ beleaguered VBN belfries NNS belied VBD beliefs NNS belie VB belie VBP believed VBD believed VBN believes VBZ believe VB believe VBP believing VBG belittle VB bellies NNS bellow VB bellringers NNS bells NNS bellwethers NNS belly-flopped VBD belonged VBD belongings NNS belonging VBG belongs VBZ belong VB belong 
VBP belting VBG belts NNS bemoaning VBG benches NNS benchmarks NNS bending VBG bend VB benefactors NNS beneficiaries NNS Beneficiaries NNS benefited VBD benefited VBN benefiting VBG benefits NNS BENEFITS NNS benefits VBZ benefit VB Benefit VB benefit VBP bequeathed VBD bequests NNS berated VBD berries NNS beset VBN besieged VBN bested VBD bested VBN bestirred VBN bestowed VBN betas NNS betrayed VBN bets NNS Bets NNS betters NNS better VB betting VBG bet VB bet VBD bet VBN bet VBP be VB Be VB BE VB be VBP beverages NNS beware VB Beware VB BEWARE VB bewildered VBN bewildering VBG bewitched VBN biased VBN biases NNS bibles NNS bickered VBN bickering VBG bicycles NNS bidders NNS bidding VBG bids NNS Bids NNS bids VBZ bid VB bid VBD bid VBN bid VBP bifurcate VB bikers NNS Bikers NNS bikes NNS Biking VBG bilges NNS bilking VBG billed VBD billed VBN billings NNS Billings NNS billing VBG billions NNS Billions NNS billowing VBG bills NNS Bills NNS BILLS NNS bills VBZ bill VB bill VBP binders NNS binding VBG binges NNS binoculars NNS bins NNS Bioengineers NNS biographers NNS biologists NNS biomedical-products NNS biopsies NNS birds NNS BIRDS NNS birthdays NNS births NNS biscuits NNS Bishops NNS bites NNS bites VBZ bite VB biting VBG bits NNS bitten VBN Bitten VBN blabs VBZ blacked VBN blackened VBN blacklist VB blackmailed VBN blackmailers NNS blackmailing VBG blackmail VB blackouts NNS blacks NNS Blacks NNS Blackstone VBP blades NNS blamed VBD blamed VBN Blamed VBN blames VBZ blame VB blame VBP blaming VBG Blaming VBG Blandings NNS blanketed VBD blankets NNS blanket VBP blared VBD blasted VBD blasting VBG blasts NNS blast VB blaze VBP blazing VBG bleachers NNS bled VBD bleeding VBG bleed VB blemishes NNS blending VBG blends NNS blend VB blessed VBN blessings NNS bless VB blew VBD blindfolded VBD blinds NNS blini NNS blinked VBD blinkers NNS blinks VBZ blink VB blips NNS blip VB blip VBP blitzes NNS blocked VBD blocked VBN Blocked VBN blocking VBG blocks NNS blocks VBZ block VB block VBP Blondes NNS bloodied VBN bloodletting VBG bloods NNS blooming VBG blossomed VBD blossomed VBN blossoms NNS blotting VBG blowing VBG blown VBN blows NNS blow VB bludgeoned VBN bludgeon VB bluebloods NNS blue-chips NNS blues NNS Blues NNS blundered VBD blunders NNS blunder VB blunted VBD blunted VBN blunt VB blurred VBD blurring VBG blurting VBG blurt VBP blur VB BMWs NNS boarding VBG boardrooms NNS Boardrooms NNS boards NNS board VB boasted VBD boasted VBN boasts VBZ boast VB boast VBP boaters NNS boating VBG boats NNS bodegas NNS bodes VBZ bode VB bodies NNS bodyworkers NNS Boeing VBG bogged VBD bogging VBG bog VB boiled VBD boilers NNS boiling VBG boils VBZ boil VB bolstered VBD bolstered VBN Bolstered VBN bolstering VBG Bolstering VBG bolsters VBZ bolster VB bolted VBN bolts NNS bombarded VBD bombarded VBN bombarding VBG bombed VBD bombed VBN bombers NNS bombings NNS bomblets NNS bombs NNS bomb VB bonded VBN bond-holders NNS bondholders NNS bondholdings NNS bonds NNS Bonds NNS BONDS NNS boned VBN bones NNS bonnets NNS bonuses NNS Bonuses NNS booed VBD booked VBD booked VBN bookers NNS bookings NNS Bookings NNS booking VBG booklets NNS books NNS Books NNS bookstores NNS book VB boomed VBD boomed VBN boomers NNS booming VBG booms NNS boom VB boosted VBD boosted VBN boosters NNS boosting VBG boosts NNS BOOSTS NNS boosts VBZ boost VB Boost VB boost VBP booths NNS Booths NNS bootlegged VBN boots NNS boozing VBG bordered VBN bordering VBG borders NNS bored VBN bore VBD boring VBG borne VBN born VBN Born VBN borrowed VBD 
borrowed VBN Borrowed VBN borrowers NNS Borrowers NNS borrowings NNS borrowing VBG borrowing VBG|NN borrows VBZ borrow VB borrow VBP bosses NNS Bosses NNS botched VBN bothered VBD bothered VBN bothering VBG bother VB bother VBP bottled VBN bottlenecks NNS bottlers NNS bottles NNS bottle VB bottling VBG bottomed VBD bottomed VBN bottoming VBG bottoms NNS bottom VB bought VBD bought VBN bounced VBD bounces NNS bounces VBZ bounce VB bounce VBP bouncing VBG boundaries NNS bounding VBG bounds NNS bounds VBZ bound VBN bourbons NNS bourses NNS boutiques NNS bouts NNS Bouygues NNS bowed VBD bowed VBN Bowing VBG bowling VBG bowls VBZ bowl VBP bow VB bow VBP boxes NNS boycotted VBN boycott VB boyfriends NNS boys NNS Boys NNS braced VBD braced VBN brace VB bracing VBG brags VBZ brag VB braids NNS brains NNS brakes NNS Brakes NNS braking VBG branched VBN branches NNS Branches NNS branching VBG branch VB brandished VBD brandishing VBG brands NNS Brands NNS brands VBZ brassieres NNS brats NNS braved VBD brave VB braving VBG Brawls NNS brazen VB Brazilians NNS breached VBD breached VBN breaches NNS breach VB breaded VBN breakdowns NNS breakers NNS breaking VBG Breaking VBG breaks NNS breaks VBZ breakthroughs NNS break VB break VBP breasts NNS breathed VBD breathe VB breathing VBG breaths NNS bred VBD bred VBN breeders NNS Breeders NNS breeding VBG breed VB breed VBP breezes NNS brethren NNS brewed VBN breweries NNS Breweries NNS brewers NNS brewing VBG brew VB bribed VBD bribed VBN bribes NNS bribe VB bribing VBG bricks NNS bridges NNS bridge VB bridging VBG briefcases NNS briefed VBD briefed VBN briefings NNS briefs NNS BRIEFS NNS brief VB brigades NNS brightened VBD brightened VBN brightening VBG brimmed VBD bringing VBG brings VBZ bring VB bring VBP bristled VBD bristles VBZ bristle VBP British NNS broadcasters NNS Broadcasters NNS broadcasting VBG Broadcasting VBG broadcasts NNS Broadcasts NNS broadcasts VBZ broadcast VB broadcast VBD broadcast VBN broadened VBD broadened VBN broadening VBG broadens VBZ broaden VB brochures NNS broken VBN brokerages NNS broker-dealers NNS brokered VBD brokering VBG brokers NNS Brokers NNS broke VBD Broncos NNS broncs NNS brothers NNS brought VBD brought VBN browbeat VB browse VB browsing VBG bruised VBN bruises NNS bruising VBG brushbacks NNS brushed VBD brushed VBN brushes NNS brushing VBG brush VB Brussels NNS bubbles NNS bubble VB Buccaneers NNS bucked VBD bucked VBN buckets NNS bucking VBG Bucking VBG buckled VBD buckle VB buckling VBG bucks NNS bucks VBZ buck VB buck VBP buddies NNS budding VBG budged VBD budged VBN budgeted VBN budgeteers NNS budgets NNS budge VB buds NNS buffer VB buffeted VBN buffets NNS Buffets NNS buffet VB buffing VBG buffs NNS bugged VBN bugs NNS Bugs NNS builders NNS Builders NNS building-materials NNS building-products NNS buildings NNS Buildings NNS building VBG Building VBG builds VBZ build VB build VBN build VBP built VBD built VBN Built VBN bulbs NNS bulging VBG bulkheads NNS bulldozed VBN bulldozers NNS bulletins NNS bullets NNS bullhorns NNS bullied VBD bullies VBZ bulls NNS Bulls NNS bullying VBG bumble VB bumble VBP bumbling VBG bumped VBD bumped VBN bumps NNS bump VB Bums NNS bunches NNS bundled VBN bundles NNS bundling VBG bungled VBN buns NNS buoyed VBD buoyed VBN Buoyed VBN buoying VBG buoys VBZ buoy VB burbles VBZ burdened VBD burdened VBN burdens NNS Burdens NNS burdens VBZ bureaucracies NNS bureaucrats NNS Bureaucrats NNS bureaus NNS burgeoning VBG burgers NNS burglaries NNS burglarized VBN burgs NNS burials NNS buried 
VBD buried VBN Buried VBN burned VBD burned VBN Burned VBN burning VBG burnishing VBG burnouts NNS burns NNS burns VBZ burnt VBN burn VB burn VBP bursting VBG bursts NNS bursts VBZ burst VBD burst VBP burying VBG bury VB bury VBP buses NNS bushels NNS bushes NNS busies NNS business-communications NNS businesses NNS Businesses NNS business-machines NNS businessmen NNS Businessmen NNS business-partners NNS busloads NNS busted VBD busted VBN busting VBG busts NNS butlers NNS butterflies NNS buttons NNS buttressed VBN buttresses VBZ buttress VB butt VB buy-backs NNS buy-back VB buyers NNS Buyers NNS BUYERS NNS buyings NNS buying VBG Buying VBG buy-outs NNS buyouts NNS buys NNS buys VBZ buy VB Buy VB buy VB|NN buy VBP buzzes VBZ buzzing VBG buzz VB buzzwords NNS bylaws NNS bylines NNS bypass VB bystanders NNS cabin-crew NNS cabinets NNS cables NNS cabs NNS caches NNS cadets NNS cadge VBP cafes NNS cafeterias NNS caked VBD calamities NNS Calaveras NNS calculated VBD calculated VBN Calculated VBN calculates VBZ calculate VB calculate VBP calculating VBG Calculating VBG calculations NNS calculators NNS calibrated VBN Californians NNS called VB called VBD Called VBD called VBN Called VBN CALLED VBN callers NNS Callers NNS calling VBG Calling VBG calls NNS Calls NNS calls VBZ call VB Call VB CALL VB call VBP calmed VBN calming VBG calm VB calories NNS Caltrans NNS calves NNS cameras NNS came VBD camouflaged VBN campaigned VBN campaigning VBG campaigns NNS campaign VB camped VBD campers NNS camps NNS campuses NNS Campuses NNS canals NNS canceled VBD canceled VBN canceled VBN|JJ canceling VBG cancellations NNS cancels VBZ cancel VB cancers NNS cancer-suppressors NNS candidates NNS candies NNS candles NNS canned VBN canning VBG cans NNS canvases NNS canvassed VBN canyons NNS capabilities NNS capacities NNS capacitors NNS capital-assets NNS capital-draining VBG capital-gains NNS capitalgains NNS capital-goods NNS capitalists NNS capitalized VBD capitalized VBN capitalize VB capitalizing VBG Capitalizing VBG capitals NNS capital-to-assets NNS capita NNS capitulated VBD capped VBD capped VBN capping VBG caps NNS capsules NNS captain VBP captioned VBD captivating VBG captives NNS captured VBD captured VBN captured VBN|JJ capture VB capturing VBG cap VB cap VBP carats NNS carbide-products NNS car-dealers NNS cardholders NNS cardinals NNS cards NNS cared VBD cared VBN careened VBD careening VBG careen VB careers NNS Careers NNS cares VBZ care VB care VBP caricatures NNS caricature VB carillons NNS caring VBG carnivores NNS carpenters NNS carpetbaggers NNS carpeted VBD carpets NNS carp VBP carried VBD carried VBN Carried VBN carriers NNS Carriers NNS carries VBZ carry-forwards NNS carryforwards NNS carrying VBG Carrying VBG carry VB carry VBP cars NNS Cars NNS carted VBD carting VBG Cartons NNS cartoonists NNS cartoons NNS cartridges NNS carts NNS cart VBP carved VBD carved VBN carvers NNS carves VBZ carve VB carve VBP carving VBG cascaded VBD cascading VBG caseloads NNS cases NNS Cases NNS cases VBZ cashed VBD cashed VBN cashing VBG cash VB casings NNS casinos NNS Casinos NNS caskets NNS cassettes NNS castigated VBN castigate VB castigating VBG castings NNS casting VBG castling VBG casts VBZ cast VB cast VBD cast VBN Cast VBN casualties NNS cataclysms NNS cataloging VBG catalogs NNS catapult VB catapult VBP Cataracts NNS catastrophes NNS catchers NNS catches VBZ catching VBG Catching VBG catch VB catch VBP categories NNS categorized VBN catered VBD catering VBG caters VBZ cater VB cater VBP CATFISH NNS 
catheters NNS cathodes NNS cats NNS cattle NNS Cattle NNS caught VBD caught VBN caused VBD caused VBN causes NNS causes VBZ cause VB cause VBP causing VBG cautioned VBD cautioning VBG cautions VBZ caution VB caution VBP caveats NNS caved VBD caves NNS C.D.s NNS CDs NNS ceased VBD ceased VBN ceases VBZ cease VB cease VBP ceded VBD ceded VBN cede VB ceding VBG ceilings NNS celebrated VBD celebrates VBZ celebrate VB celebrate VBP celebrating VBG celebrations NNS celebrities NNS cellars NNS cellists NNS cells NNS celluloids NNS cemented VBN Cementing VBG cement-makers NNS cement VB censored VBN censor VBP censured VBD centenarians NNS centered VBD centered VBN centering VBG centers NNS centers VBZ Centers VBZ center VBP centimeters NNS centralized VBN centralize VB cents NNS Cents NNS centuries NNS centurions NNS CEOs NNS ceramics NNS cereals NNS ceremonies NNS certificates NNS Certificates NNS CERTIFICATES NNS certified VBN certify VB CFCs NNS chafed VBN chafe VBP chains NNS Chains NNS chain VBP chaired VBD chaired VBN chairmen NNS chairs NNS chairs VBZ chalked VBN chalking VBG challenged VBD challenged VBN challengers NNS challenges NNS challenges VBZ Challenges VBZ challenge VB challenge VBP challenging VBG chambers NNS Chambers NNS Champagnes NNS championed VBD championed VBN championing VBG championships NNS champions NNS champion VBP champs NNS chances NNS chance VBP chandeliers NNS changed VBD CHANGED VBD changed VBN CHANGED VBN changes NNS Changes NNS changes VBZ change VB Change VB change VBP changing VBG Changing VBG channeled VBD channels NNS channel VB channel VBP Chans NNS chanted VBD chanted VBN chanting VBG chants NNS chaps NNS chapters NNS characteristics NNS characterized VBD characterized VBN characterizes VBZ characterize VB characterize VBP characterizing VBG characters NNS Characters NNS charged VBD charged VBN charge-offs NNS charges NNS Charges NNS charges VBZ charge VB Charge VB charge VBP charging VBG charities NNS Charities NNS charlatans NNS Charlestonians NNS charred VBN chartered VBD chartered VBN charts NNS charts VBZ chart VB chart VBP chased VBD chasers NNS chase VB chasing VBG chassis NNS chastened VBD chastened VBN chastised VBD chastised VBN chastises VBZ chatting VBG chat VB chauffeurs NNS chauffeur VB cheapens VBZ cheated VBD cheaters NNS cheating VBG cheat VB cheat VBP checkbooks NNS checked VBD checked VBN checking VBG checkpoints NNS checks NNS checks VBZ check VB Check VB check VBP cheered VBD cheered VBN cheering VBG cheerleaders NNS cheers NNS cheer VB cheeses NNS chefs NNS chelicerates NNS chemicals NNS Chemicals NNS chemical-weapons NNS chemists NNS cherished VBN cherishes VBZ cherries NNS cherubs NNS chewed VBD chewing VBG chews NNS chew VB chew VBP chickens NNS Chickens NNS chided VBD chided VBN chides VBZ chiefs NNS children NNS Children NNS CHILDREN NNS chilled VBN chilling VBG chill VB chimes VBZ chimneys NNS chimpanzees NNS Chinese NNS chipped VBN chipping VBG chips NNS chisel VB chlorofluorocarbons NNS choices NNS Choices NNS choked VBD choked VBN choke VB choking VBG chooses VBZ choose VB Choose VB choose VBP choosing VBG chopped VBN chopping VBG chops NNS chopsticks NNS chop VB chords NNS choreographers NNS chores NNS Chores NNS chortled VBD choruses NNS chosen VBN chose VBD Christians NNS chromosomes NNS Chronicles NNS chronicles VBZ chucked VBD chuckles NNS chuckles VBZ chuckling VBG chug VBP chums NNS chunks NNS churches NNS church-goers NNS churn VB churn VBP cials NNS cigarettes NNS cigars NNS cinch VB Cincinnati NNS circled VBD 
circles NNS circles VBZ circuits NNS circulars NNS circulated VBD circulated VBN circulate VB circulate VBP circulating VBG circulations NNS circumstances NNS circumventing VBG circumvents VBZ circumvent VB citations NNS cited VBD cited VBN Cited VBN cites VBZ cite VB cite VBP Citicorp VB cities NNS Cities NNS citing VBG Citing VBG citizens NNS Citizens NNS CITIZENS NNS civics NNS civilians NNS civil-rights NNS clad VBN Clad VBN claimants NNS CLAIMANTS NNS claimed VBD claimed VBN claiming VBG Claiming VBG claims NNS Claims NNS claims VBZ CLAIMS VBZ claim VB claim VBP clambered VBD clamored VBD clampdowns NNS clamped VBN clamping VBG clamp VB clanging VBG clanking VBG clarifications NNS clarified VBN clarifies VBZ clarifying VBG clarify VB clashed VBD clashed VBN clashes NNS clash VB classed VBN classes NNS Classes NNS classics NNS Classics NNS classifications NNS classified VBD classified VBN classifies VBZ classify VB classmates NNS classrooms NNS clauses NNS Claws NNS cleaned VBD cleaned VBN cleaners NNS clean-fuels NNS cleaning VBG Cleaning VBG cleansed VBD cleansed VBN cleansers NNS cleanse VB cleansing VBG cleans VBZ clean VB clearances NNS cleared VBD cleared VBN clearing VBG clears VBZ CLEARS VBZ clear VB clear VBP clergy NNS clerics NNS clerks NNS clicked VBD clientele NNS clients NNS Clients NNS climbed VBD climbed VBN climbers NNS climbing VBG climbs VBZ climb VB clinched VBD clinching VBG cling VB cling VBP clinical-products NNS clinics NNS clinkers NNS clipped VBD clipped VBN clippings NNS clips NNS clip VB cliques NNS cloak VBP clobbered VBD clobbered VBN clobber VB clocked VBN clocks NNS clocks VBZ clogged VBN clogging VBG cloned VBN clones NNS closed VBD closed VBN Closed VBN closed VBN|VBD closes NNS closes VBZ Closes VBZ close VB Close VB close VBP closings NNS closing VBG closing VBG|NN closures NNS clothed VBN clothes NNS clothiers NNS clouded VBN clouds NNS clouds VBZ cloud VB cloud VBP clowns NNS clubbed VBD CLUBBING VBG clubs NNS Clubs NNS CLUBS NNS clues NNS clumps NNS clustered VBN clusters NNS clutching VBG clutch VBP cluttered VBN clutter VB C'mon VB coaches NNS Coach VB co-anchored VBN coasted VBD coasters NNS coasts NNS coated VBN coatings NNS coats NNS Coats NNS coattails NNS co-authored VBN Co-authors NNS coaxing VBG coax VB cobbled VBD cobbled VBN Cobbs NNS co-chairmen NNS cockatoos NNS cockroaches NNS cocktails NNS coconuts NNS coddled VBN coded VBN co-defendants NNS CODE-NAMED VBN codes NNS code VB co-developers NNS codified VBN Codifying VBG co-edits VBZ coerces VBZ co-exist VB coextrude VBP coffers NNS co-founded VBD co-founders NNS cognoscenti NNS cohere VB cohorts NNS co-host VB coincided VBD coincides VBZ coincide VB coined VBN coins NNS colas NNS cold-cuts NNS coli NNS collaborated VBD collaborate VB collaborate VBP collaborating VBG collaborators NNS collages NNS collapsed VBD collapsed VBN collapses NNS collapses VBZ collapse VB collapsing VBG collars NNS collateralized VBN colleagues NNS Colleagues NNS collected VBD collected VBN collectibles NNS Collectibles NNS collecting VBG collections NNS collectives NNS collectivizers NNS collectors NNS Collectors NNS collects VBZ collect VB collect VBP colleges NNS Colleges NNS college-sports NNS colloquies NNS colonialists NNS colonies NNS colonists NNS color-coded VBN color-coding VBG colored VBN colors NNS color VB columnists NNS columns NNS co-managed VBN co-managing VBG combatants NNS combating VBG Combatting VBG combat VB combed VBN combinations NNS combined VBD combined VBN Combined VBN combines NNS 
combines VBZ combine VB combine VBP combing VBG combining VBG comedies NNS comestibles NNS comes VBZ Comes VBZ comets NNS come VB Come VB come VBD come VBN Come VBN come VBP comforted VBN comforting VBG comforts NNS coming VBG Coming VBG commanded VBN commanders NNS commanding VBG commandos NNS command VB command VBP commemorated VBD commemorate VB commemorate VBP commenced VBD commencing VBG commends VBZ commend VB commentaries NNS commentators NNS Commentators NNS commented VBD commenting VBG Commenting VBG comments NNS Comments NNS COMMENTS NNS comments VBZ comment VB commercialize VB commercializing VBG commercials NNS Commercials NNS commissioned VBD commissioned VBN commissioners NNS commissioning VBG Commissioning VBG commissions NNS Commissions NNS commissions VBZ commitments NNS commits VBZ committed VBD committed VBN committees NNS Committees NNS committees VBZ committes NNS committing VBG commit VB commit VBP commodities NNS Commodities NNS commoditize VB communicated VBD communicated VBN communicate VB communicating VBG communications NNS communiques NNS communists NNS Communists NNS communities NNS commuters NNS COMMUTERS NNS commutes NNS commuting VBG compacted VBN companies NNS Companies NNS COMPANIES NNS companions NNS Compaq VB compared VBD compared VBN Compared VBN compares VBZ compare VB Compare VB COMPARE VB compare VBP comparing VBG Comparing VBG comparisons NNS compatriots NNS compelled VBN Compelled VBN compels VBZ compel VB compensated VBN compensates VBZ compensate VB compensations NNS competed VBD competed VBN competes VBZ Competes VBZ compete VB compete VBP competing VBG Competing VBG competing VBG|JJ competitions NNS competitors NNS Competitors NNS compiled VBN Compiled VBN compiles VBZ compile VB compiling VBG complained VBD complained VBN complaining VBG complains VBZ complaints NNS Complaints NNS complain VB complain VBP complements VBZ complement VB completed VBD completed VBN completes VBZ complete VB completing VBG completions NNS complexes NNS complexities NNS complicated VBD complicated VBN complicates VBZ complicate VB complications NNS complied VBD complied VBN compliments NNS compliment VBP complying VBG Complying VBG comply VB components NNS composed VBN composers NNS composites NNS compositions NNS compounded VBD compounded VBN compounding VBG Compounding VBG compounds NNS compound VB compressed VBN compressors NNS comprised VBN comprises VBZ comprise VB comprise VBP comprising VBG compromised VBN compromises NNS Compromises NNS compromises VBZ compromise VB compromising VBG compulsions NNS Computations NNS computerized VBN computerize VB computerizing VBG computer-products NNS computer-services NNS computers NNS Computers NNS COMPUTERS NNS computer-systems NNS computes VBZ compute VB computing VBG comrades NNS concealed VBN concealing VBG conceal VB conceded VBD conceded VBN concedes VBZ concede VB concede VBP conceding VBG conceived VBD conceived VBN conceive VB conceiving VBG concentrated VBD concentrated VBN concentrates VBZ concentrate VB concentrate VBP concentrating VBG concentrations NNS conceptions NNS concepts NNS concerned VBD concerned VBN Concerned VBN concerning VBG Concerning VBG concerns NNS Concerns NNS concerns VBZ concern VB concertos NNS concerts NNS concessions NNS concluded VBD concluded VBN concludes VBZ conclude VB conclude VBP concluding VBG conclusions NNS concocted VBN concoctions NNS Concocts VBZ concurred VBD concur VB condemned VBD condemned VBN condemning VBG condemns VBZ condemn VB condensers NNS conditioned VBN 
conditioners NNS conditions NNS condominiums NNS condoms NNS condoned VBD condone VB condos NNS conducted VBD conducted VBN conducting VBG conducts VBZ conduct VB conduct VBP conduits NNS Conduits NNS cones NNS conferees NNS Conferees NNS conferences NNS conferred VBD conferring VBG confers VBZ confer VB confer VBP confessed VBD confessed VBN confesses VBZ confessing VBG confessions NNS confidants NNS confided VBD confides VBZ Confiding VBG configurations NNS confined VBN confines NNS confirmed VBD confirmed VBN confirming VBG Confirming VBG confirms VBZ confirm VB confirm VBP confiscated VBD confiscated VBN confiscate VB confiscating VBG conflicted VBN conflicting VBG conflicts NNS conflict VBP conforming VBG conforms VBZ conform VB conform VBP confrontations NNS CONFRONTATIONS NNS confronted VBN Confronted VBN confronting VBG confronts VBZ confront VB confront VBP confused VBD confused VBN confuses VBZ confuse VB confusing VBG confusions NNS conglomerates NNS congratulated VBD congratulated VBN congratulate VB congressmen NNS Congressmen NNS conjures VBZ conjure VBP connected VBN connecting VBG connections NNS connects VBZ connect VB connect VBP connotations NNS connote VB conquer VB conscripts NNS consented VBD consented VBN consenting VBG consents NNS consent VB consequences NNS conservationists NNS Conservationists NNS conservatives NNS Conservatives NNS CONSERVATIVES NNS conserve VB considerations NNS considered VBD considered VBN Considered VBN considering VBG Considering VBG considers VBZ consider VB Consider VB consider VBP consigned VBN consigns VBZ consisted VBD consisted VBN consisting VBG Consisting VBG consists VBZ consist VB consist VBP cons NNS consoles VBZ console VB consolidated VBD consolidated VBN consolidates VBZ consolidate VB consolidating VBG consolidations NNS consonants NNS consorting VBG conspiracies NNS conspirators NNS conspired VBD conspire VBP conspiring VBG constituencies NNS constituents NNS constituted VBD constitutes VBZ constitute VB constitute VBP constrained VBN constrains VBZ constraints NNS constrain VB constricting VBG constrictors NNS constructed VBN constructing VBG constructions NNS construct VB construed VBD construed VBN construe VB construe VBP consultants NNS Consultants NNS consultations NNS consulted VBN consulting VBG consult VB consumed VBD consumed VBN consumer-electronics NNS consumer-goods NNS consumer-products NNS consumers NNS Consumers NNS consume VB consume VBP consummated VBN contacted VBD contacted VBN contacting VBG contacts NNS contact VB contained VBD contained VBN containers NNS containing VBG contains VBZ contain VB contain VBP contaminated VBN cont'd. 
VBN contemplated VBD contemplated VBN contemplates VBZ contemplate VB contemplate VBP contemplating VBG contemporaries NNS contemporize VB contended VBD contended VBN contenders NNS contending VBG contends VBZ contend VB contend VBP contentions NNS contents NNS content VB contestants NNS contested VBD contested VBN contesting VBG contests NNS contest VB Continentals NNS contingencies NNS continued VBD continued VBN Continued VBN continues VBZ continue VB continue VBP continuing VBG Continuing VBG contraceptives NNS contract-drilling VBG contracted VBD contracted VBN contracting VBG Contracting VBG contractions NNS contractors NNS Contractors NNS contracts NNS Contracts NNS contracts VBZ contract VB contract VBP contradicting VBG contradictions NNS contradicts VBZ contradict VB Contras NNS contrasted VBD contrasted VBN contrasting VBG Contrasts NNS contrasts VBZ contrast VB contrast VBP contravened VBN contribued VBD contributed VBD contributed VBN contributes VBZ contribute VB contribute VBP contributing VBG Contributing VBG contributions NNS contributors NNS controlled VBD controlled VBN controllers NNS controlling VBG controls NNS controls VBZ control VB control VBP controversies NNS con VB convenants NNS convened VBD convened VBN convenes VBZ convene VB convene VBP convening VBG conventional-arms NNS conventioners NNS convention-goers NNS conventions NNS converged VBD conversations NNS conversions NNS converted VBD converted VBN converters NNS convertibles NNS converting VBG converts NNS converts VBZ convert VB convert VBP conveyed VBD conveys VBZ convey VB convey VBP convicted VBD convicted VBN convictions NNS CONVICTS VBZ convict VB convinced VBD convinced VBN convinced VBN|JJ convinces VBZ convince VB convince VBP convincing VBG convoluted VBN convolutions NNS convoys NNS convulsions NNS cookbooks NNS cooked VBN cookies NNS cooking VBG cooks NNS cook VB coolants NNS cooled VBD cooled VBN cooling VBG cools VBZ cool VB Cool VB cool VBP cooperated VBD cooperated VBN cooperate VB cooperate VBP cooperating VBG cooperatives NNS coordinated VBN coordinates VBZ coordinate VB coordinating VBG Coors NNS co-payments NNS cope VB copied VBD copied VBN copiers NNS copies NNS co-pilot NNS coping VBG co-presidents NNS co-produce VB copycats NNS copying VBG copyrighted VBN copyrights NNS copy VB copy VBP Copy VBP cores NNS corkscrews NNS cornered VBN corners NNS cornerstones NNS corner VB cornices NNS corporate-earnings NNS corporates NNS corporations NNS Corporations NNS corral VB corrected VBD corrected VBN correcting VBG corrections NNS corrects VBZ correct VB correct VBP correlate VBP corresponded VBD correspondents NNS Corresponding VBG correspond VB corridors NNS corroborate VBP Corvettes NNS cosmetics NNS Cosmetics NNS cosmologies NNS co-sponsored VBN co-sponsors NNS co-sponsor VB Costa NNS costing VBG costs NNS Costs NNS costs VBZ costumed VBN costumes NNS cost VB cost VBD cost VBN cost VBP cots NNS cottages NNS couched VBN couching VBG coughed VBD coughed VBN coughing VBG coughs NNS cough VB councilors NNS councils NNS counseled VBN counseling VBG counselors NNS counsels VBZ counsel VB counted VBD counted VBN countenance VB counteracted VBN counteract VB counterattack VB counterbalanced VBN counterbidders NNS counterbids NNS counter-claims NNS counterclaims NNS countered VBD Countered VBD countered VBN countering VBG countermeasures NNS counterparts NNS counterprogram VB counters NNS countersued VBD countersued VBN countersuing VBG countervailing VBG counter VB counter VBP counties NNS 
counting VBG Counting VBG countries NNS Countries NNS countrymen NNS counts NNS counts VBZ count VB count VBP Count VBP coupled VBN Coupled VBN couples NNS couplets NNS coup-makers NNS coupons NNS Coupons NNS coups NNS couriers NNS coursed VBN courses NNS courted VBD courted VBN courtesies NNS courthouses NNS courting VBG courtrooms NNS courts NNS Courts NNS COURTS NNS courts VBZ court VB cousins NNS covenants NNS coverages NNS covered VBD covered VBN coverings NNS covering VBG covers NNS covers VBZ coverts NNS cover VB cover VBP coveted VBN covets VBZ cowards NNS cowboys NNS Cowboys NNS cower VBP co-workers NNS cows NNS CPAs NNS crabs NNS cracked VBD cracked VBN cracking VBG Cracking VBG crackle VBP cracks NNS crack VB crack VBP crafted VBD crafted VBN crafting VBG craftsmen NNS craft VB crammed VBD cramming VBG cramps NNS crams VBZ cranes NNS craning VBG cranked VBD cranked VBN cranks NNS crank VB crashed VBD CRASHED VBD crashes NNS crashes VBZ crashing VBG Crashing VBG crash VB cratering VBG crates NNS crave VBP crawled VBD crawling VBG crawls VBZ crawl VB creak VB creamed VBN creams NNS created VBD created VBN Created VBN creates VBZ Creates VBZ create VB Create VB create VBP creating VBG creations NNS creators NNS Creators NNS creatures NNS credentials NNS credit-data NN|NNS credited VBD credited VBN crediting VBG creditors NNS Creditors NNS credit-ratings NNS credits NNS credits VBZ credit VB credit VBP creeping VBG crematoriums NNS crept VBD crept VBN crest VB crevasses NNS crevices NNS crews NNS cries VBZ crimes NNS criminalize VB criminals NNS crimping VBG crimp VB cringed VBD crippled VBN cripples NNS crippling VBG crises NNS Crises NNS crisscrossing VBG criss-cross VBP criteria NNS criticisms NNS criticized VBD criticized VBN criticizes VBZ criticize VB criticize VBP criticizing VBG critics NNS Critics NNS croak VBP croissants NNS cronies NNS crooks NNS crooned VBD croons VBZ cropped VBD cropped VBN cropping VBG crops NNS crop VB CROSS-BRED VBD crossed VBD crossed VBN crosses VBZ crossing VBG cross-pollinated VBN crossroads NNS cross-shareholdings NNS cross VB cross VBP crouched VBD Crouched VBN crowded VBD crowded VBN crowds NNS crowds VBZ crowd VBP crowed VBN crowned VBN crows VBZ CRs NNS crudes NNS cruisers NNS cruise VBP cruising VBG crumbled VBD crumbled VBN crumbles VBZ crumble VB crumbling VBG crumpled VBD crumpled VBN crunched VBD Crunch VB crushed VBN crushes VBZ crushing VBG crush VB crying VBG cryptographers NNS crystals NNS cry VB cry VBP cubs NNS cuckoos NNS cues NNS cuff VB culled VBN culminated VBD culminated VBN culminates VBZ culminating VBG culprits NNS cultivated VBN cultivates VBZ cultivating VBG cultures NNS cups NNS curators NNS curbed VBD curbed VBN curbing VBG CURBING VBG curbs NNS curb VB Curb VB cured VBN cures NNS cure VB curing VBG curled VBD curl VB currencies NNS currents NNS curry VB cursed VBD curses NNS curtailed VBD curtailed VBN curtailing VBG curtail VB curtail VBP curtains NNS cushioned VBN cushioning VBG cushion VB customers NNS Customers NNS customized VBN customs NNS Customs NNS cutbacks NNS cutouts NNS cuts NNS cuts VBZ cutters NNS cutting-tools NNS cutting VBG Cutting VBG cutting VBG|NN cut VB Cut VB cut VBD cut VBN cut VBP cuvees NNS CVB NNP cycads NNS Cycads NNS cycles NNS cyclicals NNS cycling VBG cyclists NNS c-Yields NNS czars NNS dabbled VBD dabble VB dabbling VBG dabs VBZ dailies NNS dalliances NNS damaged VBD damaged VBN damages NNS damages VBZ damage VB damage VBP damaging VBG damped VBN damped VBP dampened VBD dampen VB damping 
VBG damp VB damp VBP dams NNS dancers NNS dances NNS dance VB dancing VBG dangers NNS Dangers NNS dangled VBN dangling VBG dared VBD dare VB dare VBP darlings NNS dashed VBD dashed VBN dashes NNS dash VB databases NNS data NN|NNS data NNS Data NNS data NNS|NN dated VBN dates NNS dates VBZ date VB date VBP dating VBG daughters NNS daunted VBD daunting VBG dawdling VBG dawning VBG dawns VBZ days. NNS days NNS Days NNS dazzled VBN deactivates VBZ deadbeats NNS deadlines NNS deadlocked VBN dead VBN Deafening VBG dealerships NNS dealers NNS Dealers NNS dealings NNS dealing VBG Dealing VBG dealmakers NNS deals NNS deals VBZ dealt VBD dealt VBN deal VB deal VBP deaths NNS debacles NNS debated VBN debates NNS Debates NNS debate VB debating VBG debentures NNS debtholders NNS debtors NNS debts NNS debunk VB debuted VBD debut VB decades NNS Decades NNS decapitalize VBP deceased VBN deceived VBD deceived VBN deceive VB decelerated VBN decelerating VBG decentralized VBN decentralizing VBG decertified VBN decided VBD decided VBN decides VBZ decide VB Decide VB decide VBP deciding VBG Deciding VBG decimated VBN decision-makers NNS decisions NNS Decisions NNS decked VBN deckhands NNS decking VBG decks NNS declarations NNS declared VBD declared VBN declares VBZ declare VB declare VBP declaring VBG Declaring VBG declassifying VBG declined VBD declined VBN decliners NNS Decliners NNS declines NNS Declines NNS declines VBZ decline VB decline VBP declining VBG Declining VBG decontaminated VBN decorated VBN Decorated VBN decorators NNS decoys NNS decreased VBD decreased VBN decreases NNS decreases VBZ decrease VB decrease VBP decreasing VBG decreed VBN decribed VBD decried VBD decries VBZ decrying VBG Dec. VB dedicated VBD dedicated VBN deduces VBZ deducted VBN deductibles NNS deducting VBG deductions NNS Deductions NNS deduct VB deeds NNS deemed VBD deemed VBN de-emphasized VBN de-emphasize VB deems VBZ deem VBP deepened VBD deepening VBG deer NNS Deer NNS defaulted VBD defaulted VBN defaulters NNS defaulting VBG defaults NNS Defaults NNS default VB defeated VBD defeated VBN defeating VBG defeats NNS defeats VBZ defeat VB defected VBD defecting VBG defections NNS Defections NNS defects NNS defect VB defendants NNS Defendants NNS defended VBD defended VBN defenders NNS Defenders NNS defending VBG Defending VBG defends VBZ defend VB defend VBP defense-electronics NNS defenses NNS deferred VBN deferring VBG defer VB deficiencies NNS deficits NNS defied VBD defies VBZ defined VBD defined VBN defines VBZ define VB defining VBG Defining VBG definitions NNS deflated VBD deflated VBN deflate VB deflators NNS deflected VBD deflecting VBG deflect VB deformed VBN defrauded VBD defrauded VBN defrauding VBG defraud VB defunct VB defuse VB Defuse VB defying VBG defy VBP degenerated VBD degenerated VBN degenerate VB degrees NNS delayed VBD delayed VBN delaying NN|VBG delaying VBG delays NNS delays VBZ DELAYS VBZ delay VB delay VBP delegates NNS Delegates NNS delegate VB delegating VBG deleted VBN delete VB deleting VBG deletions NNS deliberate VB deliberating VBG deliberations NNS delighted VBD delighted VBN delights VBZ delight VB DELIGHT VBP deli NNS delinquencies NNS delinquents NNS delisted VBN delivered VBD delivered VBN deliveries NNS Deliveries NNS delivering VBG delivers VBZ deliver VB Deliver VB deliver VBP delousing VBG deluged VBD deluged VBN delved VBN delves VBZ demagogues NNS demanded VBD demanded VBN demanding VBG demands NNS demands VBZ demand VB demand VBP demeaned VBN demeanors NNS demilitarize VB 
demobilize VB demobilizing VBG democracies NNS democratized VBN democratize VB Democrats NNS demographics NNS Demographics NNS demolished VBD demolished VBN demolishing VBG demolish VB demonized VBN demons NNS demonstrated VBD demonstrated VBN demonstrates VBZ demonstrate VB demonstrate VBP demonstrating VBG demonstrations NNS Demonstrations NNS demonstrators NNS demoted VBN demurs VBZ denationalized VBN denials NNS denied VBD denied VBN Denied VBN denies VBZ denims NNS Denizens NNS denominated VBN denominations NNS denounced VBD denounced VBN denounce VB denounce VBP denouncing VBG dentists NNS dents NNS denuclearized VBN denude VB denying VBG deny VB deny VBP departed VBD departing VBG Departing VBG departments NNS departures NNS depart VB depart VBP depended VBD depended VBN dependents NNS depending VBG Depending VBG depends VBZ depend VB depend VBP depicted VBN depicting VBG Depicting VBG depicts VBZ depict VB depict VBP depleted VBD depleted VBN depletes VBZ deplete VB deplores VBZ deploring VBG deployed VBD deployed VBN deported VBD deposed VBN deposited VBD deposited VBN depositing VBG depositions NNS depositors NNS deposits NNS Deposits NNS deposits VBZ deposit VB depots NNS depreciated VBD depredations NNS depressed VBD depressed VBN depresses VBZ depressing VBG depressions NNS depress VB depress VBP deprivations NNS deprived VBD deprived VBN deprived VBN|JJ deprives VBZ deprive VB depriving VBG deprogrammings NNS depths NNS deputies NNS Deputies NNS derailed VBN derailing VBG derail VB deregulated VBN deregulate VB derided VBD derided VBN derivatives NNS derived VBD derived VBN derives VBZ derive VB deriving VBG Descendants NNS descended VBD descending VBG descends VBZ descents NNS described VBD described VBN describes VBZ describe VB describe VBP describing VBG Describing VBG descriptions NNS deserted VBN deserts NNS desert VB deserved VBD deserves VBZ deserve VB deserve VBP designated VBN designates VBZ designate VB designating VBG designations NNS designed VBD designed VBN designees NNS designers NNS designing VBG designs NNS Designs NNS designs VBZ design VB desired VBD desired VBN desires NNS desire VBP desist VB desks NNS despairs VBZ despised VBN despise VB despots NNS destabilize VB destabilizing VBG destinations NNS Destinations NNS destined VBN destroyed VBD destroyed VBN destroying VBG destroys VBZ destroy VB destroy VBP detached VBN Detached VBN detailed VBN detailing VBG details NNS Details NNS detained VBD detained VBN detaining VBG detected VBD detected VBN detecting VBG detectives NNS detectors NNS detect VB detect VBP detergents NNS deteriorated VBD deteriorated VBN deteriorates VBZ deteriorate VB deteriorate VBP deteriorating VBG determined VBD determined VBN determines VBZ determine VB determine VBP determining VBG Determining VBG deterred VBD deterrents NNS deterring VBG deters VBZ deter VB deter VBP detests VBZ dethroned VBN detractors NNS detracts VBZ detract VB devaluations NNS devalued VBD devalued VBN devastated VBD devastated VBN devastating VBG developed VBD developed VBN Developed VBN developers NNS Developers NNS developing VBG Developing VBG developments NNS Developments NNS develops VBZ develop VB Develop VB develop VBP deviated VBD deviations NNS devices NNS devils NNS devised VBD devised VBN devises VBZ devise VB devise VBP devoted VBD devoted VBN devotees NNS Devotees NNS devotes VBZ devote VB devote VBP devoured VBN devouring VBG dewatering VBG diabetics NNS diagnosed VBD diagnosed VBN diagnosing VBG diagnostics NNS diagramming VBG dialects NNS 
dialing VBG DIALING VBG dials NNS dial VB diamonds NNS diapers NNS diaries NNS dibenzofurans NNS dice NNS dickered VBD dictated VBD dictated VBN Dictates NNS dictates VBZ dictate VB dictate VBP dictatorships NNS dictators NNS did VBD Did VBD died VBD Died VBD DIED VBD died VBN Died VBN die-hards NNS Diehards NNS diesels NNS dies VBZ Dies VBZ die VB Die VB die VBP Die VBP differed VBD differences NNS Differences NNS differentials NNS differentiate VB differentiating VBG differing VBG differs VBZ differ VB differ VBP difficulties NNS digested VBN digesting VBG digest VB digging VBG digits NNS dignify VB dignitaries NNS digs NNS DIGS NNS digs VBZ dig VB diluted VBD diluted VBN diluted VBN|JJ dilute VB diluting VBG dimensions NNS dimes NNS diminished VBD diminished VBN diminishes VBZ diminishing VBG diminish VB diminish VBP dimming VBG dined VBD dined VBN diners NNS dining VBG dinners NNS dinosaurs NNS dioxins NNS diplomats NNS dipped VBD dipped VBN dipping VBG dips NNS dip VB Dip VB directed VBD directed VBN directing VBG directions NNS directives NNS directories NNS directors NNS Directors NNS directs VBZ direct VB Dirks NNS disabilities NNS disabled VBN disabled-workers NNS disabling VBG disadvantages NNS disagreed VBD disagreed VBN disagreements NNS disagrees VBZ disagree VB disagree VBP disallowed VBD disappeared VBD disappeared VBN disappears VBZ disappear VB disappear VBP disappointed VBD disappointed VBN disappointments NNS Disappointments NNS DISAPPOINTMENTS NNS disappoint VB disapproved VBD disapproved VBN disapproves VBZ disapprove VBP disarming VBG disarm VB disassemble VB disassociate VB disasters NNS Disasters NNS disavowed VBD disbanded VBN disbanding VBG disband VB disbursed VBN disbursements NNS discarded VBD discarded VBN discard VB discerns VBZ discern VB discharged VBN discharges NNS discharge VB disciples NNS disciplined VBD disciplined VBN disciplining VBG disclaims VBZ disclosed VBD disclosed VBN discloses VBZ disclose VB disclosing VBG disclosures NNS Disclosures NNS discolored VBN discomfit VB disconnected VBN disconnect VB discontinued VBN discontinue VB discontinuing VBG discos NNS Discos NNS discounted VBD discounted VBN Discounted VBN discounting VBG discounts NNS discounts VBZ discount VB discouraged VBD discouraged VBN discourages VBZ discourage VB discourage VBP discouraging VBG discovered VBD discovered VBN discoveries NNS discovering VBG discovers VBZ discover VB discover VBP discredited VBN discredit VB discrepancies NNS Discrepancies NNS discrepencies NNS discriminating VBG discs NNS discussed VBD discussed VBN discussing VBG Discussing VBG discussions NNS discuss VB discuss VBP disdaining VBG disdain VB diseases NNS disembark VBP disenchanted VBN disengage VB disgorge VB disgruntled VBN disguised VBN disguises NNS disguise VB disgusted VBN Disgusted VBN dishes NNS dish VB dishwashers NNS disinclined VBN disinfectants NNS disintegrated VBD disintegrating VBG disks NNS dislikes VBZ dislike VB dislike VBP dislocations NNS dismantled VBN dismantle VB Dismantle VB dismantling VBG dismayed VBN dismember VB dismissed VBD dismissed VBN dismisses VBZ dismissing VBG Dismissing VBG dismiss VB dismiss VBP disobey VB disorders NNS disparaged VBD disparage VB disparaging VBG disparities NNS dispatched VBD dispatched VBN dispatchers NNS dispatches NNS dispatching VBG dispatch VB dispelled VBD dispel VB dispensed VBD dispense VB dispense VBP dispensing VBG dispersants NNS dispersed VBD dispersing VBG displaced VBN displace VB displayed VBD displayed VBN displaying VBG 
displays NNS displays VBZ display VB display VBP displeases VBZ disposables NNS disposals NNS disposed VBD disposed VBN disposes VBZ dispose VB disposing VBG dispositions NNS disprove VB disputed VBD disputed VBN disputes NNS disputes VBZ dispute VB dispute VBP disqualified VBN disqualify VB disregarded VBD disregard VB disrupted VBD disrupted VBN disrupting VBG disruptions NNS disrupt VB dissatisfied VBN dissected VBN dissecting VBG disseminated VBN disseminate VB disseminating VBG dissented VBD dissented VBN dissenters NNS dissents NNS dissidents NNS Dissidents NNS dissipated VBN dissipates VBZ dissipate VB dissociate VB dissociating VBG dissolved VBN dissolves VBZ dissolve VB dissolving VBG dissuade VB distances NNS distance VB distancing VBG distilled VBN distillers NNS distilling VBG distinctions NNS distinguished VBD distinguished VBN distinguish VB distorted VBN distortions NNS distorts VBZ distort VB distort VBP distracted VBD distracted VBN distracting VBG distractions NNS distract VB distributed VBD distributed VBN Distributed VBN distributes VBZ distribute VB distribute VBP distributing VBG Distributing VBG distributions NNS distributors NNS districts NNS districts\/states NNS disturbances NNS disturbed VBN disturbing VBG disturbs VBZ disturb VB dithering VBG dived VBD diverge VB diverging VBG diversifed VBN diversifications NNS diversified VBN diversifying VBG diversify VB Diversify VB diversions NNS divers NNS diverted VBN diverting VBG divert VB divert VBP divested VBD divesting VBG Divesting VBG divestitures NNS divest VB divest VBP dives VBZ dive VB dive VBP divided VBD divided VBN dividends NNS divides VBZ divide VB Divide VB divide VBP dividing VBG dividing VBG|NN|JJ diving VBG divisions NNS divorced VBD divorced VBN divulge VB divvied VBN divvying VBG dizzying VBG dockets NNS dock-siders NNS doctors NNS Doctors NNS DOCTORS NNS doctor VB doctrines NNS docudramas NNS documentaries NNS documented VBN documenting VBG documents NNS Documents NNS documents VBZ document VB dodged VBD dodge VBP does VBZ Does VBZ dogged VBD dogged VBN dogging VBG do-gooders NNS dogs NNS Dogs NNS DOGS NNS dogs VBZ doing VBG Doing VBG doldrums NNS doled VBD dole VB dole VBP doling VBG dollars NNS dolls NNS dolphins NNS domes NNS dominated VBD dominated VBN dominates VBZ dominate VB dominate VBP dominating VBG donated VBD donated VBN donate VB donate VBP donating VBG Donating VBG donations NNS Donations NNS done VBN donned VBD Donning VBG donors NNS dons NNS dons VBZ DON'T VB don VB doomed VBD doomed VBN dooming VBG doomsayers NNS doom VB doormen NNS doors NNS doses NNS dossiers NNS doted VBN doth VBZ dotting VBG dot VBP double-crossed VBD doubled VBD Doubled VBD doubled VBN doubles VBZ double VB double VBP doubling VBG doubted VBD doubters NNS doubts NNS Doubts NNS doubts VBZ doubt VB doubt VBP do VB Do VB do VBP Do VBP dovetails VBZ dove VBD downed VBD downgraded VBD downgraded VBN Downgraded VBN downgrades NNS downgrading VBG downpayments NNS downplayed VBD downsized VBN downsize VB downsizing VBG downs NNS downturns NNS down VBP dozens NNS Dozens NNS drafted VBD drafted VBN drafting VBG draftsmen NNS draft VB dragged VBD dragged VBN dragging VBG Dragging VBG drags VBZ drag VB drag VBP drained VBN draining VBG drains VBZ drain VB dramatizations NNS DRAMs NNS drapes NNS drape VB drawbacks NNS drawings NNS drawing VBG Drawing VBG drawn VBN draws NNS draws VBZ draw VB draw VBP dreaded VBN dreamed VBD dreamed VBN dreaming VBG dreams NNS Dreams NNS dreams VBZ dreamt VBD dream VB Dream VB dream VBP 
dressed VBD dressed VBN dresses NNS dresses VBZ dressing VBG dress VB dress VBP drew VBD dried VBD dried VBN drifted VBD drifting VBG drift VB drift VBP drilled VBD drilled VBN drillers NNS drilling VBG drills NNS drill VB drinking VBG drinks NNS drink VB drink VBP dripping VBG driven VBN drivers NNS drives NNS drives VBZ drive VB drive VBP driving VBG Driving VBG drooled VBD drooling VBG droped VBD droplets NNS dropouts NNS dropped VBD dropped VBN droppers NNS dropping VBG drops NNS drops VBZ drop VB drop VBP droughts NNS droves NNS drove VBD drowned VBD drowned VBN drown VB drug-sales NNS drugs NNS Drugs NNS drugstores NNS drumming VBG drums NNS drum VB drying VBG dry VB Ds NNS dubbed VBD dubbed VBN Dubbed VBN dubs VBZ ducking VBG ducklings NNS ducks NNS Ducks NNS ducks VBZ duck VB ducts NNS duds NNS dueling VBG duels NNS due NNS dues NNS duffers NNS dug VBD dulled VBN dummies NNS dumped VBD dumped VBN dumping VBG dumps NNS dumps VBZ dump VB dump VBP dunes NNS dupes VBZ duplicated VBN duplicate VB Duplicating VBG duplications NNS durable-goods NNS durables NNS dusted VBD dusting VBG duties NNS 'd VBD dwarfed VBN dwarfs VBZ dwarf VB dwarf VBP dwellers NNS dwellings NNS dwindled VBD dwindling VBG dyed VBN dyes NNS dying VBG dynamics NNS dynamos NNS earmarked VBD earmarked VBN earmarking VBG earmark VB earned VBD earned VBN earners NNS earnigs NNS earnings NNS Earnings NNS EARNINGS NNS earning VBG earns VBZ Earns VBZ earn VB earn VBP earrings NNS ears NNS earthlings NNS earthquakes NNS earthworms NNS eased VBD eased VBN eases VBZ ease VB ease VBP easing VBG Easterners NNS East NNS EAST NNS eaten VBN eaters NNS eating VBG eats VBZ eat VB eat VBP eavesdrop VB ebbs VBZ ebb VB eccentrics NNS echelons NNS echoed VBD echoed VBN echoes NNS echoing VBG Echoing VBG echo VB echo VBP eclairs NNS eclipse VB eclipsing VBG economics NNS economies NNS economists NNS Economists NNS economize VB edged VBD edged VBN edges NNS edge VB edging VBG editions NNS editorials NNS Editorials NNS editors NNS edit VB educated VBN educate VB educating VBG educations NNS educators NNS Educators NNS effects NNS effects VBZ effect VB efficiencies NNS efforts NNS Efforts NNS eggs NNS egos NNS ejected VBN eked VBD eke VB elaborate VB elaborating VBG Elaborating VBG elapsed VBN elbows NNS elders NNS elected VBD elected VBN Elected VBN elections NNS Elections NNS ELECTIONS NNS electrical-products NNS electrified VBN electrochemicals NNS electrodes NNS electrogalvanized VBN electrogalvanizing VBG electromagnets NNS electronic-data NN|NNS electronics NNS electronic-systems NNS electro-optics NNS elect VB elect VBP elements NNS Elements NNS elephants NNS elevated VBD elevates VBZ elevations NNS elevators NNS eliminated VBD eliminated VBN eliminates VBZ eliminate VB Eliminate VB eliminate VBP eliminating VBG Eliminating VBG elite NNS elites NNS elitists NNS elongate VB eluded VBD eluding VBG emasculate VB embargoed VBD embargoes NNS embargos NNS embarked VBD embarked VBN embarking VBG embark VB embarrassed VBD embarrassed VBN embarrassing VBG embarrass VB embassies NNS embattled VBN embedded VBN Embedded VBN embellish VB embezzled VBD embezzling VBG emblems NNS embodied VBN embodies VBZ embody VBP emboldened VBN Emboldened VBN embraced VBD embraced VBN embraces VBZ embrace VB embracing VBG embroiled VBN emerged VBD emerged VBN emergencies NNS emerges VBZ emerge VB emerge VBP emerging VBG Emerging VBG emigrated VBD emigrate VB emigres NNS emissaries NNS emissions NNS emitted VBN emoted VBD emote VB emotions NNS empathize VB 
emphasized VBD emphasized VBN emphasizes VBZ emphasize VB emphasize VBP emphasizing VBG emphaticize VB empires NNS employed VBN employees NNS Employees NNS EMPLOYEES NNS employees VBZ employers NNS Employers NNS employing VBG employs VBZ employ VB employ VBP empowered VBN empowering VBG empowers VBZ empower VBP emptied VBN emptying VBG empty VB empty VBP emulated VBN emulate VB emulating VBG enabled VBD enabled VBN enables VBZ enable VB enable VBP enabling VBG enacted VBD enacted VBN enacting VBG enact VB encapsulate VB encasing VBG encircling VBG enclosed VBN enclosing VBG encompassed VBD encompasses VBZ encompass VB encountered VBD encountered VBN encounters NNS encounters VBZ encounter VB encouraged VBD encouraged VBN Encouraged VBN encourages VBZ encourage VB Encourage VB encourage VBP encouraging VBG Encouraging VBG encroaching VBG encrusted VBN encrypting VBG encumbered VBN endangered-species NNS endangered VBN endanger VB endeavoring VBG endeavors NNS endeavor VB ended VBD ENDED VBD ended VBN endings NNS ending VBG Ending VBG endorsed VBD endorsed VBN endorsements NNS endorsers NNS endorses VBZ endorse VB endorse VBP endorsing VBG endowed VBD endowed VBN endow VB ends NNS Ends NNS ends VBZ Ends VBZ end-tailed VBN endured VBD endured VBN endure VB endure VBP Endure VBP enduring VBG end VB End VB end VBP enemies NNS energies NNS energized VBN energy-services NNS enforced VBN enforcers NNS Enforcers NNS enforces VBZ enforce VB enforce VBP enforcing VBG engaged VBD engaged VBN engagements NNS engages VBZ engage VB engage VBP engaging VBG engineered VBN engineering VBG engineers NNS Engineers NNS engineer VB engines NNS English NNS engraved VBN engulfed VBD engulfed VBN enhanced VBD enhanced VBN enhancements NNS enhances VBZ enhance VB enhancing VBG enjoined VBD enjoined VBN enjoin VB enjoyed VBD enjoyed VBN enjoying VBG Enjoying VBG enjoys VBZ enjoy VB enjoy VBP enlarged VBN enlargers NNS enlarge VB enlarging VBG enlightened VBN enlightening VBG enlighten VB enlisted VBD enlisting VBG enlist VB enlivening VBG ennumerated VBD enraged VBD enriching VBG enrich VBP enrolled VBD enrolled VBN enrollees NNS enrollments NNS enroll VB enroll VBP ensconced VBN ensembles NNS ensnarled VBN ensued VBD ensue VB ensuing VBG ensures VBZ ensure VB ensure VBP ensuring VBG entailed VBN entails VBZ entail VB entangled VBN entered VBD entered VBN entering VBG enterprises NNS enters VBZ entertained VBD entertained VBN entertainers NNS entertaining VBG entertain VB enter VB Enter VB enter VBP enthusiasms NNS enthusiasts NNS Enthusiasts NNS enticed VBD entice VB entice VBP enticing VBG entities NNS entitled VBD entitled VBN entitlements NNS entitles VBZ entitle VB entitle VBP entitling VBG entombed VBN entranced VBN entrants NNS entrenched VBN entrench VB entrepreneurs NNS Entrepreneurs NNS entries NNS entrusted VBN entrust VB entrust VBP entwined VBN envelopes NNS environmentalists NNS Environmentalists NNS environments NNS envisaged VBD envisaged VBN envisioned VBN envisions VBZ envision VB envision VBP envy VBP eons NNS epileptics NNS episodes NNS epitomize VBP equaled VBD equaled VBN equaling VBG equals VBZ equal VB equal VBP equated VBN equates VBZ equate VB equestrians NNS equipped VBD equipped VBN Equipped VBN equipping VBG equips VBZ equip VB equities NNS Equities NNS equivalents NNS Equivalents NNS eradicate VB erased VBD erased VBN erases VBZ erase VB erasing VBG Erasing VBG erasures NNS erected VBD erected VBN erect VB eroded VBD eroded VBN erodes VBZ Erodes VBZ erode VB erode VBP eroding VBG errata 
NNS erred VBD erred VBN errors NNS errs VBZ err VB Err VB err VBP erupted VBD erupted VBN erupts VBZ erupt VB escalated VBD escalated VBN escalate VB escalating VBG escalators NNS escaped VBD escaped VBN escape VB escaping VBG eschewed VBN Escorts NNS escorts VBZ escrowed VBN espouse VBP ESPs NNS essays NNS essentials NNS established VBD established VBN Established VBN establishes VBZ establishing VBG Establishing VBG establishments NNS establish VB Establish VB establshed VBN estimated VBD estimated VBN Estimated VBN estimates NNS Estimates NNS estimates VBZ estimate VB estimate VBP estimating VBG Estimating VBG estimators NNS estranged VBN ethics NNS euphemisms NNS Eurobonds NNS Eurodebentures NNS EURODOLLARS NNPS|NNS EURODOLLARS NNS Euroissues NNS Euronotes NNS Europeans NNS evacuated VBN evacuate VB evaded VBN evaders NNS evades VBZ evade VB evaluated VBD evaluated VBN evaluates VBZ evaluate VB evaluate VBP evaluating VBG Evaluating VBG evaluations NNS Evaluations NNS evangelists NNS evaporated VBD evaporated VBN evaporate VB evened VBN evenings NNS evens VBZ events NNS Events NNS even VB Everglades NNS evidenced VBN evil-doers NNS evinced VBD evinced VBN eviscerating VBG evokes VBZ evoke VBP evoking VBG evolved VBD evolved VBN evolve VB evolving VBG exacerbated VBD exacerbated VBN exacerbates VBZ exacerbate VB exacerbating VBG exaggerated VBN exaggerate VB examinations NNS examined VBD examined VBN examiners NNS Examiners NNS examines VBZ examine VB EXAMINE VB examine VBP examining VBG examples NNS Examples NNS exams NNS excavated VBN excavating VBG excavators NNS exceeded VBD exceeded VBN exceeding VBG exceeds VBZ exceed VB exceed VBP excel VB excel VBP exceptions NNS except VB excerpts NNS Excerpts NNS excesses NNS exchanged VBD exchanged VBN exchanges NNS exchange VB exchange VBP exchanging VBG excised VBD excised VBN excise VB excited VBN excite VB exclaims VBZ excluded VBD excluded VBN Excluded VBN Excludes NNS excludes VBZ exclude VB exclude VBP excluding VBG Excluding VBG exclusions NNS excorciate VB excoriated VBN excursions NNS excused VBN excuses NNS excuse VB excutives NNS execs NNS executed VBD Executed VBD executed VBN executes VBZ execute VB execute VBP executing VBG executions NNS Executions NNS executives NNS Executives NNS EXECUTIVES NNS executors NNS exemplified VBN exemplifies VBZ ex-employees NNS exempted VBN exempting VBG exemptions NNS exempt VB exercised VBD exercised VBN exercises NNS exercises VBZ exercise VB exercise VBP exercising VBG exerpts NNS exerted VBN exerting VBG exert VB exhaled VBD exhausted VBD exhausted VBN exhausting VBG exhaust VB exhaust VBP exhibited VBN exhibiting VBG exhibitions NNS exhibitors NNS exhibits NNS exhibits VBZ exhibit VB exhort VB exiled VBD exiled VBN existed VBD existed VBN existing VBG Existing VBG exists VBZ exist VB exist VBP exited VBD exits NNS exits VBZ exit VB exonerated VBD exonerated VBN exonerating VBG exorcise VB exorcisms NNS expanded VBD expanded VBN expanding VBG expands VBZ EXPANDS VBZ expand VB expand VBP expansionists NNS expansions NNS expectations NNS Expectations NNS expected VB expected VBD expected VBN expecting VBG Expects NNS expects VBZ expect VB Expect VB expect VBP EXPECT VBP expedients NNS expedited VBN expedite VB expediting VBG expelled VBD expelled VBN expel VB expended VBN expenditures NNS expenditures VBZ expenses NNS Expenses NNS experienced VBD experienced VBN Experienced VBN experiences NNS experiences VBZ experience VB experience VBP experiencing VBG experimented VBD experimented VBN 
experimenting VBG experiments NNS Experiments NNS experts NNS Experts NNS expirations NNS expired VBD expired VBN expires VBZ expire VB expire VBP expiring VBG explained VBD explained VBN explaining VBG Explaining VBG explains VBZ Explains VBZ explain VB explain VBP explanations NNS exploded VBD exploded VBN explodes VBZ explode VB exploding VBG exploited VBN exploiters NNS exploiting VBG exploits NNS exploits VBZ exploit VB exploit VBP explores VBZ explore VB explore VBP exploring VBG explosions NNS Explosions NNS explosives NNS exported VBD exported VBN exporters NNS exporting VBG exports NNS Exports NNS exports VBZ export VB export VBP exposed VBD exposed VBN expose VB expose VBP exposing VBG exposures NNS expounding VBG expressed VBD expressed VBN expresses VBZ expressing VBG expressions NNS express VB express VBP expunged VBN expunge VB extended VBD extended VBN extending VBG extends VBZ extend VB extend VBP extensions NNS Extensions NNS Exteriors NNS extinguish VB extorted VBD extorting VBG extort VB extracted VBD extracted VBN extracting VBG extracts NNS extract VB extradited VBN extraditions NNS extrapolated VBN extras NNS extremes NNS extremists NNS extricate VB extrusions NNS exuded VBD exude VBP eyeballing VBG eyeballs NNS eyebrows NNS eyed VBD eyeglasses NNS eyeing VBG eyes NNS Eyes NNS eyewitnesses NNS eying VBG F16s NNS F-18s NNS F18s NNS fabricated VBN fabricate VB fabrications NNS fabricators NNS fabrics NNS facades NNS faced VBD faced VBN Faced VBN facelifts NNS faces NNS faces VBZ face VB face VBP facilitate VB facilitate VBP facilitating VBG facilities NNS facings NNS facing VBG FACING VBG facsimiles NNS factions NNS factories NNS Factories NNS factoring VBG factors NNS factors VBZ factor VB factory-jobs NNS facts NNS faded VBD faded VBN fades NNS fade VB fading VBG fads NNS failed VBD FAILED VBD failed VBN failings NNS failing VBG fails VBZ Fails VBZ failures NNS Failures NNS fail VB fail VBP fainting VBG faint VB fairs NNS fajitas NNS faked VBD faked VBN faking VBG fallen VBN falling VBG falls NNS falls VBZ fall VB fall VBP falsified VBN falsifying VBG falsify VB faltered VBD faltered VBN faltering VBG falters VBZ falter VBP famed VBN familiarize VB families NNS Families NNS famines NNS fanatics NNS fancies VBZ fangs NNS fanned VBD fans NNS Fans NNS fans VBZ fantasies NNS fantasize VB fantasize VBP fan VB fared VBD fared VBN fares NNS Fares NNS fares VBZ fare VB fare VBP Fare VBP farmers NNS Farmers NNS FARMERS NNS farming VBG farms NNS farmsteads NNS farms VBZ farm VB farmwives NNS fascinated VBN fascists NNS fashioned VBN fashions NNS fashion VB fastballs NNS fastened VBN fasteners NNS fatalities NNS Fatalities NNS fathers NNS Fathers NNS fats NNS fattened VBD fattened VBN fatten VBP faulted VBN faultlines NNS faults NNS fault VBP fauna NNS favored VBD favored VBN favoring VBG favorites NNS favors NNS favors VBZ favor VB favor VBP fawning VBG faxed VBD faxes NNS feared VBD feared VBN fearing VBG fears NNS Fears NNS fears VBZ fear VB fear VBP feasted VBN feasts NNS feathers NNS feats NNS featured VBD featured VBN features NNS features VBZ feature VB feature VBP featuring VBG fed VBD fed VBN feeding VBG feedlots NNS Feedlots NNS feeds VBZ feed VB feed VBP feelers NNS feelings NNS Feelings NNS feeling VBG Feeling VBG feels VBZ Feels VBZ feel VB feel VBP fees NNS Fees NNS feet NNS fellas NNS FELLED VBD felled VBN fellows NNS fell VBD fell VBN felonies NNS felons NNS felt VBD felt VBN females NNS feminists NNS fences NNS fended VBD fending VBG Fending VBG fend VB 
ferreting VBG ferret VB ferries NNS ferrying VBG ferry VB fertilized VBN fertilizers NNS fertilizing VBG fester VB festivals NNS festivities NNS festooned VBN festooning VBG fetched VBD fetches VBZ fetching VBG fetch VB fetch VBP fetuses NNS feuded VBD feuding VBG fiberglass NNS fibers NNS ficials NNS fidgeting VBG fiefdoms NNS fielded VBD fielded VBN fielding VBG fields NNS fields VBZ field VB fighter-bombers NNS fighters NNS fighting VBG fights NNS fights VBZ fight VB Fight VB fight VBP figured VBD figured VBN figures NNS Figures NNS figures VBZ figure VB figure VBP figuring VBG Figuring VBG filberts NNS filched VBN filed VBD filed VBN filers NNS files NNS files VBZ file VB file VBP filings NNS filing VBG Filipinos NNS filled VBD filled VBN filling VBG Filling VBG fills VBZ fill VB fill VBP filmed VBD filmed VBN Filmed VBN filming VBG film-makers NNS films NNS filtered VBN Filtered VBN filtering VBG filters NNS filters VBZ finagled VBN finalists NNS finalized VBD finalized VBN finalizing VBG finals NNS financed VBD financed VBN finances NNS FINANCES NNS finances VBZ finance VB financial-crimes NNS financial-services NNS financiers NNS financings NNS financing VBG findings NNS finding VBG Finding VBG finds NNS finds VBZ find VB find VBP Find VBP fine-arts NNS fined VBD fined VBN fines NNS finessed VBN fine-tuning VB fine VB fingered VBN fingering VBG fingerlings NNS fingerprints NNS fingers NNS finger VB fining VBG finished VB finished VBD finished VBN finishes NNS finishes VBZ finishing VBG finish VB finish VBP fireballs NNS fired VBD fired VBN firefighters NNS firehoops NNS firemen NNS fireplaces NNS fireproofing VBG fires NNS fires VBZ Fires VBZ fire VB fireworks NNS firings NNS firing VBG firmed VBD firmed VBN firming VBG firms NNS Firms NNS FIRMS NNS firm VB fisheries NNS fishermen NNS fishing VBG fissures NNS fists NNS fits NNS fits VBZ fit VB fit VBN fit VBP fixed VBD fixed VBN fixes NNS fixing VBG fixtures NNS fix VB fizzes VBZ fizzled VBD fizzled VBN flabbergasted VBN flags NNS Flags NNS flamed VBD flames NNS Flanked VBN flapping VBG flaps NNS flared VBN flare VBP flaring VBG flashbacks NNS flashed VBD flashed VBN flashes NNS flashes VBZ flashing VBG flashlights NNS flash VB flash VBP flats NNS flattened VBD flattened VBN flattening VBG flatten VB flaunts VBZ flaunt VB flavors NNS flawed VBN flaws NNS Flaws NNS flay VB fleas NNS fledging VBG fled VBD fled VBN fleeced VBN fleeing VBG fleets NNS fleets VBZ flee VB fleshpots NNS flew VBD flexing VBG flickered VBD flicking VBG flied VBD fliers NNS flies NNS flies VBZ flight-attendants NNS flights NNS Flights NNS flinch VB flinging VBG flings NNS flipped VBD flipping VBG flips VBZ flip VB flirted VBD flirted VBN flirting VBG floated VBD floated VBN floating VBG Floating VBG floats VBZ float VB float VBP flocked VBD flocked VBN flocking VBG flog VB flooded VBD flooded VBN flooding VBG flood VB floors NNS flopped VBD flopped VBN floppies NNS flora NNS flotations NNS floundered VBN floundering VBG flounder VB flourished VBD flourished VBN flourishing VBG flourish VB flouting VBG flowed VBD flowers NNS Flowers NNS flowing VBG flown VBN flows NNS flows VBZ flow VB flow VBP fluctuated VBD fluctuated VBN fluctuates VBZ fluctuate VB fluctuate VBP fluctuating VBG fluctuations NNS fluids NNS flunking VBG flunk VBP fluoropolymers NNS flush VB flush VBP flying VBG fly VB Fly VB fly VBP foaming VBG focused VBD focused VBN focuses VBZ focusing VBG Focusing VBG focus VB focus VBP foes NNS FOES NNS fog VB foiled VBD foiled VBN Foiled VBN foiling VBG 
foil VB folded VBD folded VBN folders NNS folding VBG fold VB fold VBP folks NNS Folks NNS follies NNS followed VBD followed VBN followers NNS following VBG Following VBG follows VBZ follow VB Follow VB follow VBP fomenting VBG food-fish NNS food-services NNS foods NNS foodstuffs NNS fooled VBN fooling VBG fools NNS fool VB foothills NNS Foothills NNS footing VBG footnoted VBN footnotes NNS footsteps NNS foot VB foot VBP forays NNS forbade VBD forbidden VBN forbidding VBG forbids VBZ forced VBD forced VBN forces NNS Forces NNS forces VBZ force VB force VBP FORCE VBP forcing VBG forecasters NNS forecasting VBG forecasts NNS Forecasts NNS forecasts VBZ forecast VB forecast VBD forecast VBN forecast VBP foreclosed VBD foreclosed VBN Foreclosed VBN foreclosed VBN|JJ foreclosures NNS Foreclosures NNS forefathers NNS foreigners NNS Foreigners NNS forensics NNS forerunners NNS foresaw VBD foreseen VBN foresees VBZ foresee VB foresee VBP foreshadowed VBN forest-products NNS Forest-products NNS forests NNS forfeitures NNS forfeit VB forged VBN forgeries NNS forgets VBZ forgetting VBG forget VB Forget VB forget VBP forge VB forgings NNS forging VBG forgiven VBN forgive VB forgiving VBG forgotten VBN forgot VBD forgot VBN forgo VB forked VBD forked VBN forklifts NNS fork VB formalities NNS formalizes VBZ formats NNS formed VBD formed VBN Formed VBN forming VBG forms NNS forms VBZ formulas NNS formulated VBN formulates VBZ formulate VB formulating VBG formulations NNS form VB form VBP forsaken VBN fortified VBN fortunes NNS forums NNS forward VB fossils NNS fostered VBD fostered VBN fostering VBG foster VB fought VBD fought VBN fouled VBN foundations NNS founded VBD founded VBN Founded VBN foundered VBD foundering VBG founders NNS Founders NNS founding VBG Founding VBG found VBD found VBN fountains NNS Four-fifths NNS foxes NNS fractioning VBG fractions NNS fractured VBD fractured VBN fragmented VBN fragments NNS fragments VBZ frailties NNS framed VBN framers NNS frames NNS frame VB framing VBG franchised VBN franchisees NNS Franchisees NNS franchisers NNS franchises NNS franchises VBZ franchise VB franchising VBG Franciscans NNS francs NNS fraternities NNS frauds NNS frayed VBN fray VB freaked VBN freaks NNS freedoms NNS freed VBD FREED VBD freed VBN Freed VBN freeholders NNS freeing VBG frees VBZ free VB free VBP freeways NNS freezers NNS freezes NNS freezes VBZ freeze VB freighters NNS freight VB French NNS frequencies NNS frequents VBZ freshmen NNS frets VBZ fretted VBD fretting VBG fret VB fret VBP frictions NNS Fridays NNS fried VBN friendships NNS friends NNS Friends NNS friers NNS friezes NNS frigates NNS frightened VBD frightened VBN frighten VB fringes NNS fripperies NNS frittered VBN frittering VBG frocks NNS frogmen NNS frogs NNS frolicked VBN frolic VB fronds NNS fronts NNS froth VB frozen VBN froze VBD fruits NNS frustrated VBD frustrated VBN frustrate VB frustrating VBG frustrations NNS frying VBG fudge VB fudge VBP fueled VBD fueled VBN fueling VBG Fueling VBG fuel-services NNS fuels NNS fuel VB fugitives NNS Fuji NNS fulfilled VBD fulfilled VBN fulfilling VBG fulfills VBZ fulfill VB fulfill VBP fulminations NNS fumes NNS fumes VBZ fuming VBG fumpered VBD functionaries NNS functioned VBD functioning VBG functions NNS function VB function VBP fundamentalists NNS fundamentals NNS funded VBD funded VBN Funded VBN funding VBG fund-raisers NNS fundraisers NNS Fund-Raisers NNS fundraising VBG funds NNS Funds NNS FUNDS NNS funds VBZ fund VB fund VBP fungi NNS funneled VBD funneled VBN 
funneling VBG funnel VB furloughed VBN furloughs NNS furnaces NNS furnished VBN furnishings NNS furnishing VBG furnish VB furriers NNS furrows VBZ furs NNS furthering VBG furthers VBZ further VB fusses VBZ futures NNS Futures NNS FUTURES NNS gadgets NNS gagged VBN gained VB gained VBD gained VBN gainers NNS Gainers NNS gaining VBG gains NNS Gains NNS gains VBZ gain VB gain VBP galaxies NNS galleries NNS gallons NNS galloping VBG gallstones NNS galvanized VBD galvanize VB galvanizing VBG gamblers NNS gamble VB gambling VBG games NNS gangbusters NNS gangs NNS gangsters NNS gaped VBN garages NNS gardeners NNS gardenettes NNS gardening VBG gardens NNS garden VB garments NNS garnered VBD garnered VBN garner VB garner VBP gases NNS Gases NNS gas-gathering VBG gasolines NNS gasped VBD gasp VB gates NNS gathered VBD gathered VBN Gathered VBN gatherings NNS gathering VBG gathers VBZ gather VB gather VBP gauges VBZ gauge VB gauging VBG gave VBD Gave VBD gaze VBP geared VBN gearing VBG gears VBZ gear VB gear VBP geeks NNS Geeks NNS gemsbok NNS gems NNS gender VB generalists NNS generalizations NNS Generalizations NNS generalize VB generated VBD generated VBN generates VBZ generate VB generate VBP generating VBG generations NNS generators NNS genes NNS genres NNS gentleladies NNS gentlemen NNS geosciences NNS Germans NNS GERMANS NNS germs NNS gestured VBD gestures NNS gets VBZ Gets VBZ getting VBG Getting VBG get-togethers NNS get VB Get VB GET VB get VBP Get VBP geysers NNS ghettos NNS ghostbusters NNS Ghostbusters NNS ghosts NNS giants NNS Giants NNS gifts NNS gilts NNS Gilts NNS gimmicks NNS girded VBD girding VBG girls NNS giveaways NNS Giveaways NNS givebacks NNS given VBN Given VBN gives VBZ Gives VBZ giveth VBZ give VB Give VB give VBP GIVE VBP giving VBG Giving VBG gizmos NNS glamorized VBN glamorize VB glanced VBD glares VBZ glasses NNS glaze VB glaze VBP gleaming VBG gleaned VBN glean VB glide VB gliding VBG glimpses NNS glitches NNS glitterati NNS gloated VBD gloaters NNS gloating VBG gloats VBZ gloat VB globalists NNS gloss VB gloss VBP gloves NNS glowed VBD glowing VBG glued VBN glues NNS gluts NNS glutted VBN glut VB glycols NNS gnaw VB goals NNS goats NNS gobbled VBN gobbling VBG goblins NNS Gods NNS goes VBZ Goes VBZ goings-on NNS going VBG Going VBG goldbanded VBN golds NNS golfers NNS golfs NNS Goliaths NNS gone VBN Gone VBN gone VBN|JJ gon VB gon VBG goodies NNS goods NNS Goodyear VBP gored VBN gore VB gorillas NNS gossiping VBG gotten VBN got VB got VBD Got VBD got VBN got VBP go VB Go VB go VBP governed VBD governed VBN governing VBG governmental-affairs NNS government-relations NNS government-securities NNS government-set VBN governmentset VBN governments NNS Governments NNS governors NNS Governors NNS govern VB Govern VB govern VBP grabbed VBD grabbed VBN grabbing VBG grabs NNS grab VB grab VBP grace VB graders NNS grades NNS Grads NNS graduated VBD graduated VBN graduates NNS Graduates NNS graduates VBZ grafted VBN grains NNS Grains NNS GRAINS NNS Grammys NNS grams NNS grandchildren NNS grandees NNS grandkids NNS grandmasters NNS grandmothers NNS grandparents NNS Granges NNS granted VBD granted VBN granting VBG grants NNS grants VBZ grant VB grant VBP grapes NNS Grapes NNS graphics NNS Graphics NNS graphs NNS grappled VBD grapples VBZ grapple VB grasping VBG grasp VB grasp VBP grassroots NNS gratuities NNS graying VBG grazed VBD grazers NNS graze VBP great-grandchildren NNS greats NNS greenhouses NNS greens NNS Greens NNS greeted VBD greeted VBN Greetings NNS greeting VBG 
greets VBZ greet VB grenades NNS grew VBD GREW VBD grew VBN gridlocked VBN grievances NNS grill VBP grimaced VBD grimaces NNS grimace VB grinders NNS grinding VBG grinds VBZ grind VB gringos NNS Grinned VBD grinning VBG grins NNS gripes NNS gripes VBZ gripped VBD gripping VBG grips NNS groans VBZ groceries NNS groped VBD grossing VBG grottoes NNS groundbreakers NNS grounded VBN ground-handling NNS grounding VBG grounds NNS ground VB ground VBD ground VBN grouped VBN groups NNS Groups NNS group VB groused VBD Groused VBD grouses VBZ grovels VBZ growers NNS growing VBG Growing VBG GROWING VBG growls VBZ grown VBN grows VBZ Grows VBZ growths NNS growth VB grow VB grow VBP grueling VBG grumbled VBD grumble VBP guaranteed VBD guaranteed VBN Guaranteed VBN guaranteeing VBG guarantees NNS guarantees VBZ guarantee VB guarantee VBP guarded VBD guarded VBN guarding VBG guards NNS guards VBZ guard VB guard VBP Guber-Peters NNS guerrillas NNS guessed VBD guessing VBG guess VB guess VBP guests NNS Guests NNS guided VBD guided VBN guidelines NNS Guidelines NNS guideposts NNS guides NNS guide VB guiding VBG guilders NNS gunboats NNS gunmen NNS Gunmen NNS gunned VBN gunners NNS gunslinging VBG guns NNS Guns NNS gurus NNS Gurus NNS gushes VBZ gush VBP guts NNS gut VB guys NNS Guys NNS guzzle VB gymnastics NNS gyrated VBD gyrate VB gyrating VBG gyrations NNS habeas NNS habitats NNS habits NNS hackers NNS hackles NNS hacks NNS hack VB had VBD Had VBD had VBN haggle VB hagglings NNS haggling VBG hailed VBD hailed VBN hailing VBG hails VBZ half-states NNS halls NNS hallways NNS halogenated VBD halted VBD halted VBN halting VBG halts NNS halts VBZ halt VB halt VBP halved VBD halved VBN halves NNS halves VBZ halve VB hamburgers NNS hammered VBN hammering VBG hammer VB hamming VBG hampered VBD hampered VBN hampering VBG hampers VBZ hamper VB hamstring VB hamstrung VBN hamstrung VBP handbills NNS handbooks NNS hand-carried VBN handcuffed VBN handcuffs NNS handed VBD handed VBN handicapped NNS handicapped VBN handicap VB handled VBD handled VBN handlers NNS handles VBZ handle VB handle VBP handling VBG handpicked VBN hands NNS handstands NNS hands VBZ hand VB hanged VBN hanging VBG hangs VBZ hang VB hang VBP happened VBD Happened VBD happened VBN Happened VBN happenings NNS happening VBG happens VBZ happen VB happen VBP harangues NNS harangues VBZ harassed VBD harassing VBG harass VB harboring VBG harbors NNS harbors VBZ harbor VBP hardened VBN hardships NNS harmed VBD harmed VBN harms NNS harms VBZ harm VB harm VBP harnessing VBG harped VBD harping VBG harp VB harried VBN harvested VBN harvests NNS harvest VBP hashing VBG hassles NNS hastened VBN hasten VB has VBN has VBP has VBZ Has VBZ HAS VBZ hatched VBN hatch VB hated VBN hates VBZ hate VB hate VBP hats NNS hauled VBD haulers NNS hauling VBG haul VB haunted VBN haunting VBG haunts NNS haunts VBZ haunt VB haunt VBP havens NNS have VB Have VB have VBD have VBN have VBP Have VBP having VBG Having VBG Hawaii NNS hawkers NNS hawking VBG hawks NNS hawk VBP hazards NNS HCFCs NNS HDTVs NNS headaches NNS headed VBD headed VBN Headed VBN heading VBG Heading VBG headlights NNS headlined VBD headlined VBN headlines NNS headphones NNS headquarters NNS headrests NNS headsets NNS heads NNS heads VBZ head VB head VBP healed VBN healing VBG health-products NNS heal VB heaped VBD heaped VBN heaping VBG heard VBD heard VBN Heard VBN hearings NNS Hearings NNS hearing VBG Hearing VBG hears VBZ HEARS VBZ heartened VBN hearts NNS hear VB hear VBP heated VBN heaters NNS heating VBG 
Heating VBG heats VBZ heat VB heaved VBD heaved VBN heavens NNS heaves VBD heavyweights NNS heckled VBN hedgers NNS hedges NNS hedge VB hedge VBP hedging VBG hedging VBG|JJ heebie-jeebies NNS heeded VBD heeded VBN heed VB heels NNS heighborhoods NNS heightened VBD heightened VBN heighten VB heights NNS heirs NNS held VBD held VBN helicopters NNS helped VBD helped VBN Helped VBN helped VBP helping VBG Helping VBG helps VBZ Helps VBZ help VB Help VB help VBP hemorrhaged VBN hemorrhaging VBG hemorrhoids NNS hens NNS Hens NNS heralded VBN herald VB herald VBP herbicides NNS Hercules NNS herding VBG herds NNS herniated VBN heroes NNS herons NNS hesitate VB hesitate VBP hesitating VBG hewed VBD hewn VBN hews VBZ hidden VBN hideouts NNS hiders NNS hides NNS hide VB hiding VBG hid VBD highlands NNS highlighted VBD highlighted VBN highlighting VBG highlights NNS Highlights NNS highlights VBZ highlight VB high-rises NNS highs NNS hightailing VBG hightops NNS highways NNS Highways NNS hiked VBN hikers NNS hikes NNS hills NNS hindered VBD hindered VBN hindering VBG hinders VBZ hinder VB hinge VB hinted VBD hinted VBN hinterlands NNS hinting VBG hints NNS hints VBZ hint VB hint VBP hips NNS hired VBD hired VBN Hired VBN hires NNS hires VBZ hire VB hire VBP hiring VBG Hispanics NNS hissed VBD historians NNS historical-claims NNS historicized VBN hitched VBN hitches NNS hits NNS hits VBZ hitters NNS hitting VBG hit VB hit VBD hit VBN Hit VBN hit VBP Hit VBP hoarding VBG hoards NNS hoard VBP hobbies NNS hobbled VBN hobbles VBZ hobbling VBG hobbyists NNS hobos NNS Hoe VB hog NNS hogs NNS Hogs NNS hoisted VBN holders NNS Holders NNS holdings NNS Holdings NNS holding VBG Holding VBG holdouts NNS holdovers NNS holds VBZ holdups NNS hold VB Hold VB HOLD VB hold VB|NN hold VBP Hold VBP holed VBN holes NNS hole VBP holidays NNS holler VB HomeFed VBN homeowners NNS homered VBD homers NNS homer VB homes NNS homicides NNS homosexuals NNS Homosexuals NNS Hondurans NNS Honduras NNS honed VBN hone VB honorariums NNS honored VBN honoring VBG honors NNS honor VB honor VBP hoods NNS hoodwinked VBN hooked VBD hooked VBN hooking VBG hooks VBZ hookups NNS hook VB hooves NNS hoped VBD hoped VBN hopes NNS Hopes NNS HOPES NNS hopes VBZ hope VB hope VBP hoping VBG Hoping VBG hopping VBG hopscotched VBD hops VBZ horizons NNS hormones NNS horns NNS horoscopes NNS horribles NNS horrors NNS horses NNS Horses NNS horticultural-products NNS hosannas NNS hoses NNS Hoses NNS hoses VBZ hospitalizations NNS hospitalized VBN hospitals NNS Hospitals NNS hostages NNS hosted VBD hostilities NNS hosting VBG hosts NNS hosts VBZ host VB hot-cereals NNS hotel-casinos NNS hoteliers NNS hotels NNS hotlines NNS hounded VBD hounding VBG hours NNS Hours NNS housed VBN households NNS houses NNS houses VBZ house VB house VBP housewares NNS housewives NNS Housings NNS hovered VBD hovered VBN hovering VBG howling VBG huckstering VBG huddled VBD hugged VBD hugging VBG hugs NNS hulking VBG humanities NNS humanizing VBG human-resources NNS human-rights NNS humans NNS humbled VBN hum VB hunched VBN hundreds NNS Hundreds NNS hung VBD hung VBN hunker VB hunted VBN hunter-gatherers NNS hunters NNS hunting VBG hunts VBZ hunt VB hunt VBP hurdles NNS hurled VBN hurling VBG hurl VBP hurricanes NNS hurried VBN hurries VBZ hurry VB hurting VBG hurtling VBG hurts VBZ Hurts VBZ hurt VB hurt VBD hurt VBN hurt VBP husbands NNS HUSBANDS NNS hustlers NNS hustles VBZ hybrids NNS hyenas NNS hyped VBD hypermarkets NNS hyping VBG hypnotized VBN hypocrites NNS hypothesized VBN 
ice-baggers NNS ideals NNS ideas NNS Ideas NNS identified VBD identified VBN identifies VBZ identifying VBG identify VB identify VBP identities NNS ideologies NNS ideologues NNS Ideologues NNS idiots NNS idled VBD idled VBN idle VB idling VBG ignited VBD ignited VBN ignite VB ignored VBD ignored VBN ignores VBZ ignore VB Ignore VB ignore VBP ignoring VBG Ignoring VBG illegalities NNS illnesses NNS ills NNS illuminates VBZ illuminate VB illusions NNS illustrated VBD illustrated VBN illustrates VBZ illustrate VB illustrate VBP illustrations NNS images NNS imagined VBD imagined VBN imagines VBZ imagine VB Imagine VB imagine VBP imagining VBG imbalances NNS imitated VBN imitate VBP imitating VBG immersed VBN immigrants NNS immigrated VBD immunities NNS impacted VBN impacts NNS impact VB impaired VBN impair VB impart VB impeached VBN impeded VBN impedes VBZ impede VB impediments NNS impeding VBG impelled VBN impending VBG imperatives NNS imperfections NNS imperialists NNS imperiled VBN impersonations NNS implanted VBD implanted VBN implanting VBG implant VB implemented VBD implemented VBN implementing VBG implements VBZ implement VB implicated VBN implicate VB implications NNS implied VBD implied VBN implies VBZ implores VBZ imploring VBG implying VBG imply VB imply VBP imported VBD imported VBN imported VBN|JJ importers NNS importing VBG imports NNS Imports NNS imports VBZ import VB import VBP imposed VBD imposed VBN imposes VBZ impose VB Impose VB imposing VBG impounded VBN impound VB impoverished VBN impressed VBD impressed VBN impresses VBZ impressionists NNS impress VB imprisoned VBN Imprisoned VBN imprisoning VBG imprison VB improprieties NNS improved VBD improved VBN improvements NNS improves VBZ improve VB improve VBP improving VBG Improving VBG improvised VBD impugn VB impulses NNS inaugurated VBN incarcerate VB incensed VBN incentives NNS inched VBD inched VBN inches NNS inching VBG inch VB incidents NNS incisions NNS inciting VBG inclined VBN included VBD included VBN Included VBN includes VBZ Includes VBZ include VB include VBP including VBG Including VBG incomes NNS incongruities NNS inconsistencies NNS incorporated VBD incorporated VBN incorporates VBZ incorporate VB incorporating VBG increased VB increased VBD increased VBN Increased VBN increased VBN|JJ increases NNS Increases NNS increases VBZ increase VB Increase VB increase VBP increasing VBG Increasing VBG increments NNS incriminating VBG incumbents NNS incurred VBD incurred VBN incurring VBG incur VB incur VBP indemnify VB independents NNS indexed VBN indexers NNS indexes NNS Indexes NNS index-futures NNS indexing VBG index-options NNS indicated VBD indicated VBN indicates VBZ indicate VB indicate VBP indicating VBG indications NNS indicators NNS indices NNS indicted VBD indicted VBN indictments NNS indict VB individuals NNS Individuals NNS indoctrinated VBN induced VBD induced VBN inducements NNS induces VBZ induce VB inducing VBG indulgences NNS indulges VBZ indulge VB indulging VBG industrialists NNS industrialized VBN industrialize VB industrials NNS industrials VBZ industries NNS Industries NNS inefficiencies NNS inequalities NNS inequities NNS infants NNS Infants NNS infected VBD infected VBN infecting VBG infections NNS inferences NNS inferred VBN infiltrated VBN infiltrate VB infiltrating VBG inflame VB inflated VBD inflated VBN inflates VBZ inflate VB inflating VBG Inflation-adjusted VBN inflicted VBD inflicted VBN inflict VB inflows NNS influenced VBD influenced VBN influences NNS influences VBZ influence VB 
influence VBP influencing VBG information-services NNS information-systems NNS informed VBD informed VBN informing VBG informs VBZ inform VB infractions NNS infringed VBD infringed VBN infringes VBZ infringe VB infringing VBG infuriated VBD infuriate VB infused VBN infuse VB ingest VB ingots NNS ingrates NNS ingratiate VB ingredients NNS inhabited VBN inhabits VBZ inhabit VBP inherited VBD inherited VBN inherits VBZ inherit VBP inhibited VBD inhibit VB inhibit VBP initialed VBD initialing VBG initials NNS initiated VBD initiated VBN initiate VB initiate VBP initiating VBG initiatiors NNS initiatives NNS initiatives VBZ injected VBD injected VBN injecting VBG injections NNS injects VBZ inject VB Inject VB injunctions NNS injured VBD injured VBN injure VB injuries NNS injuring VBG injustices NNS inks NNS inmates NNS innings NNS innocents NNS innoculating VBG innovated VBD innovate VB innovations NNS innovators NNS i NNS inns NNS innuendoes NNS inputs NNS inquired VBD inquiries NNS inquiring VBG inroads NNS insects NNS inserted VBD inserted VBN inserting VBG inserts NNS insert VB insiders NNS Insiders NNS insights NNS insinuating VBG insisted VBD insisted VBN insisting VBG Insisting VBG insists VBZ insist VB insist VBP inspected VBD inspected VBN inspecting VBG inspections NNS inspectors NNS inspectors VBZ Inspects VBZ inspect VB inspect VBP inspirations NNS inspired VBD inspired VBN Inspired VBN inspire VB inspire VBP installations NNS installed VBD installed VBN installing VBG installments NNS install VB install VBP instances NNS instigated VBD instill VB instincts NNS instituted VBD instituted VBN institute VB instituting VBG institutions NNS Institutions NNS instructed VBD instructed VBN instructing VBG instructions NNS instructors NNS instructs VBZ instruct VB instrumentalists NNS instruments NNS Instruments NNS insulated VBN insulate VB insulate VBP insulating VBG insulins NNS insulting VBG insult VB insurance-claims NNS insured VBD insured VBN Insureres NNS insurers NNS Insurers NNS INSURERS NNS insures VBZ insure VB insure VBP insurgents NNS insuring VBG integrated-technologies NNS integrated VBN integrate VB integrating VBG intellectuals NNS intended VBD intended VBN intends VBZ intend VB intend VBP intensified VBD intensified VBN intensifying VBG intensify VB intentions NNS intents NNS interactions NNS interceded VBD intercepted VBN Intercepting VBG interconnected VBN interconnect VB interested VBN interests NNS interests VBZ interest VB interfered VBD interferes VBZ interfere VB interfere VBP interfering VBG interior-furnishings NNS interiors NNS interjects VBZ interloping VBG intermediaries NNS intermixed VBD internationalists NNS international-operations NNS interpretations NNS interpreted VBD interpreted VBN interpreting VBG interprets VBZ interpret VB interpret VBP interrogated VBN interrogators NNS interrupted VBN interrupting VBG interruptions NNS interrupt VB intersections NNS interspersed VBN intersperses VBZ interstates NNS intertitles NNS intertwined VBN intertwining VBG intervals NNS intervened VBD intervened VBN intervene VB intervening VBG interventionists NNS interventions NNS interviewed VBD interviewed VBN interviewing VBG interviews NNS Interviews NNS interview VB intimate VB intimidated VBN intimidate VB intimidating VBG intimidations NNS intones VBZ intrigued VBN intrigues NNS introduced VBD introduced VBN Introduced VBN introduces VBZ introduce VB introducing VBG Introducing VBG introductions NNS intrude VBP intrusions NNS inundated VBN invaded VBD invaders NNS 
invades VBZ invade VB invade VBP invading VBG invalidated VBD invented VBD invented VBN inventing VBG inventions NNS inventories NNS Inventories NNS inventors NNS invent VB invent VBP inverted VBN invested VBD invested VBN investigated VBD investigated VBN investigates VBZ investigate VB investigating VBG Investigating VBG investigations NNS investigators NNS Investigators NNS investing VBG Investing VBG investing VBG|NN investments NNS investor-relations NNS investors NNS Investors NNS invests VBZ invest VB invest VBP invitations NNS invited VBD invited VBN invites VBZ invite VB inviting VBG invoices NNS invoked VBD invoked VBN invokes VBZ invoke VB invoking VBG involved VB involved VBD involved VBN Involved VBN involves VBZ involve VB involve VBP involving VBG IOUs NNS IPOs NNS IRAs NNS Irises NNS irked VBD irked VBN irks VBZ irk VB ironies NNS irons NNS iron VB irradiated VBN irregularities NNS irritated VBN irritates VBZ islands NNS islands VBZ is NNS isolated VBD isolated VBN isolates VBZ isolate VB isolate VBP issued VBD issued VBN issuers NNS issues NNS Issues NNS ISSUES NNS issues VBZ issue VB issue VBP issuing VBG is VBZ Is VBZ IS VBZ itemize VB items NNS Items NNS jabs NNS Jackals NNS jacked VBD jacked VBN jackets NNS Jackets NNS jackhammers NNS jacking VBG jack VB jailed VBD jailed VBN JAILED VBN jails NNS jammed VBD jammed VBN jams NNS jam VB Jan. VB Japanese-Americans NNS Japanese NNS Japanese VBP jarring VBG jars NNS jaunts NNS JAUNTS NNS jeans NNS Jeeps NNS jelled VBD jeopardized VBN jeopardizes VBZ jeopardize VB jeopardizing VBG jerked VBN jetliners NNS jets NNS Jets NNS jettisoning VBG jet VBP jewelers NNS jewels NNS Jews NNS jiggling VBG jillions NNS jingling VBG jinks NNS jinxed VBN jitters NNS jobs NNS Jobs NNS jockeys NNS jocks NNS jogs VBZ joined VBD joined VBN joining VBG Joining VBG joins VBZ JOINS VBZ joint-implants NNS joints NNS join VB JOIN VB join VBP joked VBD jokes NNS jokes VBZ joke VB jolted VBD jolted VBN jolts NNS jolt VB Jolt VB jostle VBP journalists NNS Journalists NNS journals NNS jousting VBG joys NNS judged VBN judgeships NNS judges NNS Judges NNS judge VB Judge VB judge VBP judging VBG Judging VBG judgments NNS jugglers NNS juggle VB juggling VBG jugs NNS juices NNS jumbos NNS jumped VBD jumped VBN jumping VBG Jumping VBG jumps NNS jumps VBZ jump VB jump VBP junctures NNS juniors NNS junk-bond NNS junkets NNS junk-holders NNS junkholders NNS Junk-holders NNS junkies NNS junk VB juries NNS jurisdictions NNS jurists NNS jurors NNS jury-rigged VBD justices NNS justified VBN justifies VBZ justifying VBG justify VB jutting VBG Jutting VBG juxtapose VBP kayoed VBN KC-135s NNS keen VB Keepers NNS keeping VBG Keeping VBG keeps VBZ keep VB Keep VB keep VBP kept VBD kept VBN ketchup VB|IN keyboards NNS keyed VBN keys NNS key VB kickbacks NNS kicked VBD kicked VBN kickers NNS kicking VBG kicks NNS kicks VBZ kick VB kick VBP kiddies NNS kidding VBG kidnapped VBD kidnapped VBN kidnappers NNS kidnap VB kids NNS Kids NNS killed VBD killed VBN killers NNS killings NNS killing VBG kills VBZ kill VB Kill VB kill VBP kilobytes NNS kilograms NNS kilometers NNS kindled VBN kinds NNS kinfolk NNS kingpins NNS kings NNS Kissing VBG Kiss VB kits NNS kneaded VBN kneading VBG knees NNS knew VBD knights NNS knitted VBN knitting VBG knit VBN knocked VBD knocked VBN knocking VBG knocks VBZ knock VB knots NNS knowing VBG Knowing VBG knowns NNS known VBN Known VBN knows VBZ know VB know VBP Know VBP knuckles NNS kowtow VB kronor NNS kudos NNS labeled VBD labeled VBN labeling VBG 
Labeling VBG labels NNS labels VBZ label VB label VBP laboratories NNS laborers NNS laboring VBG Laboring VBG labors NNS labs NNS laced VBN lacked VBD lacked VBN lackeys NNS lacking VBG lacks VBZ lack VBP ladies NNS laggards NNS lagged VBD lagged VBN lagging VBG lagoons NNS lags NNS lags VBZ Lags VBZ lag VB lag VBP laid VBD laid VBN lakes NNS lambasted VBD lambastes VBZ lamented VBD laments VBZ laminated VBN lampposts NNS lamps NNS landed VBD landed VBN landfills NNS landholdings NNS landings NNS landing VBG landlords NNS landowners NNS Landowners NNS landscapers NNS landscapes NNS landslides NNS lands NNS land VB Land VBP lanes NNS languages NNS languished VBD languished VBN languishes VBZ languishing VBG languish VB lapsed VBN lapses NNS lapses VBZ laps NNS laptops NNS Laptops NNS lap VBP lasers NNS lashed VBD lashing VBG lash VB lash VBP lasted VBD lasted VBN lasting VBG lasts VBZ last VB last VBP latched VBN latches VBZ latching VBG latch VBP lathes NNS lauded VBD lauded VBN laughed VBD laughed VBN laughing VBG laughs NNS laughs VBZ laugh VB launched VBD launched VBN launches NNS launches VBZ launching VBG launch VB laundered VBD laundered VBN Laundered VBN launderers NNS laundering VBG launder VB laurels NNS lavished VBN lavishing VBG lawbreakers NNS lawmakers NNS Lawmakers NNS LAWMAKERS NNS law-making NNS lawns NNS laws NNS lawsuits NNS Lawsuits NNS lawyers NNS Lawyers NNS LAWYERS NNS laxatives NNS layers NNS laying VBG layoffs NNS LAYOFFS NNS lays VBZ lay VB lay VBD lay VBP LBOs NNS leaders NNS LEADERS NNS leading VBG Leading VBG leads NNS leads VBZ Leads VBZ lead VB lead VBN lead VBP leafing VBG leaflets NNS leaguers NNS leagues NNS leaked VBD leaked VBN leakers NNS leaking VBG leaks NNS leaned VBD leaned VBN leaning VBG leans VBZ lean VB Lean VB lean VBP leaped VBD leaped VBN leapfrog VB leaping VBG Leaping VBG leaps NNS leapt VBD leap VB leap VBP learned VBD learned VBN learning VBG Learning VBG learns VBZ learn VB learn VBP leased VBD leased VBN leases NNS leases VBZ lease VB lease VBP leasing VBG leasing VBG|NN leathers NNS leaves NNS leaves VBZ leave VB Leave VB leave VBP leaving VBG Leaving VBG lectured VBD lectures VBZ lecture VB lecture VBP ledgers NNS led VBD led VBN Led VBN leeches NNS leftists NNS leftovers NNS left VBD Left VBD left VBN Left VBN legalizing VBG legal-services NNS legions NNS legislate VB Legislating VBG legislators NNS Legislators NNS legislatures NNS legitimized VBN legitimize VB legs NNS lemmings NNS lemons NNS lenders NNS Lenders NNS lending VBG lends VBZ lend VB Lend VB lend VBP lengthened VBD lengthened VBN lengthens VBZ lengthen VB lengthen VBP lengths NNS lenses NNS LENSES NNS lent VBD lent VBN leotards NNS lesbians NNS lesions NNS lessening VBG lessen VB lessers NNS lessons NNS lets VBZ letters NNS Letters NNS letting VBG Letting VBG let VB Let VB let VBD Let VBD let VBN let VBP leveled VBD leveled VBN leveling VBG levels NNS level VB level VBP leveraged VBN Leveraged VBN LEVERAGED VBN leverage VB leveraging VBG levied VBD levy VB liabilities NNS liaisons NNS Liaisons NNS liars NNS libeled VBN liberalizations NNS liberalized VBD liberalized VBN liberalize VB liberalize VBP liberalizing VBG liberals NNS Liberals NNS liberated VBD libertarians NNS liberties NNS librarians NNS libraries NNS lice NNS licensed VBD licensed VBN licenses NNS licenses VBZ license VB license VBP licensing VBG licking VBG Lids NNS lied VBD lies NNS lies VBZ LIES VBZ lieutenants NNS lie VB lie VBP lifeguards NNS lifes NNS lifted VBD lifted VBN lifting VBG lifts NNS lifts 
VBZ lift VB Lift VB lift VBP lighted VBN lightened VBD lightening VBG lighten VB lighting VBG lights NNS Lights NNS light VB liked VBD liked VBN likened VBD likened VBN likening VBG likes NNS likes VBZ like VB like VBP limbs NNS limitations NNS limited VBD limited VBN limiting VBG Limiting VBG limits NNS limits VBZ limit VB limit VBP limousines NNS limping VBG linebackers NNS lined VBD lined VBN liners NNS lines NNS Lines NNS lineups NNS line VB line VBP lingering VBG lingers VBZ linger VB Linger VB lining VBG linkages NNS linked VBD linked VBN linking VBG Linking VBG links NNS links VBZ link VB link VBP Link VBP lions NNS lipoproteins NNS lips NNS lipsticks NNS liquefied VBN liquefies VBZ liquefy VB liquefy VBP liquidated VBD liquidated VBN liquidate VB liquidating VBG Liquidating VBG liquids NNS lire NNS listed VBD listed VBN listened VBD listened VBN listeners NNS Listeners NNS listening VBG listens VBZ listen VB Listen VB listen VBP listings NNS listing VBG lists NNS lists VBZ list VB list VBP lithographs NNS litigants NNS litigators NNS littered VBN litter VBP lit VBD lit VBN lived VBD lived VBN live-hauled VBD live-haulers NNS lives NNS Lives NNS lives VBZ Lives VBZ live VB Live VB live VBP Live VBP living VBG Living VBG loaded VBD loaded VBN loadings NNS loading VBG loads NNS load VB loafers NNS loaned VBD loaned VBN loans NNS Loans NNS loans VBZ loan VB loathed VBD loathed VBN loathes VBZ loaves NNS lobbied VBD lobbied VBN lobbies NNS lobbying VBG lobbyists NNS lobby VB locales NNS localities NNS localized VBN locals NNS located VBN Located VBN locate VB locate VBP locating VBG locations NNS locked VBD locked VBN locking VBG locks NNS locks VBZ lock VB lock VBP locutions NNS lodged VBD lodged VBN lodge VB lodgings NNS logged VBD logged VBN loggers NNS logging VBG logistics NNS logos NNS log-rolled VBD logs NNS logs VBZ log VB log VBP long-term NNS long VBP looked VBD looked VBN lookee-loos NNS looking VBG Looking VBG looks NNS looks VBZ look VB Look VB look VBP looming VBG looms NNS looms VBZ loom VB loom VBP LOOM VBZ loonies NNS loopholes NNS loops NNS loosened VBN loosening VBG loosen VB loose VB looting VBG loot VB lopped VBD lorded VBD lords NNS losers NNS loses NNS loses VBZ lose VB lose VBP losing VBG losses NNS Losses NNS LOSSES NNS lost VBD Lost VBD lost VBN lotions NNS lots NNS Lots NNS lotteries NNS loudspeakers NNS louis NNS lounges NNS lovebirds NNS loved VBD loved VBN lovers NNS loves VBZ Loves VBZ love VB love VBP lowered VBD lowered VBN lowering VBG lowers VBZ lower VB lower VBP low-lifes NNS lows NNS loyalties NNS lubricants NNS lucked VBD lugged VBD lugging VBG lugs NNS lulled VBN lumber VBP luminaries NNS lumped VBN lumping VBG lumps NNS lumps VBZ lunch VB lunged VBD lunging VBG lungs NNS lurched VBD lurching VBG lurch VBP lured VBD lured VBN lures NNS lures VBZ lure VB Lure VBP luring VBG lurking VBG luxuries NNS lying VBG lyrics NNS Lyrics NNS machetes NNS Machiguengas NNS machines NNS Machines NNS machinists NNS Machinists NNS Machinists NNS|NNPS made VBD made VBN Made VBN mafias NNS mafiosi NNS magazines NNS Magazines NNS maggots NNS magicians NNS magistrates NNS magnetized VBN magnets NNS magnified VBD magnified VBN magnify VB Magnolias NNS maharajahs NNS mailed VBD mailed VBN mailers NNS mailings NNS mailing VBG mailmen NNS mail-sorting VBG mail VB mail VBP mainframes NNS mains NNS maintained VBD maintained VBN maintaining VBG maintains VBZ maintain VB maintain VBP Maintain VBP majoring VBG majors NNS makers NNS Makers NNS makes NNS makes VBZ make VB Make VB 
MAKE VB make VBP Make VBP making VBG Making VBG MAKING VBG Makin VBG malefactors NNS males NNS malfunctions NNS maligned VBN malls NNS mammoths NNS manacles NNS managed VBD managed VBN Managed VBN managements NNS managers NNS Managers NNS manages VBZ manage VB manage VBP managing VBG Managing VBG mandated VBD mandated VBN mandates NNS mandates VBZ mandate VB mandating VBG maneuvered VBD maneuvered VBN maneuverings NNS maneuvering VBG maneuvers NNS MANEUVERS NNS maneuvers VBZ maneuver VB manhandled VBN manifestations NNS manifestos NNS manifest VBP maninstays NNS manipulated VBN manipulates VBZ manipulate VB manipulate VBP manipulating VBG manipulations NNS manipulators NNS manned VBD mannerisms NNS manners NNS manning VBG mansions NNS manuals NNS MANUALS NNS manuevering VBG manufactured VBD manufactured VBN manufacturers NNS Manufacturers NNS manufactures VBZ manufacture VB manufacture VBP manufacturing VBG man VB many NNS many VB mapped VBN mapping VBG maps NNS map VB map VBP marathons NNS marbles NNS marched VBD marched VBN marchers NNS marches NNS marching VBG march VB march VBP Margeotes NNS marginalia NNS marginalizing VBG margin-calls NNS margined VBN margining VBG margins NNS Margins NNS markdowns NNS marked VBD marked VBN marketed VBD marketed VBN marketeers NNS marketers NNS Marketers NNS marketing-communications NNS marketing VBG market-makers NNS marketplaces NNS markets NNS Markets NNS markets VBZ market VB market VBP marking VBG markka NNS marks NNS Marks NNS marks VBZ mark VB mark VBP marquees NNS marred VBN marriages NNS married VBD married VBN marrying VBG marry VB marshes NNS marveled VBD marvels NNS marvel VB masked VBD masked VBN masks NNS Masks VBZ masons NNS masquerading VBG massacres NNS massages NNS massage VB massaging VBG massed VBD masses NNS masseurs NNS masseuses NNS mass-media NNS mass-producing VBG mastered VBN masterpieces NNS masters NNS master VB matched VBD matched VBN matches NNS matches VBZ matching VBG match VB match VBP mated VBN materialized VBD materialized VBN materializes VBZ materialize VB materialize VBP materials NNS Materials NNS mates NNS mathematics NNS mating VBG mattered VBN matters NNS Matters NNS|VBZ matters VBZ Matters VBZ matter VB matter VBP matured VBN matures VBZ mature VB mature VBP maturing VBG maturities NNS Maturities NNS maul VB mavens NNS maximize VB maximizing VBG maxims NNS mayors NNS mazes NNS McDonald VB meadows NNS meals NNS meandered VBD meanders VBZ meanings NNS meaning VBG means NNS Means NNS means VBZ Means VBZ meant VBD meant VBN mean VB Mean VB mean VBP measured VBD measured VBN Measured VBN measurements NNS measures NNS Measures NNS measures VBZ measure VB measuring VBG Meats NNS MEATS NNS mechanics NNS mechanisms NNS medallions NNS meddle VB meddling VBG media NNS mediate VB mediators NNS medical-products NNS medicines NNS meetings NNS Meetings NNS meeting VBG Meeting VBG meets NNS meets VBZ meet VB Meet VB meet VBP megabytes NNS mega-crashes NNS Mega-hits NNS mega-issues NNS mega-mergers NNS mega-problems NNS mega-projects NNS megaquestions NNS mega-resorts NNS megawatts NNS melding VBG melds VBZ meld VB mellowed VBN melodies NNS melting VBG melts VBZ melt VB memberships NNS members NNS Members NNS mementos NNS memoirs NNS memorabilia NNS memoranda NNS memorandums NNS memorialized VBN memories NNS memorize VBP memos NNS MEMOS NNS mend VB men NNS Men NNS mentioned VBD mentioned VBN mentioning VBG mentions VBZ mention VB mention VBP mentors NNS menus NNS Mercantilists NNS Mercedes-Benzes NNS Mercedes NNS 
merchandised VBN merchandisers NNS merchandising VBG merchants NNS Merchants NNS merged VBD merged VBN mergers NNS merge VB merge VBP merging VBG meringues NNS merits NNS merit VB merit VBP mesh VB messages NNS messengers NNS messing VBG Messrs. NNS mess VB metabolized VBN metals NNS Metals NNS METALS NNS metal-workers NNS metalworkers NNS metaphors NNS meted VBN meters NNS methodologies NNS methods NNS meting VBG metrics NNS met VBD met VBN mice NNS Mice NNS micoprocessors NNS microbes NNS microchips NNS microcomputers NNS microeconomics NNS microelectronics NNS microphones NNS microprocessors NNS microwaves NNS mid-1940s NNS mid-1960s NNS mid-1970s NNS mid-1980s NNS mid-1990s NNS middlemen NNS midsized VBN miffed VBD miffed VBN MiG-29s NNS mighta MD|VB migrate VB migrations NNS miles NNS milestones NNS military-electronics NNS militate VB militias NNS milked VBN milks NNS milk VB milling VBG millionaires NNS million-plus NNS millions NNS Millions NNS mills NNS mimics NNS mimics VBZ mimic VB mimic VBP mince VB minded VBD minds NNS mind VB Mind VB minefields NNS minerals NNS miners NNS mines NNS mine VBP mingle VBP miniaturized VBN minicars NNS minicomputers NNS minimills NNS minimized VBN minimize VB minimizing VBG minimums NNS mining VBG miniseries NNS ministers NNS ministries NNS minisupercomputers NNS minivans NNS Minneapolis NNS minorities NNS minors NNS minted VBN minting VBG mints NNS minuses NNS Minuses NNS minutes NNS minutiae NNS MIPs NNS MIPS NNS mired VBN mirrored VBD mirrors VBZ mirror VB mirror VBP misadventures NNS miscalculated VBD miscalculated VBN miscarriages NNS misclassified VBN miscreants NNS misdeeds NNS misdemeanors NNS misfortunes NNS misguided VBN mishandled VBD mishandling VBG misinterpreted VBN misinterpret VB misjudged VBD misjudgments NNS mislaid VBN misleading VBG misled VBD misled VBN mismeasurements NNS misperceptions NNS misplaced VBN misquoting VBG misrepresentations NNS misrepresented VBD misrepresenting VBG misrepresents VBZ misrepresent VB misrouted VBN missed VBD missed VBN misses VBZ missiles NNS missing VBG missionaries NNS missions NNS misspent VBN misstated VBD misstated VBN misstatements NNS misstates VBZ miss VB miss VBP mistaken VBN mistakes NNS mistake VB mistreat VB mistresses NNS mistrials NNS mists NNS misunderstandings NNS Misunderstanding VBG misunderstood VBN misused VBD mites NNS mitigate VB mitigating VBG Mitsubishi NNS Mitsui NNS mixed VBD mixed VBN Mixed VBN mixers NNS mixes VBZ mixing VBG mixtures NNS mix VB moaning VBG moans VBZ moan VB mobilized VBD mobilized VBN mobilize VB mobilizing VBG mocked VBN mocking VBG modeled VBD modeled VBN modeling VBG models NNS model VB modems NNS moderated VBN moderates NNS moderate VB moderate VBP moderating VBG modernized VBD modernized VBN modernize VB modernizing VBG modes NNS modifications NNS Modifications NNS modified VBN modifies VBZ modify VB modulate VBP moisturizers NNS molded VBN molds NNS molecules NNS mollified VBN mollify VB moments NNS Mondays NNS monetarists NNS moneymakers NNS monitored VBD MONITORED VBD monitored VBN monitoring VBG monitors NNS monitors VBZ monitor VB monitor VBP monkeys NNS monoliths NNS monologues NNS monopolies NNS monopolized VBD monopolized VBN monopolize VB monopolizing VBG months NNS Months NNS Moonies NNS moonlighting VBG moons NNS mopping VBG mop VB morals NNS mores NNS mornings NNS morsels NNS mortgaged VBN mortgages NNS mortgage VB Moslems NNS motels NNS mothers NNS Mothers NNS motifs NNS motions NNS motions VBZ motivated VBN motivate VB motivate VBP 
motivating VBG motives NNS motorcycles NNS motorists NNS motorized VBN motors NNS mounds NNS mountains NNS mounted VBD mounted VBN mounting VBG mounts VBZ mount VB mount VBP mourning VBG Mourning VBG mousetraps NNS mouthed VBD mouths NNS moved VBD MOVED VBD moved VBN movements NNS moves NNS Moves NNS moves VBZ MOVES VBZ move VB Move VB move VBP Move VBP movies NNS moving VBG Moving VBG mow VB mucked VBN muddied VBN muddled VBN muffled VBN muffs NNS mulling VBG mulls VBZ mull VB multimedia NNS multinationals NNS multipled VBD multiples NNS multiplied VBN multiplying VBG multiply VB multiply VBP mumbled VBN mummies NNS Munching VBG municipalities NNS municipals NNS Municipals NNS MUNICIPALS NNS muni NNS munis NNS murals NNS murdered VBN murderers NNS murdering VBG murders NNS murmuring VBG muscled VBD muscles NNS muscling VBG Muscovites NNS muses NNS muses VBZ museums NNS Museums NNS muse VB mushroomed VBD mushroomed VBN mushrooms NNS musicians NNS muster VB mutated VBN mutate VB mutations NNS muted VBN mute VB mutilated VBN Mutinies NNS mutters NNS mutts NNS muzzles VBZ 'm VBP mysteries NNS myths NNS nabbing VBG naggings NNS nagging VBG nags NNS nailed VBN nails NNS nail VB name-droppers NNS name-drops VBZ named VBD named VBN Named VBN nameplates NNS names NNS Names NNS names VBZ name VB name VBP naming VBG narcotics NNS narratives NNS narrowed VBD narrowed VBN narrowing VBG Narrowing VBG narrows VBZ narrow VB nationalists NNS nationalized VBD nationalized VBN nationals NNS nations NNS natives NNS natural-foods NNS natural-resources NNS navies NNS navigate VB naysayers NNS naysay VB Nazis NNS Neanderthals NNS neared VBD nearing VBG near-monopolies NNS nears VBZ necessitated VBD necessitated VBN necessities NNS necks NNS neckties NNS needed VBD needed VBN needed VBN|JJ needing VBG needs NNS Needs NNS needs VBZ need VB need VBP negated VBN negatives NNS neglected VBD neglected VBN neglecting VBG negotiated VBD negotiated VBN negotiates VBZ negotiate VB negotiate VBP negotiating VBG negotiations NNS Negotiations NNS negotiators NNS Negotiators NNS neighbhorhoods NNS neighborhoods NNS neighboring VBG neighbors NNS neighbours NNS neophytes NNS nerds NNS Nerds NNS nerves NNS Nestled VBN nests NNS nets NNS nets VBZ netted VBD netted VBN netting VBG net VB networking VBG networks NNS Networks NNS network VB neurologists NNS neutralized VBN neutralizes VBZ neutrons NNS newborns NNS newcasts NNS newcomers NNS newscasts NNS newsies NNS newsletters NNS News NNS newspapers NNS Newspapers NNS newsprints NNS newsstands NNS Newsstands NNS news-weeklies NNS nibbling VBG niches NNS Nicholas NNS nicked VBN nicknamed VBN nicknames NNS nightclubs NNS nightmares NNS nights NNS nine-months NNS nine-tenths NNS nixed VBD noblemen NNS nods VBZ no-loads NNS nominated VBD nominated VBN nominate VB nominations NNS nominees NNS non-clients NNS non-communists NNS nonconformists NNS non-economists NNS non-lawyers NNS Non-lawyers NNS nonoperating VBG Nonperformers NNS nonperforming VBG Nonperforming VBG nonrecurring VBG non-seamen NNS nonstops NNS noodles NNS normalize VB norms NNS Norms NNS Northrop VB nose-dived VBD nosedived VBD nose-dived VBN nosediving VBG noses NNS Nos. NNS notched VBD notch VB notebooks NNS noted VBD noted VBN noteholders NNS notes NNS Notes NNS NOTES NNS notes VBZ note VB Note VB NOTE VB note VBP noticed VBD noticed VBN notices NNS notice VB notice VBP noticing VBG notifications NNS notified VBD notified VBN notifies VBZ notifying VBG notify VB notify VBP noting VBG Noting VBG notions NNS No. 
VB novels NNS novelties NNS novitiates NNS Nov. VB nozzles NNS nuances NNS nuclear-arms NNS nudge VB nullified VBN nullify VB nullify VBP number-crunchers NNS numbered VBD numbered VBN numbering VBG numbers NNS nursed VBD nurseries NNS nurses NNS nursing VBG nurtured VBD nurtured VBN nurture VB nurturing VBG nuts NNS oaks NNS Oases NNS oats NNS obey VBP obfuscate VB objected VBD objected VBN objecting VBG objections NNS Objections NNS objectives NNS objects NNS object VB object VBP obligated VBD obligated VBN obligations NNS Obligations NNS obliged VBN obliged VBN|JJ obliges VBZ obliterated VBN obscured VBD obscures VBZ obscure VB observations NNS observed VBD observed VBN observers NNS Observers NNS observes VBZ observe VB observe VBP observing VBG Observing VBG obsessed VBN obsoleted VBN obsoleting VBG obstacles NNS obstructed VBN obstructing VBG obstruct VBP obtained VBD obtained VBN obtaining VBG obtain VB obtain VBP obviate VB occasions NNS occasion VB occupations NNS occupied VBD occupied VBN occupies VBZ occupying VBG occupy VB occurred VBD occurred VBN occurrences NNS occurring VBG occurs VBZ occur VB occur VBP oceans NNS octaves NNS octogenarians NNS oddballs NNS oddities NNS ODDITIES NNS odds NNS Odds NNS offenders NNS offending VBG offends VBZ offend VB offensives NNS offered VBD offered VBN Offered VBN OFFERED VBN offerings NNS offering VBG Offering VBG offers NNS offers VBZ offer VB Offer VB offer VBP off-hours NNS officals NNS officers NNS Officers NNS offices NNS Offices NNS office-supplies NNS officials NNS Officials NNS OFFICIALS NNS offi NNS offsets NNS offsetting VBG Offsetting VBG offset VB offset VBD offset VBN offset VBP offshoots NNS offspring NNS ogles VBZ ogling VBG oils NNS Oils NNS olds NNS old-timers NNS olefins NNS Olympics NNS omens NNS omissions NNS omits VBZ omitted VBD omitted VBN omit VB omit VBP oncogenes NNS Oncogenes NNS ones NNS onlookers NNS Onlookers NNS on-ramps NNS ooze VB oozing VBG opened VBD opened VBN openended VBN openers NNS openings NNS opening VBG opens VBZ open VB Open VB open VBP operas NNS operated VBD operated VBD|VBN operated VBN operates VBZ operate VB operate VBP operating VBG Operating VBG operations NNS operatives NNS operators NNS opining VBG opinion-makers NNS opinions NNS Opinions NNS opponents NNS Opponents NNS opportunists NNS opportunities NNS opposed VBD opposed VBN Opposed VBN opposes VBZ oppose VB oppose VBP opposing VBG opted VBD opted VBN optical-products NNS optimists NNS opting VBG options NNS Options NNS OPTIONS NNS option VBP opt VB opt VBP oranges NNS orchardists NNS orchards NNS orchestras NNS orchestrated VBD orchestrated VBN orchestrating VBG orchids NNS ordained VBN ordered VBD ordered VBN ORDERED VBN ordering VBG orders NNS Orders NNS orders VBZ order VB ordinances NNS organisms NNS organizations NNS Organizations NNS organized VBD organized VBN ORGANIZED VBN organizers NNS organizes VBZ organize VB organizing VBG organs NNS oriented VBN originated VBN originated VBP originates VBZ originate VB originating VBG originations NNS originators NNS origins NNS Orkem VB ornaments NNS orphaned VBN orphans NNS others NNS Others NNS ounces NNS ousted VBD ousted VBN OUSTED VBN ousting VBG oust VB outages NNS outbidding VBG outbid VB outbid VBP outbreaks NNS outcomes NNS outdated VBN outdid VBD outdistanced VBN outdone VBN outfielders NNS outfits NNS outfit VB outflank VB outflows NNS Outflows NNS outfly VB outgained VBD outgrew VBD outgrown VBN outings NNS outlanders NNS outlasted VBD outlast VB outlawed VBD outlawed VBN 
outlawing VBG outlays NNS Outlays NNS outleaped VBD outlets NNS outlined VBD outlined VBN outlines VBZ outline VB outlining VBG outlooks NNS outmoded VBN outnumbered VBD out-of-staters NNS outpaced VBD outpaced VBN outpaced VBP outpace VB outpace VBP outpacing VBG outperformed VBD outperformed VBN outperforming VBG outperforms VBZ outperform VB outperform VBP outposts NNS outraged VBN outranks VBZ outselling VBG outsells VBZ outsell VB outshines VBZ outshine VB outsiders NNS outskirts NNS out-smart VB outsold VBD outstripped VBD outstripped VBN outstripping VBG outstrips VBZ outstrip VB out-trade VB outweighed VBD outweighed VBN outweigh VB outweigh VBP ovens NNS over-allotments NNS overalls NNS overarching VBG overbid VB overbid VBD overbought VBN overburden VB overcame VBD overcharges NNS overcollateralized VBN overcomes VBZ overcome VB overcome VBN overcome VBP overcommitted VBN overdoing VBG overdone VBN overdosed VBN overdosing VBG overemphasize VB overflowing VBG overhanging VBG overhauled VBN overhauling VBG overhaul VB overheated VBN overheating VBG overlaid VBN overlapping VBG overlap VB overlap VBP overlays VBZ overlooked VBD overlooked VBN overlooking VBG overlooks VBZ overlook VB overlook VBP over-magazined VBN overpaid VBD overpaid VBN overpaying VBG overpay VB overplanted VBN overpower VB overpriced VBN overpurchase VB overreacted VBN overreacting VBG Overreacting VBG overreact VB overreact VBP override VB overriding VBG overrode VBD overruled VBD overruled VBN overrule VB overruling VBG overruns NNS oversaw VB oversaw VBD overseeing VBG overseen VBN overseers NNS oversees VBZ oversee VB overshadowed VBD overshadowed VBN overshadowing VBG oversimplified VBN oversold VB oversold VBN overstated VBD overstated VBN overstate VB overstating VBG overstrained VBN oversubscribed VBN overtaken VBN overtaxed VBN overthrowing VBG overthrown VBN overthrow VB overtures NNS overturned VBD overturned VBN overturning VBG overturn VB overused VBN overvalued VBD overvalued VBN overweighted VBN overwhelmed VBD overwhelmed VBN overwhelming VBG overwhelm VB overwhelm VBP overworking VBG owed VBD owed VBN owes VBZ owe VB owe VBP owing VBG owned VBD owned VBN owners NNS Owners NNS owning VBG Owning VBG owns VBZ own VB own VBN own VBP paced VBN pacemakers NNS pace VB pacified VBD packaged-goods NNS Packaged-goods NNS packaged VBN packages NNS Packages NNS packages VBZ package VB packaging VBG packed VBD packed VBN packets NNS packing VBG packs NNS packs VBZ pack VB pack VBP PACs NNS PACS NNS pacts NNS paddles NNS paeans NNS pages NNS paid VBD paid VBN Paid VBN paid VBN|JJ PaineWebber VB pains NNS painted VBD painted VBN painters NNS paintings NNS Paintings NNS painting VBG paints NNS paint VB paint VBP paired VBN pairs NNS pair VB palazzi NNS paled VB paled VBD pales VBZ pale VBP palms NNS palmtops NNS Palmtops NNS palm VB pals NNS pampers VBZ pamphlets NNS pancakes NNS pandering VBG panelists NNS panels NNS pangs NNS panicked VBD panicking VBG panics NNS panic VB panjandrums NNS panned VBD panned VBN panning VBG Pantages NNS panties NNS pants NNS pan VB paper-goods NNS PAPER NNS paper-products NNS papers NNS PAPERS NNS parachute VB parachuting VBG parades NNS parakeets NNS parallels NNS parallels VBZ parallel VB paralyzed VBN paralyzing VBG Paramedics NNS parameters NNS paraphernalia NNS paraphrase VBP parasites NNS parastatals NNS parcels NNS parcel VB parcel VBP parched VBN pardoned VBD pared VBD pared VBN parents NNS Parents NNS pare VB Paribas NNS parimutuels NNS paring VBG parishes NNS 
parishioners NNS parities NNS parked VBD parked VBN parking VBG parks NNS park VB Park VB parlors NNS parried VBD parry VB partake VB parted VBD participants NNS Participants NNS participated VBD participated VBN participates VBZ participate VB participate VBP participating VBG participations NNS particulars NNS parties NNS Parties NNS parties VBZ parting VBG partisans NNS Partisans NNS partnerships NNS Partnerships NNS partners NNS Partners NNS PARTNERS NNS parts NNS Parts NNS part VB pashas NNS passages NNS passed VBD passed VBN Passed VBN passenger-kilometers NNS passengers NNS passers-by NNS passes NNS passes VBZ passing VBG passions NNS passports NNS pass VB pass VBP passwords NNS pasted VBN pastels NNS pasteurized VBN pastimes NNS pastors NNS patched VBN patch VB patented VBD patented VBN patents NNS paths NNS patients NNS Patients NNS patrolled VBN patrols NNS patronized VBN patronize VB patronizing VBG patrons NNS patterned VBN patterns NNS Patterns NNS paused VBD pauses NNS pauses VBZ pause VB pause VBP pausing VBG paved VBD paved VBN paves VBZ pave VB pawing VBG pawning VBG pawns NNS payables NNS paychecks NNS payers NNS Payers NNS paying VBG Paying VBG payments NNS Payments NNS PAYMENTS NNS payoffs NNS payouts NNS Payouts NNS payrolls NNS pays VBZ PAYS VBZ pay VB Pay VB PAY VB pay VBP PCBs NNS PCs NNS peacemakers NNS peaches NNS peaked VBD peaked VBN peaking VBG peaks NNS peak VB pealing VBG peals NNS peanuts NNS pearls NNS pears NNS peasants NNS Peasants NNS peas NNS peccadilloes NNS pecks NNS peck VBP peculiarities NNS pedaled VBN pedaling VBG pedal VB peddled VBN peddles VBZ peddle VB peddle VBP peddling VBG pedestrians NNS Pedigrees NNS Peeking VBG peeled VBN peering VBG peers NNS peers VBZ peer VB peer VBP pegged VBD pegged VBN pegging VBG pegs VBZ peg VB peg VBP pellets NNS penalized VBN penalizes VBZ penalize VB penalties NNS pence NNS pencils NNS Pencils NNS PENCILS NNS Pencil VB pending VBG Pending VBG pending VBG|JJ penetrated VBN penetrate VB penetrating VBG penises NNS penned VBN pennies NNS pensions NNS pens NNS peopled VBN people NNS People NNS peoples NNS peppered VBD peppering VBG peppers NNS Pepsi NNS perceived VBD perceived VBN perceives VBZ perceive VBP percentages NNS perceptions NNS perched VBN perfected VBN performances NNS performed VBD performed VBN performers NNS performing-arts NNS performing VBG Performing VBG performs VBZ perform VB perform VBP perils NNS periodicals NNS periods NNS Periods NNS peripherals NNS perishables NNS perished VBN perked VBD perked VBN perks NNS permeated VBD permeating VBG permits NNS permits VBZ permitted VBD permitted VBN permitting VBG permit VB permit VBP perpetrated VBN perpetuates VBZ perpetuate VB perpetuate VBP perpetuating VBG persecuted VBN persecuting VBG persisted VBD persisting VBG persists VBZ persist VB persist VBP personalities NNS personalized VBN personalize VB personnel NNS Personnel NNS persons NNS persuaded VBD persuaded VBN persuades VBZ persuade VB persuading VBG Persuading VBG pertains VBZ perturbed VBD peruse VB peruse VBP pervaded VBD pervade VBP perversities NNS pesatas NNS pesetas NNS pesos NNS pessimists NNS Pestered VBN pesticides NNS petitioned VBD petitions NNS petition VB petrochemicals NNS pets NNS PETS NNS pharaohs NNS pharmaceuticals NNS Pharmaceuticals NNS pharmacies NNS pharmacists NNS phased VBD phased VBN phases NNS phase VB phasing VBG phenomena NNS Phillips NNS philosophers NNS philosophies NNS phobias NNS phoned VBD phoned VBN phones NNS phones VBZ phone VBP phoning VBG photocopiers 
NNS photocopying VBG photocopy VB photofinishers NNS photographed VBN photographers NNS photographing VBG photographs NNS photographs VBZ photos NNS phrases NNS physicians NNS physics NNS pianos NNS piasters NNS picked VBD picked VBN pickers NNS picking VBG Pickin VBG pickles NNS picks NNS picks VBZ pickups NNS pick VB Pick VB pick VBP pictured VBN pictures NNS Pictures NNS pictures VBZ picture VBP picturing VBG pieced VBN pieces NNS Pieces NNS piece VB Pierce VB piers NNS pies NNS piggybacking VBG Piggybacking VBG piglets NNS pigments NNS pigs NNS piled VBD piled VBN piles NNS pile VB pile VBP pilings NNS piling VBG PILING VBG pillars NNS pilloried VBN pillorying VBG pillowcases NNS pillows NNS pills NNS pilots NNS Pilots NNS pimps NNS pinched VBD pinched VBN pinching VBG pinch VB pine VBP pinging VBG ping VB pinned VBN pinning VBG pin-pointed VBN pinpointed VBN pinpoint VB pins NNS pins VBZ pints NNS pin VB pioneered VBD pioneers NNS pioneer VB piped VBD piped VBN pipelines NNS pipes NNS pirated VBN pirates NNS piroghi NNS pistils NNS pistols NNS pistons NNS pitched VBD pitched VBN pitchers NNS pitches NNS pitches VBZ pitching VBG pitchmen NNS pitch VB pitch VBP pitfalls NNS Pitfalls NNS pits NNS pits VBZ pitted VBD pitted VBN pitting VBG pit VB pivot VB pizzas-with-everything NNS pizzerias NNS placated VBN placate VB placed VBD placed VBN placements NNS places NNS places VBZ place VB Place VB place VBP placing VBG plagued VBD plagued VBN plague VB plaguing VBG plainclothes NNS plains NNS Plains NNS plaintiffs NNS Plaintiffs NNS planes NNS planets NNS planks NNS planned VBD planned VBN planners NNS Planners NNS planning VBG plans NNS Plans NNS plans VBP plans VBZ Plans VBZ PLANS VBZ plantations NNS planted VBD planted VBN planting VBG plants NNS PLANTS NNS plant VB plan VB plan VBP plastics NNS Plastics NNS plates NNS platforms NNS platitudes NNS plaudits NNS played VBD played VBN players NNS Players NNS playgrounds NNS playing VBG Playing VBG playoffs NNS plays NNS plays VBZ Plays VBZ play VB play VBP pleaded VBD pleaded VBN pleadings NNS pleading VBG plead VB pleasantries NNS pleased VBD pleased VBN pleases VBZ please VB Please VB please VBP pleasing VBG pleas NNS pleasures NNS plea VB pledged VBD pledged VBN pledges NNS pledges VBZ pledging VBG pliers NNS plies VBZ plights NNS plods VBZ plots NNS plotted VBD plotters NNS plotting VBG plot VB plowed VBD plowed VBN plows VBZ plow VB ploys NNS Ploys NNS plucked VBN pluck VB plugged VBD plugging VBG Plugging VBG plug VB plug VBP plummeted VBD plummeted VBN plummeting VBG plummet VB plummet VBP plunged VBD plunged VBN plunges NNS plunges VBZ plunge VB plunge VBP plunging VBG plunking VBG pluses NNS Pluses NNS plying VBG Poachers NNS poaching VBG pocketing VBG pockets NNS Pockets NNS pockets VBZ pocket VB pockmarked VBN pointed VBD pointed VBN pointers NNS pointing VBG points NNS points VBZ point VB point VBP poised VBN poisoned VBN poisons NNS poked VBD pokes VBZ poking VBG polarized VBN poles NNS Poles NNS police NNS Police NNS polices NNS police VB policies NNS Policies NNS policing VBG policyholders NNS policy-makers NNS policy-making VBG polished VBD polished VBN polishing VBG|NN polish VB politicians NNS Politicians NNS politicized VBN politico-plaintiffs NNS politics NNS Politics NNS POLITICS NNS polled VBD polled VBN pollen-producing VBG pollinated VBN pollinate VB pollinate VBP pollinating VBG polls NNS Polls NNS pollsters NNS polls VBZ pollutants NNS polluters NNS pollute VB polluting VBG poll VB pols NNS poltergeists NNS polymers 
NNS polyols NNS polyps NNS polyrhythms NNS pondering VBG ponder VB ponder VBP ponds NNS ponied VBD ponies NNS pontificate VBP ponying VBG pooled VBN pooling VBG pools NNS pool VBP popping VBG pops VBZ popularized VBD popularized VBN popularize VB populated VBN populate VB populating VBG populations NNS pop VB pop VBP porcelains NNS porches NNS pored VBD pored VBN pores VBZ pork-barrelers NNS portables NNS portends VBZ portend VB portfolios NNS Portfolios NNS PORTING VBG portions NNS portraits NNS portrayals NNS portrayed VBD portrayed VBN portraying VBG portrays VBZ portray VB portray VBP ports NNS Ports NNS posed VBD posed VBN poses VBZ pose VB pose VBP posing VBG positioned VBD positioned VBN positions NNS positions VBZ position VB possessed VBD possessed VBN possesses VBZ possessing VBG possessions NNS possess VB possess VBP possibilities NNS postcards NNS posted VBD Posted VBD posted VBN Posted VBN posters NNS postings NNS posting VBG postmarked VBN postmarks NNS postponed VBD postponed VBN postpone VB postpone VBP postponing VBG posts NNS posts VBZ Posts VBZ posturing VBG post VB POTABLES NNS potatoes NNS potentates NNS potentialities NNS potholes NNS pots NNS pot VB pouches NNS pounce VB pounded VBN pounding VBG pounds NNS pound VB poured VBD poured VBN pouring VBG pours VBZ pour VB pour VBP powders NNS powered VBN powerhouses NNS powers NNS power VB practiced VBD practiced VBN practices NNS practice VB practicing VBG practitioners NNS pragmatists NNS prairies NNS praised VBD praised VBN praises NNS praises VBZ praise VB praise VBP praising VBG prancing VBG prayers NNS praying VBG preaching VBG preach VB preach VBP pre-approved VBN preapproved VBN prearranged VBN precautions NNS preceded VBN precedents NNS precedes VBZ precede VB preceding VBG precincts NNS precious-metals NNS Precious-metals NNS precipices NNS precipitated VBD precipitating VBG precluded VBN precludes VBZ preclude VB predates VBZ predators NNS predecessors NNS predetermined VBN predicated VBN predict\/advocate VBP predicted VBD predicted VBN predicting VBG Predicting VBG predictions NNS Predictions NNS predicts VBZ predict VB predict VBP predispose VB pre-empted VBD pre-empt VB preferences NNS preferred VBD preferred VBG preferred VBN preferring VBG prefers VBZ prefer VB prefer VBP prejudiced VBN prejudices NNS premiered VBD premieres NNS premiere VB premiering VBG premises NNS premiums NNS preoccupied VBN prepaid VB prepaid VBN preparations NNS preparatives NNS prepared VBD prepared VBN preparers NNS prepares VBZ prepare VB prepare VBP preparing VBG prepaying VBG prepayments NNS Prepayments NNS prepay VB prepping VBG pre-registered VBN pre-register VB prerogatives NNS presages VBZ presage VB preschoolers NNS prescribed VBN prescribes VBZ prescribe VB prescribe VBP prescriptions NNS presentations NNS presented VBD presented VBN presenters NNS presenting VBG presents NNS presents VBZ present VB Present VB present VBP preserved VBD preserved VBN preserves VBZ preserve VB preserving VBG presided VBD presided VBN presidents NNS Presidents NNS presides VBZ presiding VBG pre-signed VBN pressed VBD pressed VBN Pressed VBN presses NNS pressing VBG pressured VBD pressured VBN pressures NNS Pressures NNS pressure VB pressure VBP pressuring VBG press VB Press VB press VBP presumed VBN presumes VBZ presuming VBG pretending VBG pretend VB Pretend VB pretensions NNS pre-tested VBN pre-try VB prevailed VBD prevailed VBN prevailing VBG prevails VBZ prevail VB prevail VBP prevented VBD prevented VBN Prevented VBN preventing VBG 
prevents VBZ prevent VB Prevent VB prevent VBP previewing VBG previews NNS prey VBP priced VBD priced VBN price-earnings NNS price\/earnings NNS prices NNS Prices NNS PRICES NNS prices VBZ price VB price VBP pricings NNS pricing VBG pricked VBN priests NNS primed VBN primitives NNS princes NNS principals NNS principles NNS printed VBD printed VBN printers NNS printing VBG printing VBG|NN printouts NNS prints NNS prints VBZ print VB print VBP priorities NNS prisoners NNS prisons NNS privatized VBN privatize VB privatizing VBG privileges NNS prized VBN prizes NNS probabilities NNS probes NNS probe VB probe VBP probing VBG Probing VBG problematics NNS problems NNS procedures NNS proceeded VBD proceeded VBN proceedings NNS PROCEEDINGS NNS proceeding VBG proceeds NNS Proceeds NNS proceeds VBZ proceed VB proceed VBP processed VBD processed VBN processes NNS processes VBZ processing VBG Processing VBG processors NNS process VB process VBP proclaimed VBD proclaiming VBG proclaims VBZ proclaim VB proclaim VBP proclamations NNS procure VB prodded VBN prodding VBG prods VBZ produced VBD produced VBN producers NNS Producers NNS produces VBZ produce VB Produce VB produce VBP producing VBG productions NNS products NNS PRODUCTS NNS prod VB professed VBD professed VBN professes VBZ professionals NNS Professionals NNS professions NNS professors NNS profess VBP proffered VBD proffered VBN profferred VBN profiled VBN profiles NNS profile VB profited VBD profited VBN profiteering VBG profiteers NNS profiting VBG profit-sharing NNS profits NNS Profits NNS PROFITS NNS profits VBZ profit-taking NNS profitting VBG profit VB profit VBP progenitors NNS prognosticators NNS programmed VBN programmers NNS programming VBG programs NNS Programs NNS program VB program VBP progressed VBD progressed VBN progresses VBZ progressing VBG progressions NNS progressives NNS progress VB prohibited VBD prohibited VBN prohibiting VBG prohibitions NNS prohibits VBZ prohibit VB prohibit VBP projected VBD projected VBN Projected VBN projecting VBG Projecting VBG projections NNS projectors NNS projects NNS Projects NNS projects VBZ project VB project VBP proliferated VBN proliferate VBP proliferating VBG prolonged VBN prolong VB promised VBD promised VBN promises NNS promises VBZ Promises VBZ promise VB promise VBP promising VBG promoted VBD promoted VBN promoters NNS promotes VBZ promote VB promote VBP promoting VBG promotions NNS prompted VBD prompted VBN Prompted VBN prompting VBG prompts VBZ prompt VB promulgated VBD prongs NNS pronounced VBD pronounced VBN pronouncements NNS pronounces VBZ proof-of-purchases NNS proofreading VBG propagandists NNS propagandizes VBZ propagandize VB propelled VBD propelled VBN propelling VBG propel VB properties NNS proponents NNS Proponents NNS proportions NNS proposals NNS Proposals NNS PROPOSALS NNS proposed VBD proposed VBN proposes VBZ propose VB propose VBP proposing VBG propositions NNS propped VBD propped VBN propping VBG proprietorships NNS proprietors NNS propsed VBN props NNS prop VB proscribed VBN proscribes VBZ prosecuted VBD prosecuted VBN prosecute VB prosecuting VBG prosecutions NNS prosecutors NNS Prosecutors NNS PROSECUTORS NNS pros NNS Pros NNS prospects NNS Prospects NNS PROSPECTS NNS prospectuses NNS prospered VBN prosper VB prostitutes NNS protected VBD protected VBN protecting VBG protections NNS protectors NNS protects VBZ protect VB protect VBP proteges NNS proteins NNS proteins VBZ protested VBD protested VBN protesters NNS protesting VBG protestors NNS protests NNS protests 
VBZ protest VB protocols NNS prototypes NNS proved VBD proved VBN proven VBN Proverbs NNS proves VBZ Proves VBZ prove VB prove VBP provided VBD provided VBN Provided VBN providers NNS provides VBZ provide VB Provide VB provide VBP Provide VBP providing VBG provinces NNS proving VBG provisioning VBG provisions NNS provoked VBD provoked VBN provoke VB provoking VBG proxies NNS pruned VBN prune VB pseudo-lobbyists NNS psychics NNS psychologists NNS Psychologists NNS publications NNS publicized VBN publicize VB public-relations NNS public-works NNS Public-works NNS published VBD published VBN Published VBN publishers NNS Publishers NNS publishes VBZ publishing VBG publish VB pubs NNS puffers NNS pull-backs NNS pullbacks NNS pulled VBD pulled VBN pulling VBG pullouts NNS pulls NNS pulls VBZ pull VB Pull VB pull VBP pulverizing VBG pummeled VBD pummeled VBN pummel VB pumped VBD pumped VBN pumping VBG pumps NNS pump VB pump VBP punched VBD punched VBN punchers NNS punching VBG Punching VBG punch VB punch VBP pundits NNS punished VBN punishing VBG punish VB puns NNS punts NNS pupils NNS puppets NNS puppies NNS purchased VBD purchased VBN purchasers NNS purchases NNS purchases VBZ purchase VB purchase VBP purchasing VBG purchasing VBG|NN purged VBD purged VBN purges VBZ purge VB purging VBG purists NNS pur-poises NNS purports VBZ purport VBP purposes NNS purrs VBZ purse-snatchings NNS purses NNS pursued VBD pursued VBN pursuers NNS pursues VBZ pursue VB pursue VBP pursuing VBG pursuits NNS pushed VBD pushed VBN pushers NNS pushes VBZ pushing VBG Pushing VBG push VB push VBP Push VBP puts NNS puts VBZ Puts VBZ putting VBG Putting VBG put VB Put VB put VBD Put VBD put VBN Put VBN put VBP put VBP|VB puzzled VBD puzzled VBN puzzles NNS puzzle VB pyramiding VBG pyramids NNS quacks NNS quacks VBZ quadrupeds NNS quadrupled VBD quadrupled VBN quadruples VBZ quadrupling VBG quakes NNS qualifications NNS qualified VBD qualified VBN qualifies VBZ qualifying VBG qualify VB qualify VBP qualities NNS qualms NNS quantified VBN quantify VB quantities NNS quarreling VBG quarrel VB quarterbacks NNS quarters NNS quartets NNS quashed VBD quashed VBN quashing VBG queers NNS quell VB queried VBN queries NNS queries VBZ questioned VBD questioned VBN Questioned VBN questioning VBG questions NNS Questions NNS questions VBZ question VB question VBP queues NNS queuing VBG quibbling VBG quicken VB quieted VBD quieted VBN quieting VBG quiet VB quipped VBD quips NNS quips VBZ Quips VBZ quirks NNS quits VBZ quitting VBG quit VB quit VBD quit VBN quit VBP quivers NNS quiz VB quota-cheaters NNS quotas NNS quotations NNS quoted VBD quoted VBN quotes NNS quotes VBZ quote VB quote VBP quoting VBG Quoting VBG raced VBD racehorses NNS races NNS racetracks NNS racing VBG Racing VBG racked VBD racked VBN racketeering VBG rackets NNS racking VBG racks NNS radar-eluding VBG radicals NNS radioing VBG radios NNS rafters NNS raged VBD raged VBN rages NNS rage VB raging VBG raided VBD raided VBN raiders NNS raiding VBG raids NNS raid VB railbikes NNS railcars NNS railings NNS railroads NNS rails NNS rails VBZ railways NNS rained VBD rains NNS rain VB raised VBD raised VBN raisers NNS raises NNS raises VBZ raise VB Raise VB raise VBP raising VBG Raising VBG raked VBD raking VBG rallied VBD RALLIED VBD rallied VBN rallies NNS rallies VBZ rallying VBG rally VB Rally VB rally VBP rambled VBD ramifications NNS rammed VBD ramparts NNS ramps NNS ramp VB ranchers NNS ranches NNS rand NNS ranged VBD ranged VBN rangers NNS ranges NNS ranges VBZ range VB 
range VBP ranging VBG rang VBD ranked VBD ranked VBN rankings NNS ranking VBG rankled VBN ranks NNS ranks VBZ rank VB rank VBP ran VBD raped VBD raped VBN rapeseeds NNS raping VBG rapists NNS raptors NNS rarefied VBN ratcheting VBG rated VBD rated VBN Rated VBN ratepayers NNS rates NNS Rates NNS RATES NNS rates VBZ rate VB rate VBP ratified VBD ratified VBN ratifying VBG ratify VB ratings NNS Ratings NNS rating VBG rationalizations NNS rationalize VB rationalizing VBG rationed VBN ratios NNS RATIOS NNS rats NNS rattled VBD RATTLED VBD rattled VBN rattle VB rattling VBG rat VB ravaged VBN ravages NNS RAVAGES NNS raves VBZ rave VB ravines NNS raw-materials NNS rays NNS razed VBN razing VBG reached VBD reached VBN Reached VBN reaches NNS reaches VBZ reaching VBG Reaching VBG reach VB reach VBP reacted VBD reacted VBN reacting VBG reactions NNS reactivated VBD reactivated VBN reactors NNS reacts VBZ react VB react VBP readers NNS Readers NNS readied VBD readings NNS reading VBG Reading VBG readmit VB reads VBZ read VB Read VB read VBD read VBD|VBP read VBN read VBP read VBP|VBD reaffirmed VBD reaffirming VBG reaffirms VBZ reaffirm VB Reaganauts NNS realestate VB realigned VBD realigning VBG realignments NNS realign VB realists NNS realities NNS realized VBD realized VBN realizes VBZ realize VB realize VBP realizing VBG reallocated VBN reallocate VB realms NNS reams NNS reaped VBD reaped VBN reaping VBG reappointed VBN reapportion VBP reappraised VBD reappraised VBN reappraise VB reap VB REAP VBP Rearding VBG reared VBN rearing VBG rearm VB rearranges VBZ rearrange VB reasoned VBD reasoned VBN reasons NNS Reasons NNS reasons VBZ reason VB reason VBP reasserting VBG reasserts VBZ reassert VB reassessing VBG reassess VB reassigned VBD reassigned VBN reassignments NNS reassume VB reassurances NNS reassured VBD reassured VBN reassure VB reassuring VBG reauthorize VB reawakening VBG rebates NNS rebelled VBD rebels NNS rebounded VB rebounded VBD rebounded VBN rebounding VBG Rebounding VBG rebounds NNS rebounds VBZ rebound VB rebuffed VBD rebuffed VBN rebuilding VBG Rebuilding VBG rebuild VB rebuilt VBN rebuked VBD rebuts VBZ rebutted VBN rebut VB recalculated VBD recalculating VBG recalculations NNS recalled VBD recalled VBN recalling VBG recalls NNS recalls VBZ Recalls VBZ recall VB Recall VB recall VBP recanted VBD recanted VBN recapitalizations NNS recapitalized VBN recapture VB recede VBP receding VBG receipts NNS receivables NNS received VBD received VBN receivers NNS receives VBZ receive VB receive VBP receiving VBG recentralized VBN receptionists NNS receptors NNS recessed VBN recessions NNS recharging VBG recipes NNS recipients NNS recites VBZ reciting VBG reckoned VBN reckoning VBG reckons VBZ reckon VB reckon VBP reclaimed VBN reclaiming VBG reclaims VBZ reclaim VB reclaim VBP reclassified VBD reclining VBG recognized VBD recognized VBN recognizes VBZ recognize VB recognize VBP recognizing VBG recommendations NNS recommendatons NNS recommended VBD recommended VBN recommending VBG recommends VBZ recommend VB recommend VBP reconciles VBZ reconcile VB reconciling VBG reconnect VB reconsidered VBN reconsider VB reconstructed VBD reconstructed VBN reconstructing VBG reconstruct VB recorded VBD recorded VBN recorders NNS recordings NNS recording VBG records NNS RECORDS NNS records VBZ record VB Record VB recounted VBD recounted VBN recounting VBG recounts VBZ recouped VBD recouped VBN recoup VB recovered VBD recovered VBN recoveries NNS recovering VBG Recovering VBG recovers VBZ recover VB 
recover VBP recraft VB re-creactions NNS recreate VB Re-creating VBG re-creations NNS recruited VBD recruited VBN Recruited VBN recruiting VBG recruits NNS recruits VBZ recruit VB rectangles NNS rectified VBN rectifying VBG recuperate VB recurring VBG recused VBN recycled VBN recycles VBZ recycle VB recycling VBG reddened VB redeemed VBN redeeming VBG redeem VB redefined VBD redefine VB redefining VBG redemptions NNS redeploy VB redesigned VBD redesigned VBN redesigning VBG redesign VB redevelop VB red-flag VB redial VB redirected VBN rediscover VB redistribute VB redistributing VBG redlining VBG redoing VBG redoubling VBG redound VB redo VB redraw VB redress VB reds NNS Reds NNS reduced VBD reduced VBN reduces VBZ reduce VB reduce VBP reducing VBG Reducing VBG reductions NNS reefs NNS re-elected VBD re-elected VBN reeled VBD reeled VBN reeling VBG re-emerge VB re-emphasize VBP re-enacting VBG re-enactments NNS Re-enactments NNS re-entered VBD re-entering VBG re-enter VB re-establishing VBG re-establish VB reestablish VB re-evaluate VB re-evaluate VBP re-evaluating VBG re-examine VB reexamining VBG re-exports NNS refashioning VBG referees NNS references NNS referrals NNS referred VBD referred VBN referring VBG Referring VBG refers VBZ refer VB refer VBP refile VB refinanced VBD refinanced VBN refinance VB refinancing VBG refined VBN refineries NNS refiners NNS Refiners NNS refine VB refining VBG refitting VBG reflected VBD reflected VBN reflecting VBG Reflecting VBG reflects VBZ reflect VB reflect VBP refocused VBD refocused VBN refocuses VB refocusing VBG refocus VB reformed VBN reformers NNS reforming VBG reforms NNS reformulated VBN reform VB refrained VBN refrain VB refreshing VBG refrigerators NNS refueling VBG refugees NNS refunded VBN refunding VBG refunding VBG|NN refunds NNS refund VB refurbished VBN refurbishing VBG refurbish VB refused VBD refused VBN refusers NNS refuses VBZ refuse VB refuse VBP refusing VBG refuted VBD refute VB regained VBD regained VBN regaining VBG regains VBZ Regains VBZ regain VB regain VBP regarded VBD regarded VBN Regarded VBN regarding VBG Regarding VBG regards VBZ regard VB regard VBP regenerate VB regions NNS registered VBD registered VBN registering VBG registers NNS register VB register VBP registrants NNS registrations NNS regrets NNS regrets VBZ regretted VBD regretted VBN regret VB regret VBP regroup VB regroup VBP regulated VBN regulates VBZ regulate VB regulating VBG regulations NNS REGULATIONS NNS regulators NNS Regulators NNS regulators VBZ regummed VBD rehabilitated VBN rehabilitate VB rehashing VBG reigned VBD reigning VBG reignited VBD reignited VBN reignite VB reigniting VBG reimbursed VBD reimbursed VBN reimbursements NNS reimburses VBZ reimburse VB reimburse VBP reimpose VB reincorporated VBN reincorporating VBG reindicting VBG reinforced VBD reinforced VBN reinforcements NNS reinforces VBZ reinforce VB reinforce VBP reinforcing VBG Reinforcing VBG reining VBG reins NNS reinstalled VBN reinstated VBD reinstated VBN reinstate VB reinstating VBG reinstituting VBG reinsurers NNS reintegrated VBN reintroduced VBN rein VB reinvented VBD reinvent VB reinvested VBD reinvested VBN reinvesting VBG reinvest VB reinvest VBP reinvigorated VBN reinvigorate VB reinvigorating VBG reiterated VBD reiterates VBZ reiterating VBG REITs NNS rejected VBD rejected VBN rejecting VBG rejections NNS rejects VBZ reject VB reject VBP rejoice VBP rejoined VBD rejoining VBG Rejoins VBZ rejoin VB rejuvenate VB rekindled VBN Rekindled VBN rekindle VB rekindling VBG 
relabeling VBG related VBD related VBN Related VBN relates VBZ relate VB relate VBP relating VBG relationships NNS relations NNS Relations NNS relatives NNS relaunched VBN relaunch VB relaxed VBD relaxed VBN relaxing VBG relax VB relax VBP relayed VBD released VBD released VBN releases NNS releases VBZ release VB Release VB release VBP releasing VBG relegated VBN relented VBD relenting VBG relent VBP relics NNS relied VB relied VBD relied VBN relies VBZ relieved VBD relieved VBN relieve VB religions NNS relinquished VBD relinquished VBN relinquishing VBG relinquish VB relished VBD relishes VBZ relish VB relive VBP relocated VBD relocated VBN relocate VB relocating VBG relocations NNS relying VBG Relying VBG rely VB rely VBP remade VB remained VBD remained VBN remaining VBG Remaining VBG remains NNS Remains NNS remains VBZ remain VB remain VBP remake VB remanded VBD remarked VBD remarked VBN Remarketers NNS remarketings NNS remarks NNS remarks VBZ remark VB remedied VBN remedies NNS remedy VB remembered VBD remembered VBN remembering VBG remembers VBZ remember VB Remember VB remember VBP Remics NNS REMICs NNS reminded VBD reminded VBN reminders NNS reminding VBG reminds VBZ remind VB remind VBP remittances NNS remnants NNS remodeled VBN remodeling VBG remora NNS removed VBD removed VBN Removed VBN removes VBZ remove VB removing VBG remunerated VBN renamed VBD renamed VBN rename VB rendered VBD rendered VBN renderings NNS rendering VBG render VB rendezvoused VBD renege VB reneging VBG renegotiated VBN renegotiate VB renegotiating VBG renewals NNS renewed VBD renewed VBN Renewed VBN renewing VBG renews VBZ renew VB renounced VBD renounce VB renouncing VBG renovated VBN renovate VB renovating VBG renowned VBN rentals NNS rented VBN renters NNS renting VBG rents NNS Rents NNS rents VBZ rent VB reoffered VBN reopened VBD reopened VBN reopening VBG reopens VBZ reopen VB reorganized VBN reorganizes VBZ reorganize VB reoriented VBN repackaged VBN repackage VB repackaging VBG repaid VBD repaid VBN repainted VBN repaired VBD repaired VBN repairing VBG repairs NNS repairs VBZ repair VB repair VBP reparations NNS repassed VBN repatriate VB repaying VBG repayments NNS repay VB repealed VBN repeals VBZ repeal VB repeated VBD repeated VBN repeaters NNS repeating VBG repeats NNS repeats VBZ repeat VB repel VB repel VBP repercussions NNS replaced VBD replaced VBN replacements NNS replaces VBZ replace VB replace VBP replacing VBG Replacing VBG replaster VB replays NNS replenished VBN replenish VB replicated VBN replicate VB replicating VBG replied VBD Replied VBD replied VBN replies NNS replies VBZ reply VB repond VB reported VBD reported VBN reporters NNS reporting VBG reports NNS Reports NNS REPORTS NNS reports VBZ Reports VBZ report VB Report VB report VBP reposed VBN reposition VB repositories NNS repossess VB representations NNS representatives NNS Representatives NNS represented VBD represented VBN representing VBG representives NNS represents VBZ represent VB represent VBP repressed VBN repressing VBG repriced VBN reprinted VBN Reprinted VBN reprints VBZ reprint VB reprisals NNS reproduced VBD Reproduced VBN reproduce VB reprove VB reps NNS Republicans NNS republics NNS repudiate VB repurchased VBD repurchased VBN repurchases NNS repurchase VB repurchase VBD repurchase VBN repurchasing VBG reputations NNS reputed VBN requested VBD requested VBN requesting VBG requests NNS request VB request VBP required VBD required VBN requirements NNS requires VBZ require VB Require VB require VBP requiring VBG 
requisitioned VBD rerouted VBN rerouting VBG reruns NNS resales NNS rescheduled VBD rescheduled VBN reschedule VB rescinded VBD rescinding VBG rescind VB rescissions NNS rescued VBD rescuers NNS rescues VBZ rescue VB researched VBN researchers NNS Researchers NNS RESEARCHERS NNS researches VBZ researching VBG research VB resellers NNS reselling VBG resells VBZ resell VB resell VBP resemblances NNS resembles VBZ resemble VB resemble VBP resembling VBG resented VBD resent VB resent VBP reservations NNS reserved VBD reserved VBN Reserved VBN reserves NNS Reserves NNS reserves VBZ reserve VB reserve VBP reserving VBG reservoirs NNS reset VB reshaped VBD reshaped VBN reshape VB reshaping VBG reshuffled VBD reshuffle VB reshufflings NNS reshuffling VBG resided VBN residences NNS residents NNS Residents NNS resides VBZ reside VB residing VBG residues NNS resignations NNS RESIGNATIONS NNS resigned VBD RESIGNED VBD resigned VBN resigning VBG resign VB resins NNS resisted VBD resisted VBN resisting VBG resists VBZ resist VB Resist VB resist VBP resold VB resold VBN resold VBP resolutions NNS resolved VBD resolved VBN resolve VB resolving VBG resonated VBD resonates VBZ resonate VB resorts NNS resorts VBZ resort VB resort VBP resources NNS respected VBD respected VBN respects NNS respects VBZ respect VB responded VBD responded VBN respondents NNS responding VBG Responding VBG responds VBZ respond VB respond VBP responses NNS Responses NNS responsibilities NNS restarted VBD restarted VBN restarters NNS restarting VBG restart VB restated VBD restated VBN restate VB restating VBG restaurants NNS Restaurants NNS rested VBD restored VBD restored VBN restore VB restoring VBG restrained VBD restrained VBN restraining VBG restraints NNS restrain VB restricted VBD restricted VBN restricting VBG restrictions NNS restricts VBZ restrict VB Restrict VB restrict VBP restructured VBD restructured VBN restructures VBZ restructure VB restructure VBP restructurings NNS restructuring VBG rests VBZ rest VB Rest VB rest VB|NN rest VBP restyled VBN resubmit VB resulted VBD resulted VBN resulting VBG resulting VBG|JJ results NNS Results NNS results VBZ result VB result VBP resumed VBD resumed VBN resumes NNS resumes VBZ resume VB resume VBP resuming VBG re-supplied VBN resurfaced VBD resurfaced VBN resurging VBG resurrected VBD resurrected VBN resurrects VBZ resurrect VB resurrect VBP resuscitate VB resuscitating VBG retailers NNS Retailers NNS retailing VBG retail-sales NNS retails VBZ retail VB retail VBP retained VBD retained VBN retaining VBG retains VBZ retain VB retain VBP retaking VBG retaliating VBG retardants NNS retard VB rethinking VBG rethink VB retired VBD retired VBN Retired VBN retirees NNS retirements NNS retires VBZ retire VB retire VBP retiring VBG retooling VBG retools VBZ retorts NNS retorts VBZ retraced VBD retracted VBD retraining VBG retreated VBD retreated VBN retreating VBG retreats NNS retreat VB retrench VBP retrieved VBD retrieved VBN retrieve VB retrieve VBP retrofit VB retry VB returned VBD returned VBN returning VBG returns NNS Returns NNS returns VBZ return VB return VBP reunions NNS reunited VBN reunite VB reused VBN revalued VBN revamped VBD revamped VBN revamping VBG revamp VB 're VBP revealed VBD revealed VBN revealing VBG reveals VBZ Reveals VBZ reveal VB reveal VBP revelations NNS revelers NNS reveling VBG revels NNS revel VBP revenues NNS Revenues NNS reverberated VBN reverberate VB reverberating VBG reverberations NNS reversals NNS reversed VBD reversed VBN reverses VBZ reverse VB 
Reverse VBP reversing VBG Reversing VBG reverted VBN reverts VBZ reviewed VBD reviewed VBN reviewing VBG Reviewing VBG reviews NNS reviews VBZ review VB review VBP revised VBD revised VBN Revised VBN REVISED VBN revise VB revising VBG Revising VBG revisionists NNS revisions NNS revisited VBN revisits VBZ revisit VB Revitalized VBN revitalizing VBG revivals NNS Revivals NNS revived VBD revived VBN revive VB revive VBP reviving VBG revoked VBN revoke VB revoking VBG revolutionaries NNS revolutionized VBD revolutionize VB revolves VBZ revolve VB revolving VBG revved VBN rewarded VBN rewarding VBG rewards NNS Rewards NNS rewards VBZ reward VB Reward VB reworked VBN rewrite VB rewriting VBG rewritten VBN rhymed VBD rhymes VBZ rhyming VBG ribbies NNS ribbons NNS ribs NNS Ricans NNS riches NNS riders NNS rides NNS ride VB ride VBP ridges NNS ridiculed VBN ridicules VBZ riding VBG rid VB RID VB rid VBD rid VBN rid VBN|JJ riffing VBG rifles NNS rigged VBD rigged VBN righted VBN rights NNS Rights NNS RIGHTS NNS right-to-lifers NNS right-wingers NNS rigors NNS rigs NNS riles VBZ rile VBP rim VBP ringers NNS Ringers NNS ringing VBG rings NNS rings VBZ ring VB ring VBP riots NNS ripens VBZ ripen VBP ripoffs NNS ripped VBD ripped VBN Ripples NNS ripple VBP rippling VBG risen VBN rises NNS rises VBZ rise VB rise VBP rising VBG Rising VBG risked VBD risking VBG risks NNS Risks NNS risks VBZ risk VB risk VBP rites NNS rituals NNS rivaling VBG rivalries NNS rivals NNS Rivals NNS rivals VBZ rival VB rivers NNS riveted VBD riveted VBN rivets NNS rivets VBZ roadblocks NNS roads NNS Roads NNS roadways NNS roamed VBD roam VBP roaring VBG roars VBZ robbed VBN robberies NNS robbers NNS Robbers NNS robbing VBG robes NNS robots NNS rocked VBD rocked VBN rockers NNS rocketed VBD rocketing VBG rockets NNS rocking VBG rocks NNS rock VB rodents NNS rode VBD RODE VBD rods NNS rogues NNS roiling VBG roil VB roles NNS rollbacks NNS rolled VBD rolled VBN Rolled VBN rollers NNS rolling VBG rollovers NNS rolls NNS rolls VBZ roll VB roll VBP romancing VBG romanticized VBN romps NNS romp VBP roofers NNS roofs NNS rooftops NNS Roommates NNS rooms NNS roost VB rooted VBN rooters NNS rooting VBG roots NNS root VB root VBP ropes NNS rosarians NNS roses NNS rose VBD rose VBP rotated VBD rotate VB rotating VBG rotted VBN rotting VBG roughed VBD roughnecks NNS rounded VBD rounded VBN rounds NNS round VB round VBP roustabouts NNS routed VBN routes NNS routes VBZ routines NNS routing VBG rowed VBD rows NNS row VB royalties NNS rubbed VBD rubber-necking VBG rubber-stamp VB rubdowns NNS rubfests NNS Rubins NNS rubles NNS rubs NNS rubs VBZ rub VB rub VBP ruffled VBN Ruffled VBN rugs NNS ruined VBN Ruined VBN ruining VBG ruins NNS ruin VB ruin VBP ruled VBD ruled VBN rulers NNS rules NNS Rules NNS RULES NNS rules VBZ rule VB rule VBP rulings NNS ruling VBG rumbled VBD rumbles VBZ rumble VBP rumblings NNS rumbling VBG ruminated VBD ruminated VBN ruminations NNS rumored VBD rumored VBN rumors NNS Rumors NNS rumors VBZ rung VBN run-ins NNS runners NNS runners-up NNS running VBG Running VBG runs NNS runs VBZ runups NNS run VB run VBD run VBN run VBP Runways NNS ruptured VBD ruptured VBN rupturing VBG rushed VBD rushed VBN rushes VBZ rushing VBG rush VB rush VBP rusted VBN rusticated VBN rustlers NNS rustlings NNS RVs NNS Saatchi NNS sabers NNS sabotage VB sacked VBD sackings NNS sacking VBG sacks NNS sacrifices NNS sacrifices VBZ sacrifice VB sacrificing VBG saddled VBD saddled VBN safeguarded VBN safeguarding VBG safeguards NNS safeguard VB 
safety VB sages NNS sagged VBD sagged VBN sagging VBG sagging VBG|JJ Sagos NNS said VBD Said VBD SAID VBD said VBN Said VBN sailed VBD sailing VBG sailors NNS sails NNS saints NNS salaries NNS SALARIES NNS salarymen NNS salesmen NNS sales NNS Sales NNS SALES NNS salespeople NNS Salespeople NNS salicylates NNS salted VBD saluted VBD saluting VBG salvaged VBN salvage VB salve VB Samaritans NNS samovars NNS sampled VBN samples NNS sanctioned VBN sanctioning VBG sanctions NNS sanctions VBZ Sandinistas NNS sands NNS sandwiched VBD sandwiched VBN Sandwiched VBN sandwiches NNS sang VBD sanitationists NNS sanitized VBN sanitize VBP sank VBD San NNS sapped VBN sapping VBG sap VB satellites NNS satisfied VBD satisfied VBN satisfied VBN|JJ satisfies NNS satisfies VBZ satisfying VBG Satisfying VBG satisfy VB saturated VBD saturated VBN saturate VB sat VBD sat VBN saucers NNS sauces NNS saunas NNS saved VBD saved VBN savers\/investors NNS savers NNS saves VBZ save VB Save VB save VBP savings NNS Savings NNS saving VBG savored VBD savoring VBG savors NNS savor VB saw VBD saying VBG Saying VBG says VBZ Says VBZ say VB Say VB say VBP Say VBP scabs NNS scalawags NNS scaled VBD scaled VBN scales NNS scale VB scaling VBG scalps NNS scammed VBD scammers NNS scamper VBP scams NNS scandalized VBD scandals NNS scanners NNS scanning VBG scans NNS scans VBZ scan VB scape VB scared VBD scared VBN scares NNS scares VBZ scare VB scare VBP scarfing VBG scaring VBG Scaring VBG scarred VBN scars NNS scattered VBN scavengers NNS scenarios NNS Scenarios NNS scenes NNS scents NNS scheduled VBD scheduled VBN schedules NNS schedule VB scheduling VBG schemers NNS schemes NNS scheming VBG schmumpered VBD scholarships NNS scholars NNS schoolboys NNS schoolchildren NNS schoolmates NNS schools NNS Schools NNS schoolteachers NNS school VB sciences NNS scientists NNS Scientists NNS SciMed VBD SciMed VBN scoffed VBD scoffs VBZ scoff VBP scolded VBN scold VB scooped VBD scooping VBG scoops NNS scoops VBZ scoop VB scooted VBD scored VBD scored VBN scores NNS Scores NNS score VBP scoring VBG scorn VB scorn VBP scotched VBD scotches NNS scourges NNS scouring VBG scour VB scouting VBG scout VB scowls VBZ scowl VBP scrambled VBD scrambled VBN scrambles NNS scramble VBP scrambling VBG scrape VB scrapped VBD scrapped VBN scrapping VBG scraps NNS Scraps NNS scraps VBZ scrap VB SCRAP VBP scratched VBN scratching VBG scratch VB scratch VBP screamed VBD screaming VBG screams VBZ scream VB scream VBP screeched VBD screeching VBG screened VBD screenings NNS screening VBG screens NNS screens VBZ screen VB screenwriters NNS screwed VBD screwed VBN screws NNS scribbled VBD scribblers NNS scribbling VBG scribes NNS scrimped VBD scrimping VBG scripts NNS scriptwriters NNS scrounged VBD scrounge VBP scrubbers NNS scrutinized VBD scrutinized VBN scrutinize VB scrutinizing VBG sculptors NNS sculptures NNS scurries NNS scurrying VBG scurry VB scuttled VBD scuttled VBN scuttle VB sealants NNS sealed VBD sealed VBN sealing VBG seals NNS seal VB seamen NNS searched VBD searches NNS searches VBZ searching VBG search VB search VBP searing VBG Sears NNS seas NNS seasonings NNS seasons NNS seated VBN seating VBG seats NNS Seats NNS seats VBZ secede VB second-guessed VBN second-guess VB seconds NNS Seconds NNS secretaries NNS secrets NNS sections NNS sectors NNS secured VBD secured VBN secure VB secure VBP securing VBG securites NNS securities NNS Securities NNS sedans NNS seduce VB seducing VBG seeded VBN seeds NNS seeing VBG Seeing VBG seekers NNS seeking VBG 
Seeking VBG SEEKING VBG seeks VBZ seek VB seek VBP Seek VBP seemed VBD seemed VBN seems VBZ Seems VBZ seem VB seem VBP seen VBN Seen VBN seeped VBD seesawing VBG sees VBZ Sees VBZ seething VBG see VB See VB see VBP SEE VBP segmenting VBG segments NNS segregated VBD segregate VB seized VBD seized VBN Seizes VBZ seize VB seize VBP seizing VBG seizures NNS selected VBD selected VBN selecting VBG selections NNS Selections NNS selects VBZ select VB select VBP self-destructed VBD self-insure VBP self-reinsure VB self-starters NNS sellers NNS selling VBG Selling VBG sell-off NNS sell-offs NNS selloffs NNS sells VBZ sell VB Sell VB SELL VB sell VBP semantics NNS semesters NNS semi-celebrities NNS semiconductors NNS semifinished VBN Semifinished VBN seminars NNS senators NNS Senators NNS senders NNS sending VBG sends VBZ send VB Send VB send VBP seniors NNS Seniors NNS sensed VBD senses NNS sense VB sense VBP sensibilities NNS sensing VBG sensitives NNS sensitivities NNS sensitize VB Sens. NNS sensors NNS sentenced VBD sentenced VBN sentences NNS Sentences NNS sentencings NNS sentencing VBG Sentencing VBG sentiments NNS sent VBD sent VBN separated VBN SEPARATED VBN separate VB separate VBP separating VBG sequels NNS sequestered VBN sequester VB sequester VBP sequins NNS sergeants NNS serials NNS series NNS Series NNS servants NNS served VBD served VBN Served VBN servers NNS serves VBZ serve VB serve VBP serviced VBN services NNS SERVICES NNS services VBZ service VB servicing VBG serving VBG sessions NNS setbacks NNS sets NNS sets VBZ setters NNS settings NNS setting VBG Setting VBG settled VBD settled VBN settlements NNS Settlements NNS settlers NNS settles VBZ settle VB Settle VB settle VBP settling VBG set VB set VBD set VBN set VBP severed VBD severed VBN severing VBG sever VB sevices NNS sewers NNS sew VB sexes NNS shacks NNS shades NNS shadowing VBG shadows NNS shadow VB shags VBZ shaken VBN Shaken VBN shakes VBZ shake VB SHAKE VB shake VBP shaking VBG shaped VBN shapes NNS shapes VBZ shape VB shape VBP shaping VBG shards NNS sharecroppers NNS shared VBD shared VBN shareholders NNS Shareholders NNS shareholdings NNS shares NNS Shares NNS shares VBZ share VB share VBP sharing VBG Sharing VBG sharks NNS sharpening VBG sharpens VBZ sharpen VB shattered VBD shattered VBN shattering VBG shatters NNS shaved VBD shaved VBN shaves VBZ shave VB Shaving VBG sheared VBN sheaths NNS shedding VBG SHEDDING VBG shed VB shed VBD shed VBN shed VBP sheets NNS sheiks NNS shelled VBD shelling VBG shells NNS shell VB sheltered VBN sheltering VBG shelters NNS SHELTERS NNS shelved VBD shelves NNS shepherded VBD sheriffs NNS shied VBD shied VBN shielded VBD shielded VBN shielding VBG shields NNS shield VB shifted VBD shifted VBN shifting VBG shifts NNS shifts VBZ shift VB shift VBP shillings NNS shills NNS shimmered VBD shine VB shine VBP shining VBG Shining VBG shins NNS shipbuilders NNS shipments NNS Shipments NNS shipped VBD shipped VBN shippers NNS Shippers NNS shipping VBG shipsets NNS ships NNS ship VB ship VBP shipyards NNS shirking VBG shirk VB shirts NNS shivering VBG shivers NNS shocked VBD shocked VBN Shocked VBN shocks NNS shock VB shoehorned VBN shoe-horn VB shoelaces NNS shoemaking VBG shoes NNS shook VBD Shook VBD shootings NNS shooting VBG shoots NNS shoots VBZ shoot VB shoot VBP shopkeepers NNS shopped VBD shopped VBN shoppers NNS Shoppers NNS SHOPPERS NNS shopping VBG shops NNS Shops NNS SHOPS NNS shop VB shores NNS shore VB shoring VBG shorn VB Shorn VBN shortages NNS shortchanged VBN 
short-circuited VBN shortcomings NNS shorted VBN shortened VBN shortening VBG shorten VB shorting VBG Shorting VBG short-sellers NNS shorts NNS short VB shots NNS shot VBD shot VBN shouldering VBG shoulders NNS shoulder VB shoulder VBP shouted VBD shouting VBG shouts VBZ shout VB shout VBP shoved VBD shovels NNS shoves VBZ shove VB shoving VBG showcases NNS showed VBD showed VBN showers NNS shower VB showgirls NNS showings NNS showing VBG Showing VBG shown VBN showrooms NNS Showrooms NNS shows NNS show-stoppers NNS shows VBZ Shows VBZ show VB Show VB show VBP shrank VBD shredded VBD shrieked VBD shrines NNS shrinking VBG Shrinking VBG shrinks VBZ shrink VB shriveled VBD shrouded VBD shrubs NNS Shrubs NNS shrugged VBD shrugs NNS shrugs VBZ shrug VB shrunk VBN shudders NNS shuffled VBD shuffling VBG shunned VBD shunning VBG shun VB shun VBP SHUN VBP shutdowns NNS shuts NNS shuts VBZ shuttered VBD shuttering VBG shutting VBG shuttled VBD shuttled VBN shuttles NNS shuttling VBG shut VB shut VBD shut VBN shying VBG shy VB shy VBP siblings NNS sided VBD sided VBN sidelined VBN sidelines NNS sidelining VBG sides NNS SIDES NNS sidestepped VBD sidestepping VBG sidesteps VBZ sidestep VB sidestep VBP sidetrack VB side VB side VBP sidewalks NNS sifted VBN sift VB sift VBP sighed VBD sighing VBG sighs NNS sighs VBZ sightings NNS sights NNS signaled VBD signaled VBN SIGNALED VBN signaling VBG signalling VBG signals NNS signals VBZ signal VB SIGNAL VB signal VBP signatories NNS signatures NNS signboards NNS signed VBD signed VBN Signed VBN signified VBD signifying VBG signify VB signing VBG signs NNS Signs NNS signs VBZ sign VB silenced VBN silences NNS Silences NNS silence VB silted VBN similarities NNS simmering VBG simmer VB simplicities NNS simplified VBN simplifying VBG SIMPLIFYING VBG simplify VB simplify VBP simulates VBZ simulate VBP simulations NNS simulators NNS singers NNS singing VBG Singin VBG singled VBD singled VBN singles NNS single VB single VBP singling VBG sings VBZ sing VB sing VBP sinking VBG sink VB sink VBP sins NNS siphoned VBD siphoned VBN siphoning VBG siphon VB sipped VBD sipping VBG sirens NNS sisters NNS sitcoms NNS sites NNS sits VBZ sitting VBG Sitting VBG sitting VBG|JJ situated VBN situations NNS sit VB Sit VB sit VBP six-packs NNS sixties NNS Sixties NNS sized VBD sizes NNS size VBP sizzle VB sizzling VBG skateboards NNS skeptics NNS Skeptics NNS sketches NNS sketches VBZ sketching VBG skewed VBN skidded VBD SKIDDED VBD skidded VBN skidding VBG skids NNS skid VB skid VBD skiers NNS skies NNS skiing VBG skills NNS Skills NNS skimmers NNS skimp VB skim VB skins NNS skipped VBD skipper VB skipping VBG Skipping VBG skip VB skip VBP skirmished VBN skirmishes NNS skirted VBN skirting VBG skirts VBZ skis NNS ski VB SKr1.5 NNS SKr205 NNS SKr20 NNS SKr225 NNS SKr29 NNS Skulls NNS skyrocketed VBD skyrocketed VBN skyrocketing VBG slabs NNS slackened VBD slackened VBN slackening VBG slacks NNS slack VB slain VBN slam-dunk VB slammed VBD slam VBP slapped VBD slapped VBN slaps NNS slaps VBZ slap VB slashed VBD slashed VBN slashes NNS slashing VBG slash VB slash VBP slated VBD slated VBN Slated VBN slats NNS slaughtered VBN slayings NNS Slay VBP sleeping VBG Sleeping VBG sleeps VBZ sleep VB sleep VBP Sleep VBP sleeves NNS slept VBD slice VB slice VBP slicing VBG slide-packs NNS slides NNS Slides NNS slides VBZ slide VB sliding VBG slid VBD slid VBN slimmed VBN slimming VBG slingers NNS slings NNS slipped VBD slipped VBN slipping VBG slips VBZ slip VB slip VBP slithered VBD slithering 
VBG slits NNS slivered VBN slogans NNS slogs VBZ slog VB slopes NNS slopes VBZ sloshing VBG slots NNS slough VB slowdowns NNS slowed VBD slowed VBN slowing VBG Slowing VBG slows VBZ slow VB S&Ls NNS slumped VBD slumped VBN slumping VBG slumps NNS slump VB slump VBP slums NNS slurs NNS smacks VBZ smack VBP smarting VBG smashed VBD smashed VBN smashing VBG smash VB smelled VBD smelling VBG smells VBZ smell VBP smelt VBN smidgins NNS smiled VBD smiles NNS smiles VBZ smile VBP smiling VBG Smiling VBG smoked VBD smokers NNS Smokers NNS smokescreens NNS smokes VBZ smoke VB smoke VBP smoking VBG smoldering VBG smolder VBP smoothed VBN smooth VB smothered VBN smothering VBG smother VB smuggle VB snafus NNS snagged VBN snags NNS snakes NNS snakes VBZ snaking VBG snapped VBD snapped VBN snapping VBG snapshots NNS snaps VBZ snap VBP snare VB snarls NNS snatched VBD snatched VBN snatch VB sneaked VBD sneakers NNS sneaking VBG sneak VBP sniffed VBD sniffing VBG sniffs VBZ sniff VB sniped VBD snippets NNS sniveling VBG 's NNS snooping VBG Snoozing VBG snoring VBG snorts VBZ snowballed VBD snowball VB snowbirds NNS snubbed VBD snubbing VBG soaking VBG soak VB soaps NNS soapsuds NNS soared VBD soared VBN soaring VBG Soaring VBG soars NNS soar VB soar VBP sobered VBD sob VB socalled VBN social-affairs NNS socialists NNS Socialists NNS socialize VB socializing VBG societies NNS sociologists NNS socks NNS sock VB Sock VB sodas NNS sofas NNS soft-drinks NNS softened VBD softened VBN softening VBG softens VBZ soften VB softies NNS soiled VBD soil-nutrients NNS soils NNS soirees NNS soldiers NNS Soldiers NNS sold VBD sold VBN solicitations NNS solicited VBD solicited VBN soliciting VBG solicitors NNS solicits VBZ solicit VB solidified VBD solidify VB solidify VBP solutions NNS solved VBD Solved VBD solved VBN solvents NNS solves VBZ solve VB solving VBG Solving VBG songs NNS SONGsters NNS songwriters NNS sons NNS soothe VB soothing VBG sophisticates NNS sops NNS soreheads NNS sorted VBN sorting VBG Sorting VBG sorts NNS sort VB sought VBD sought VBN soulmates NNS souls NNS sounded VBD sounded VBN soundings NNS sounding VBG sounds NNS sounds VBZ Sounds VBZ sound VB Sound VB sound VBP soups NNS sources NNS Sources NNS sourcing VBG soured VBD soured VBN sour VB soviets NNS Soviets NNS sowed VBD Sows NNS sows VBZ sow VB sow VBP soybeans NNS Soybeans NNS SOYBEANS NNS spaceships NNS spaces NNS spackle VB spaghetti NNS spanned VBD spanning VBG spans NNS span VBP spared VBD spared VBN spares NNS spares VBZ spare VB spare VBP sparing VBG sparked VBD sparked VBN sparking VBG sparkplugs NNS sparks NNS sparks VBZ Sparks VBZ spark VB sparred VBD sparring VBG spasms NNS spas NNS spawned VBD spawned VBN spawns VBZ spawn VB speakers NNS Speakers NNS speaking VBG Speaking VBG speaks VBZ speak VB speak VBP spearheaded VBD spearheaded VBN spearheading VBG specialists NNS Specialists NNS specialized VBN specializes VBZ specialize VB specialize VBP specializing VBG specials NNS specialties NNS specialty-chemicals NNS specialty-metals NNS species NNS specifications NNS Specifications NNS specifics NNS specified VBD specified VBN specifies VBZ specifying VBG specify VB specimens NNS specs NNS spectators NNS speculated VBD speculated VBN speculate VB speculate VBP speculating VBG speculations NNS speculators NNS Speculators NNS sped VBD sped VBN speeches NNS speeded VBD speeding VBG speeds NNS speeds VBZ speed VB Speed VB spelled VBN spelling VBG spells NNS spells VBZ spell VB spenders NNS spending VBG Spending VBG spending VBG|NN 
spends VBZ spendthrifts NNS spend VB Spend VB spend VBP spent VBD spent VBN spewed VBD spewing VBG spices NNS spiders NNS spied VBD spies NNS spies VBZ spigots NNS spiked VBN spilled VBD spilled VBN spills NNS spills VBZ spill VB spinning VBG spinoffs NNS spins VBZ spin VB spiraled VBD spiraling VBG spiralled VBD spirited VBN spirits NNS Spirits NNS splashed VBD splints NNS splits NNS Splits NNS splits VBZ split VB split VBD split VBN spoiled VBD spoil VBP spoken VBN spokesmen NNS Spokesmen NNS spokes NNS Spokespersons NNS spoke VBD sponsored VBD sponsored VBN sponsoring VBG sponsors NNS Sponsors NNS sponsors VBZ sponsor VB sponsor VBP spooked VBD spooked VBN Spooked VBN spooks NNS spook VBP spoonbills NNS spoonfuls NNS spores NNS sported VBD sporting-goods NNS sporting VBG sportsmen NNS sports NNS Sports NNS sports VBZ spotlight VB spots NNS Spots NNS spots VBZ spotted VBD spotted VBN Spotted VBN spotting VBG spotting VBG|NN spot VB spouses NNS spout VBP sprang VBD sprawling VBG spraying VBG sprays NNS spreading VBG spreadsheets NNS spreads NNS Spreads NNS spreads VBZ spread VB spread VBD spread VBN spread VBP springing VBG springs VBZ spring VB sprinkled VBD sprinklers NNS sprinkles VBZ sprinkle VBP spritzers NNS sprouting VBG sprout VB sprout VBP spruce VB SPRUCING VBG sprung VBN spuds NNS spun VBD spun VBN spurned VBD spurned VBN spurning VBG spurns VBZ spurn VB spurn VBP spurred VBD spurred VBN Spurred VBN spurring VBG spurs VBZ spurted VBD spurted VBN spurts NNS spurts VBZ spurt VBP spur VB sputtered VBD sputter VBP spying VBG spy VB squabbles NNS squads NNS squalls NNS squandered VBN squandering VBG squared VBD squared VBN squares NNS square VB squaring VBG squatted VBN squeaking VBG squeegee VBP squeezed VBD squeezed VBN squeezes VBZ squeeze VB squeeze VBP squeezing VBG squelched VBN squelch VBP squinted VBD squinting VBG squirming VBG stabbed VBD stabbed VBN stabbing VBG stabilized VBD stabilized VBN stabilizes VBZ stabilize VB stabilize VBP stabilizing VBG Stabilizing VBG stacked VBN stacking VBG stacks NNS stack VB stadiums NNS Stadiums NNS staffed VBN staffers NNS Staffers NNS staffing VBG staffs NNS staff VB staged VBD STAGED VBD staged VBN stages NNS stages VBZ stage VB stage VBP stagewhispers VBZ staggered VBD staggering VBG staging VBG stagnated VBD stagnating VBG staid VBN stains NNS stain VB staircases NNS stairs NNS staked VBN stakes NNS stake VB stalked VBN stalking VBG stalled VBD stalled VBN Stalled VBN stalling VBG stalls NNS stalls VBZ stall VB stalwarts NNS stampeded VBN stampede VB stampings NNS stamping VBG stamps NNS stamp VB stanch VB standardize VB standards NNS standbys NNS standing VBG Standing VBG stands NNS stands VBZ stand VB Stand VB stand VBP Stand VBP stapling VBG stared VBD stares NNS stare VBP staring VBG starring VBG stars NNS stars VBZ started VBD started VBN Started VBN starters NNS starting VBG Starting VBG startled VBD starts NNS starts VBP starts VBZ start-up NNS start-ups NNS start VB start VBP star VB starved VBN starve VB Starve VB starving VBG stashed VBD stash VB stated VBD stated VBN statements NNS Statements NNS statesmen NNS states NNS States NNS STATES NNS states VBZ state VB state VBP stating VBG stationed VBN stations NNS statisticians NNS statistics NNS Statistics NNS statues NNS statutes NNS staunch VB stave VB stayed VBD stayed VBN staying VBG stays NNS stays VBZ stay VB stay VBP steadied VBD steadied VBN steaks NNS stealing VBG steals VBZ steal VB steal VBP steamed VBD steamed VBN steaming VBG steelmakers NNS Steelmakers NNS 
steels NNS steelworkers NNS steered VBD steered VBN steering VBG steers VBZ steer VB stemmed VBD stemmed VBN stemming VBG stems VBZ stem VB stem VBP stepped VBD stepped VBN stepping VBG steps NNS Steps NNS steps VBZ Steps VBZ step VB Step VB step VBP stereos NNS stereotypes NNS steriles NNS sterilized VBN sterilize VB sterilizing VBG steroids NNS stewards NNS stewed VBD stickers NNS sticking VBG sticks NNS sticks VBZ stick VB Stick VB stick VBP stifles VBZ stifle VB stifling VBG still-raging VBG stilts NNS stimulated VBN stimulate VB stimulating VBG stimulators NNS stimuli NNS stinging VBG stingrays NNS stink VBP stints NNS stipends NNS stipulated VBD stipulates VBZ stipulate VB stirred VBD stirrings NNS stirring VBG stirrups NNS stirs VBZ Stirs VBZ stir VB stir VBP stitched VBN stitches NNS stockbrokers NNS Stockbrokers NNS stockbuilding VBG stocked VBN stockholders NNS Stockholders NNS stockholdings NNS stock-index-futures NNS stock-index NNS stocking VBG stock-options NNS stockpiles NNS stockpile VB stocks NNS Stocks NNS STOCKS NNS stocks VBZ stock VB stockyards NNS stoked VBN stoke VB stoking VBG stolen VBN stole VBD stomachs NNS stomach VB stomped VBD stomping VBG stoned VBN stones NNS stonewalled VBD stood VBD stood VBN stooges NNS stools NNS stoppages NNS stopped VBD stopped VBN stopping VBG stops NNS stops VBZ stop VB Stop VB stop VBP stored VBD stored VBN storefronts NNS stores NNS Stores NNS stores VBZ store VB store VBP stories NNS storing VBG stormed VBD storming VBG storms NNS storytellers NNS stowed VBD straddling VBG strafe VB straightening VBG straighten VB strained VBN strainers NNS straining VBG strains NNS strain VB strain VBP straits NNS stranded VBN stranding VBG strands NNS strangled VBN strangles VBZ strapped VBD strapped VBN stratagems NNS strategies NNS Strategies NNS strategists NNS Strategists NNS strawberries NNS straying VBG stray VB stray VBP streaked VBD streamed VBD streaming VBG streamlined VBD streamline VB streamlining VBG streams NNS streets NNS Streetspeak VB strengthened VBD strengthened VBN strengthening VBG strengthens VBZ strengthen VB strengthen VBP strengths NNS stressed VBD stressed VBN stresses NNS stresses VBZ stressing VBG stressors NNS stress VB stress VBP stretched VBD stretched VBN stretches NNS stretches VBZ stretching VBG Stretching VBG stretch VB stretch VBP strewn VBN stricken VBN strides NNS strikers NNS strikes NNS Strikes NNS strikes VBZ strike VB strike VBP striking VBG Striking VBG strings NNS string VB stripes NNS stripped VBD stripped VBN stripping VBG strips NNS Strips NNS strips VBZ strip VB striven VBN strives VBZ strive VBP striving VBG strode VBD strokes NNS Strokes NNS strokes VBZ Stroking VBG strolling VBG stroll VB stroll VBP strongholds NNS strove VBD struck VBD STRUCK VBD struck VBN structured VBD structured VBN structures NNS structure VB structuring VBG struggled VBD STRUGGLED VBD struggled VBN struggles NNS struggles VBZ struggle VB struggling VBG STUBBED VBN stuck VBD stuck VBN studded VBN student-athletes NNS studentled VBN students NNS Students NNS STUDENTS NNS studied VBD studied VBN studies NNS Studies NNS studies VBZ studios NNS Studios NNS studying VBG study VB study VBP stuffed VBD stuffed VBN Stuffing VBG stuff VBP stumbled VBD stumbled VBN stumble VB stumble VBP stumbling VBG stung VBN Stung VBN stunned VBD stunned VBN Stunned VBN stunted VBN styled VBN styles NNS styling VBG stymied VBN subcommittees NNS subcompacts NNS subconferences NNS subcontractors NNS Subcontractors NNS subcontract VB subdued VBN 
subgroups NNS subjected VBN subjecting VBG subjects NNS subjects VBZ subject VB sublet VB submarines NNS sub-markets NNS submits VBZ submitted VBD submitted VBN submitting VBG submit VB submit VBP subordinated VBD subordinated VBN subordinates NNS subordinates VBZ subordinate VB subpoenaed VBN subpoenas NNS subpoena VB subscribed VBN subscribers NNS Subscribers NNS subscribes VBZ subscribe VB subscribe VBP subscribing VBG Subscribing VBG subscriptions NNS sub-segments NNS subsided VBD subsided VBN subsides NNS subsidiaries NNS subsidies NNS Subsidies NNS subsidized VBN subsidizes VBZ subsidize VB subsidizing VBG Subsidizing VBG subskills NNS substances NNS substantiate VB substations NNS substituted VBD substituted VBN substitutes NNS substitute VB substituting VBG Substituting VBG substracting VBG subsumed VBN subtilis NNS subtitled VBN subtracted VBD subtracted VBN subtracting VBG subtract VB sub-underwriters NNS sub-underwriting VBG suburbs NNS subversives NNS subverted VBN subverts VBZ subvert VB subways NNS succeeded VBD succeeded VBN succeeding VBG Succeeding VBG succeeds VBZ succeed VB succeed VBP successes NNS successors NNS succumbed VBD succumbed VBN succumbing VBG suckers NNS sucks VBZ sued VBD sued VBN SUES VBZ sue VB suffered VBD suffered VBN suffering VBG Suffering VBG suffers VBZ suffer VB suffer VBP sufficed VBD suffice VB Suffice VB suggested VBD suggested VBN suggesting VBG suggestions NNS suggests VBZ suggest VB suggest VBP suing VBG suited VBN suites NNS suitors NNS Suitors NNS suits NNS Suits NNS suits VBZ suit VB summaries NNS summarized VBD summarize VB summarizing VBG Summarizing VBG summed VBD summers NNS summoned VBD summoned VBN summoning VBG summon VB sums NNS sums VBZ sum VB Sundays NNS sunflowers NNS sunglasses NNS sung VBN sunk VBD sunk VBN sunsets NNS superceded VBD supercede VBP supercomputers NNS superconcentrates NNS Superconcentrates NNS superconductors NNS Superconductors NNS superimposed VBN superintendents NNS superiors NNS supermarkets NNS Supermarkets NNS superpowers NNS superpremiums NNS superseded VBD superseded VBN supersede VB superstars NNS supervised VBD supervised VBN supervises VBZ supervise VB supervise VBP supervising VBG supervisors NNS supplements NNS supplement VB supplied VBD supplied VBN suppliers NNS Suppliers NNS supplies NNS supplies VBZ supplying VBG supply VB supply VBP supported VBD supported VBN supporters NNS Supporters NNS supporting VBG Supporting VBG supports NNS supports VBZ support VB support VBP supposed VBD supposed VBN suppose VB Suppose VB suppose VBP suppressants NNS suppressed VBD suppressed VBN suppress VB surfaced VBD surfaced VBN surfaces VBZ surface VB surface VBP surfacing VBG surfers NNS surged VBD SURGED VBD surged VBN surges NNS surges VBZ surge VB surging VBG surmounting VBG surpassed VBD surpassed VBN surpassing VBG surpass VB surpass VBP surpluses NNS surprised VBD surprised VBN surprises NNS Surprises NNS surprise VB surprising VBG surrendered VBD surrendered VBN surrendering VBG surrender VB Surrender VB surrounded VBN Surrounded VBN surrounded VBN|JJ surrounding VBG surround VBP surtaxes NNS surveyed VBD surveyed VBN Surveying VBG surveys NNS Surveys NNS surveys VBZ survey VB survey VBP survived VBD Survived VBD survived VBN survives VBZ survive VB Survive VB survive VBP surviving VBG Surviving VBG survivors NNS suspected VBD suspected VBN suspecting VBG suspects NNS suspects VBZ suspect VB suspect VBP suspended VBD suspended VBN suspending VBG suspend VB suspensions NNS suspicions NNS sustained VBD 
sustained VBN sustaining VBG sustains VBZ sustain VB sutures NNS suvivors NNS 's VBZ s VBZ 'S VBZ swallowed VBN swallowing VBG swallow VB swamped VBD swamped VBN swamp VB swans NNS swapped VBN swapping VBG swaps NNS swap VB swap VBP swarms NNS swathed VBN swayed VBN swaying VBG sway VB sway VBP swears VBZ swear VB swear VBP sweated VBN sweaters NNS Sweating VBG sweatshirts NNS sweat VB sweat VBD sweat VBP Swedes NNS sweepers NNS sweeping VBG sweeps NNS sweepstakes NNS sweeps VBZ sweep VB sweetened VBD sweetened VBN sweeteners NNS sweeten VB sweets NNS swelled VBD swelled VBN swelling VBG swells NNS swells VBZ swell VB swell VBP swept VBD swept VBN swerve VBP swimming VBG swim VBP swindled VBN swine NNS swinging VBG swings NNS swings VBZ swing VB swing VBP switched VBD switched VBN switchers NNS switches NNS switches VBZ switching VBG SWITCHING VBG switch VB switch VBP swiveling VBG Swiveling VBG swore VBD sworn VBN swung VBD SWUNG VBD swung VBN sycophants NNS symbolized VBN symbolizes VBZ symbols NNS sympathies NNS sympathizers NNS sympathize VBP symposiums NNS symptoms NNS Syms NNS synchronized VBN Synchronized VBN synchronize VBP syndciated VBN syndicated VBN syndicates NNS Syndicates NNS syndicate VB syndicating VBG syndications NNS synergies NNS syngeries NNS synonyms NNS synthesizers NNS synthesize VB synthetics NNS systems NNS Systems NNS tabacs NNS tables NNS tablespoons NNS tablets NNS tabloids NNS taboo VB tabs NNS tacked VBD tacking VBG tackled VBN tackles VBZ tackle VB tackle VBP tackling VBG tacos NNS tactics NNS tagged VBN tags NNS tag VB tailing VBG tailored VBN tailoring VBG tails NNS tainted VBN taken VBN Taken VBN takeovers NNS takers NNS takes VBZ Takes VBZ taketh VB take VB Take VB take VBP Take VBP takings NNS taking VBG Taking VBG talents NNS tales NNS Tales NNS talked VBD talked VBN talking VBG Talking VBG talks NNS Talks NNS talks VBZ TALKS VBZ talk VB talk VBP tallied VBD tallies NNS tallying VBG taming VBG Taming VBG tampering VBG tampers NNS tamper VB tampons NNS Tandy VB tangled VBN tangle VBP tangoed VBD tanked VBN tankers NNS Tankers NNS tanks NNS Tanks NNS tank VB tanned VBN tans NNS tantalizing VBG taped VBD taped VBN Taped VBN tapering VBG tapers NNS tapers VBZ taper VB tapes NNS tapestries NNS tape VB tape VBP tapings NNS taping VBG tapped VBD tapped VBN tapping VBG taps NNS Taps VBZ tap VB Tap VB targeted VBD targeted VBN targeting VBG targeting VBG|NN targets NNS targets VBZ target VB target VBP tariffs NNS tarnished VBD tarnished VBN tarnish VB tarred VBD tartans NNS tasks NNS tassels NNS tastes NNS Tastes NNS tastes VBZ taste VB taste VBP taught VBD taught VBN taunted VBD tax-deductions NNS taxed VBD taxed VBN taxes NNS Taxes NNS taxes VBZ tax-exempts NNS Tax-exempts NNS taxpayers NNS Taxpayers NNS TAXPAYERS NNS tax-reducing VBG tax VB tax-writers NNS T-bills NNS teachers NNS teaches VBZ teaching VBG teach VB TEACH VB teach VBP teamed VBD teaming VBG teammates NNS teams NNS Teams NNS team VB tearing VBG tears NNS tear VB teased VBN teaspoons NNS technical-services NNS technicians NNS techniques NNS technocrats NNS technologies NNS teemed VBD teeming VBG teen-agers NNS teenagers NNS teens NNS Teens NNS teetering VBG teeth NNS telecines NNS telecommunications NNS telecommunications NNS|NN Telecussed VBD telegraphed VBD telegraph VBP telemarketers NNS telephoned VBD Telephone-operations NNS telephones NNS telephone VB telephoning VBG telesystems NNS Telesystems NNS televised VBN televisions NNS telexes NNS tellers NNS telling VBG tells VBZ tell VB Tell VB 
tell VBP temblors NNS temperatures NNS tempered VBN temps NNS tempted VBD tempted VBN tempts VBZ tempt VB tempt VBP tenants NNS tended VBD tended VBN tendencies NNS tendered VBD tendered VBN tendering VBG tenders NNS Tenders NNS tender VB tender VBP tending VBG tends VBZ tend VB tend VBP tenets NNS tensions NNS tens NNS tenths NNS termed VBD termed VBN terminals NNS Terminals NNS terminated VBD terminated VBN terminate VB terminating VBG terminations NNS terms NNS Terms NNS term VB terrified VBN terrify VB territories NNS terrorists NNS testaments NNS test-drive VB test-drive VBP tested VBD tested VBN testers NNS test-fired VBN testified VBD testifies VBZ testifying VBG Testifying VBG testify VB testify VBP testing VBG testing VBG|NN tests NNS TESTS NNS tests VBZ test VB test VBP tethered VBN textbooks NNS textiles NNS texts NNS thanked VBD thanking VBG thanks NNS Thanks NNS THANKS NNS thank VB Thank VB thank VBP that VBP theaters NNS thefts NNS Thefts NNS themed VBN themes NNS theologians NNS theories NNS theorists NNS Theorists NNS theorized VBD theory-teaching VBG therapies NNS therapists NNS thermometers NNS The VB the VBP thieves NNS Thieves NNS things NNS Things NNS thinking VBG thinks VBZ think VB Think VB think VBP Think VBP thinned VBN thinning VBG thirds NNS thirties NNS Thirties NNS thistles NNS thoroughbreds NNS thoughts NNS thought VBD thought VBN thousands NNS Thousands NNS thrashed VBD thrashing VBG thrash VB threads NNS threatened VBD threatened VBN threatening VBG threatens VBZ threaten VB threaten VBP threats NNS three-fourths NNS three-quarters NNS three-sevenths NNS threw VBD thrifts NNS Thrifts NNS thrilled VBN thrills VBZ thrill VB thrived VBD thrives VBZ thrive VB thrive VBP Thrive VBP thriving VBG throats NNS throwers NNS throwing VBG Throwing VBG thrown VBN throws VBZ throw VB Throw VB throw VBP thrusting VBG thrusts NNS thrust VBD thrust VBN thugs NNS thumbing VBG thumbs NNS thumbs VBZ thumb VB thundered VBD thwarted VBD thwarted VBN thwarting VBG thwart VB ticked VBN ticketed VBN ticketing VBG tickets NNS ticking VBG ticks VBZ tick VB tidbits NNS tides NNS tied VBD tied VBN tie-ins NNS ties NNS ties VBZ tie-ups NNS tie VB tigers NNS tightened VBD tightened VBN tightening VBG tighten VB tighten VBP tiles NNS tilted VBN tilts VBZ tilt VB timberlands NNS timbers NNS timed VBD timed VBN time-hotels NNS time-shares NNS times NNS Times NNS time VB timing VBG timpani NNS tinged VBN tinges NNS tinkered VBN tinkering VBG tinker VB tins NNS tipped VBD tips NNS tipsters NNS tiptoed VBD tiptoed VBN tiptoe VB tip VB tired VBN tire-kickers NNS tires NNS tires VBZ tissues NNS titans NNS titled VBD titled VBN Titled VBN titles NNS toasted VBD toddlers NNS toeholds NNS toes NNS toiled VBD toiletries NNS toiling VBG toil VB toil VBP told VBD told VBN tolerate VB tolls NNS Tolls NNS tomatoes NNS toned VBN tones NNS tongues NNS tonnages NNS tons NNS Tons NNS took VBD tools NNS tooted VBD topics NNS topped VBD topped VBN topping VBG Topping VBG toppled VBD toppled VBN topple VB tops NNS tops VBZ top VB top VBP torched VBD torched VBN tore VBD Tories NNS tormentors NNS torments VBZ torment VB tornadoes NNS torn VBN torpedoed VBN torpedo VB torts NNS tortured VBN torture VB torture VBP tossed VBD tossed VBN tossers NNS tossing VBG toss VB totaled VBD totaled VBN totaling VBG totalling VBG totals NNS Totals NNS totals VBZ total VB total VBP toted VBN tote VB toting VBG tottering VBG totter VB touched VBD touched VBN touches NNS Touches VBZ touching VBG touch VB touch VBP toughened VBD 
toughen VB tough VB toured VBD touring VBG tourists NNS tournaments NNS tours NNS tours VBZ tour VB touted VBD touted VBN touting VBG touts VBZ tout VB towels NNS Towering VBG towers NNS tower VB townhouses NNS townships NNS towns NNS tows NNS toying VBG toys NNS traced VBD traced VBN traces NNS traces VBZ trace VB tracing VBG tracked VBD tracked VBN tracking VBG tracks NNS tracks VBZ track VB track VBP tractors NNS tracts NNS traded VBD traded VBN traded VBN|VBD trademarks NNS trade-offs NNS tradeoffs NNS traders NNS Traders NNS trades NNS trades VBZ trade VB trade VBP trading VBG Trading VBG trading VBG|NN traditionalists NNS traditions NNS traduced VBN traduce VB traffickers NNS tragedies NNS trailed VBD trailed VBN trailers NNS trailing VBG trails NNS trails VBZ trail VB trail VBP trained VBN Trained VBN trainers NNS training VBG trains NNS Trains NNS trains VBZ train VB train VBP traipse VB traits NNS tramping VBG trampled VBN trampling VBG transacted VBN transacting VBG transactions NNS Transactions NNS transact VB transcribe VBP transcripts NNS transferred VBD transferred VBN transferring VBG transfers NNS transfers VBZ transfer VB transformed VBD transformed VBN transforming VBG transforms VBZ transform VB transfusions NNS transistors NNS translated VBN Translated VBN translate VB translate VBP translating VBG translations NNS transmissions NNS transmitted VBD transmitted VBN transmitting VBG transmit VB transmogrified VBD transmogrified VBN transplanted VBN transplanting VBG transplants NNS transplant VB transported VBD transported VBN transporting VBG transports NNS Transports NNS transports VBZ transport VB Transport VB transvestites NNS trapped VBN trappings NNS trapping VBG traps NNS trashing VBG trash VB traumas NNS traumatized VBD traumatized VBN travails NNS traveled VBD traveled VBN travelers NNS Travelers NNS traveling VBG Traveling VBG travelogues NNS travels NNS travels VBZ TRAVELS VBZ travel VB travel VBP trays NNS treadmills NNS treads VBZ tread VB treasurers NNS treasures NNS Treasures NNS treasuries NNS Treasurys NNS treated VBD treated VBN treating VBG TREATING VBG treatises NNS treatments NNS treats NNS treats VBZ treat VB treat VBP trebled VBD trees NNS trekked VBD trembling VBG tremors NNS trenches NNS trending VBG trend-setters NNS trends NNS trend VB trespasses NNS trespass VBP trials NNS triangles NNS tribes NNS tributes NNS trickle VB trickle VBP trickling VBG tricks NNS trick VB tried VBD tried VBN tries NNS tries VBZ triggered VBD triggered VBN triggering VBG Triggering VBG triggers VBZ trigger VB trigger VBP trillions NNS trills NNS trimesters NNS trimmed VBD trimmed VBN trimming VBG TRIMMING VBG trim VB Trim VBP tripled VBD tripled VBN triples NNS triple VB tripling VBG tripped VBN trips NNS triumphed VBD trivialize VB trivia NNS troops NNS Troops NNS TROs NNS trotted VBD trotted VBN Trotting VBG troubled VBD troubled VBN Troubled VBN troublemakers NNS troubles NNS TROUBLES NNS troubles VBZ troubling VBG troughed VBD troughs NNS troupes NNS trousers NNS trout NNS Truckers NNS truck-parts NNS trucks NNS trucks VBZ truck VBP trudging VBG Trumped VBN trumpeting VBG trumpets VBZ trumpet VBP trundles VBZ trunks NNS trusted VBD trusted VBN trustees NNS Trustees NNS trusting VBG trusts NNS TRUSTS NNS trusts VBZ trust VB trust VBP truths NNS trying VBG Trying VBG tryouts NNS try VB Try VB try VBP Try VBP T-shirt NNS T-shirts NNS tubes NNS tubs NNS tucked VBD tucked VBN tucking VBG tuck VB tugged VBD tugging VBG tuitions NNS Tuitions NNS tumbled VBD tumbled VBN 
tumbles NNS tumbles VBZ tumble VB tumbling VBG tumors NNS tumor-suppressors NNS tuned VBN tunes NNS tune VB tune VBP tuning VBG tunnels NNS turbans NNS turbines NNS turboprops NNS turmoils NNS turnarounds NNS turned VBD turned VBN Turned VBN turning VBG turn-ons NNS turns NNS turns VBZ Turns VBZ TURNS VBZ turn VB turn VBP turtles NNS tusks NNS tutored VBN tutorials NNS tutoring VBG tuxedos NNS TVs NNS tweezers NNS twenties NNS twiddling VBG twinned VBN twisted VBN twisting VBG twists NNS twists VBZ twist VBP twitch VB two-hundredths NNS two-sevenths NNS two-thirds NNS twothirds NNS Two-thirds NNS tycoons NNS tying VBG typed VBN typefaces NNS types NNS typewriters NNS typhoons NNS typified VBD typifies VBZ UFOs NNS ulcers NNS ultimatums NNS umbrellas NNS un-advertisers NNS unbanning VBG unblock VB uncertainties NNS unchlorinated VBN uncles NNS unconsolidated VBD unconsolidated VBN uncovered VBD uncovered VBN uncovering VBG uncover VB Underclass NNS undercutting VBG undercut VB undercut VBD undercut VBN undercut VBP underestimated VBD underestimated VBN underfunded VBN undergirded VBD undergoing VBG undergone VBN undergo VB undergo VBP underlie VB underlined VBD underline VB underlying VBG Underlying VBG undermined VBD undermined VBN undermine VB undermine VBP undermining VBG underperformed VBN underperformers NNS underperforming VBG underperforms VBZ underperform VB underpinned VBN underpinning VBG underpin VB underpriced VBN underreacting VBG underscored VBD underscored VBN underscores VBZ underscore VB underscore VBP underscoring VBG Underscoring VBG underselling VBG understaffs VBZ understand\/adopt VB understanding VBG understands VBZ understand VB understand VBP understated VBD understated VBN understate VBP understating VBG understood VBD understood VBN undertaken VBN undertake VB undertakings NNS undertaking VBG undertones NNS undertook VBD underutilized VBN undervalued VBD undervalued VBN underweighted VBN underwent VBD underwhelmed VBN underwiters NNS underwriters NNS Underwriters NNS underwrites VBZ underwrite VB underwrite VBP underwritings NNS underwriting VBG Underwriting VBG underwritten VBN underwrote VBD undid VBD undone VBN undo VB undulate VB undulate VBP unearthed VBD unexecuted VBN unfazed VBN unfocussed VBN unfolded VBD unfolding VBG unfolds VBZ unfold VB unhinged VBN unhocked VBN unhusked VBN unified VBN uniforms NNS unifying VBG unify VB Unify VB unionists NNS unionized VBD unions NNS Unions NNS united VBN United VBN unites VBZ unite VB unitholders NNS Unitholders NNS units NNS Units NNS universities NNS Universities NNS unknowns NNS unleashed VBD unleashed VBN unleashes VBZ unleashing VBG unleash VB unleash VBP unloaded VBN unloading VBG Unloading VBG unload VB unload VBP unlocked VBD unlocks VBZ unlock VBP unmasks VBZ unmask VB unmaterialized VBN unnerved VBD unnerving VBG unplug VB unpolarizing VBG unraveled VBD unraveled VBN unraveling VBG unravel VB unrolls VBZ unroll VBP unseated VBD unseating VBG unsettled VBD unsettled VBN unveiled VBD unveiled VBN Unveiled VBN unveiling VBG unveils VBZ unveil VB unwavering VBG unwinding VBG unwind VB updated VBN updates NNS update VB updating VBG upgraded VBD upgraded VBN upgrades NNS Upgrades NNS upgrade VB upgrading VBG upheavals NNS upheld VBD upheld VBN UPHELD VBN upholding VBG upholds VBZ uphold VB upped VBD Upping VBG uprooted VBD ups-and-downs NNS upsets NNS upsetting VBG upset VB upset VBD upset VBN Upset VBN ups NNS uptick VB Urals NNS urged VBD urged VBN URGED VBN urges VBZ urge VB urge VBP urgings NNS urging VBG 
used VBD used VBN Used VBN users NNS Users NNS uses NNS uses VBZ use VB Use VB use VBP ushered VBD ushered VBN ushering VBG ushers NNS ushers VBZ using VBG Using VBG usurp VB U.S. VBP utilities NNS Utilities NNS utilize VB utmosts NNS utopians NNS utterances NNS uttered VBD uttering VBG UVB NN vacancies NNS Vacancies NNS vacated VBD vacated VBN vacate VB vacating VBG vacationers NNS vacationing VBG vacations NNS vaccines NNS vacillate VB vacuum VB vagabonds NNS vagaries NNS validating VBG valuations NNS valued VBD valued VBN Valued VBN values NNS values VBZ value VB value VBP valuing VBG valves NNS vandalized VBD vanished VBD vanished VBN vanishes VBZ vanishing VBG vanish VB vanish VBP Vanities NNS vans NNS vapors NNS variables NNS variations NNS varied VBD varied VBN varies VBZ varieties NNS varying VBG vary VB vary VBP Vary VBP vases NNS vassals NNS vaults NNS vault VB VCRs NNS veering VBG veer VB vegetables NNS Vegetables NNS vegetarians NNS vehicles NNS veiled VBN Veiling VBG vending VBG vendors NNS vented VBD ventilated VBD ventilated VBN ventures NNS venture VB venturing VBG vent VB venues NNS verdicts NNS verged VBD verified VBN verify VB versions NNS vessels NNS vested VBN vestments NNS vests NNS veterans NNS veterinarians NNS vetoed VBD vetoed VBN vetoes NNS vetoing VBG veto VB 've VB 've VBP vexed VBN viaducts NNS vibrating VBG vicars NNS vices NNS vicissitudes NNS vicitims NNS victimized VBN victims NNS VICTIMS NNS victories NNS VICTORIES NNS videocassettes NNS videodisks NNS videos NNS videotapes NNS videotape VB Vietnamese NNS viewed VBD viewed VBN viewers NNS Viewers NNS viewings NNS viewing VBG viewpoints NNS views NNS Views NNS views VBZ view VB view VBP vignettes NNS villagers NNS villages NNS Villages NNS villains NNS vindicated VBN vineyards NNS vintages NNS vinyl-products NNS violated VBD violated VBN violates VBZ violate VB violate VBP violating VBG violations NNS virgins NNS virtues NNS virtuosos NNS visages NNS visionaries NNS visions NNS visited VBD visited VBN visiting VBG Visiting VBG visitors NNS Visitors NNS visits NNS visits VBZ visit VB visit VBP visualize VB VISUALIZING VBG visuals NNS vitiate VB voiced VBD voiced VBN voices NNS Voices NNS voices VBZ voice VB voice VBP voicing VBG voided VBN void VB volumes NNS volunteered VBD volunteered VBN volunteers NNS volunteer VB vomiting VBG voted VBD voted VBN vote-getters NNS voters NNS Voters NNS votes NNS votes VBZ vote VB vote VBP voting VBG vouchers NNS vowed VBD vowed VBN vowels NNS Vowels NNS vowing VBG Vowing VBG vows VBZ vying VBG waddles VBZ wade VB wad VB wafers NNS waffled VBD waffled VBN wafting VBG waged VBD wages NNS wage VB waging VBG wagons NNS wags NNS wailing VBG wail VB waited VBD waited VBN waiters NNS waiting VBG waits VBZ wait VB Wait VB wait VBP waived VBD waived VBN waivered VBN waivers NNS waives VBZ waive VB waiving VBG wake VBP waking VBG walked VBD walkie-talkies NNS walking VBG Walking VBG walking VBG|NN walkouts NNS walks NNS walks VBZ walk VB Walk VB walk VBP Walk VBP wallcoverings NNS wallets NNS wallops VBZ wallowing VBG walls NNS wall VB wandering VBG wanders VBZ wander VB wander VBP waned VBD waned VBN wanes VBZ WANES VBZ wane VB wane VBP waning VBG wanted VBD wanted VBN Wanted VBN wanting VBG wants VBZ want VB want VBP warded VBN wardens NNS ward VB warehouses NNS wares NNS warheads NNS warmed VBD warmed VBN warming VBG warming VBG|NN warm VB warned VBD Warned VBD WARNED VBD warned VBN Warners NNS warnings NNS warning VBG warns VBZ warn VB warn VBP warranted VBN warranties NNS 
warrants NNS warrants VBZ warrant VB warrant VBP Warrens NNS warring VBG warriors NNS wars NNS Wars NNS WARS NNS warts NNS war VB washed VBD washed VBN washing VBG wash VB wash VBP wasted VBD wasted VBN wastes NNS wastes VBZ waste VB wasting VBG was VBD Was VBD WAS VBD watchdogs NNS watched VBD watched VBN watchers NNS watches NNS watches VBZ watching VBG Watching VBG watch VB WATCH VB watch VBP watered VBD waterfalls NNS watering VBG waters NNS water VB waterworks NN|NNS watts NNS waved VBD wavelengths NNS wavered VBD wavering VBG waves NNS waving VBG waxed VBD waxed VBN ways NNS Ways NNS weakened VBD weakened VBN weakening VBG Weakening VBG weakens VBZ Weakens VBZ weaken VB weaken VBP weaknesses NNS wean VB weapons NNS wearing VBG Wearing VBG wears VBZ wear VB Wear VB wear VBP weasling VBG weather VB weather VBP weaves VBZ weaving VBG webs NNS wedded VBN weddings NNS wedged VBD wedged VBN wedge VB Wednesdays NNS weds VBZ wed VBN Weeds NNS weekdays NNS weekends NNS weeklies NNS weeknights NNS weeks NNS weepers NNS weeping VBG weighed VBD weighed VBN weighing VBG Weighing VBG weighs VBZ weighted VBN weightings NNS weights NNS weight VB weigh VB weigh VBP welcomed VBD welcomed VBN welcomes VBZ welcome VB Welcome VB welcome VBP welcoming VBG welded VBN wells NNS well-stated VBN well-wishers NNS went VBD WENT VBD wept VBD were VB Were VB were VBD Were VBD we VBP whacked VBD whacked VBN whack VB whales NNS wheelbases NNS wheeled VBN wheels NNS wheezing VBG when-issued VBN while VB whimpers NNS whims NNS whipped VBN whippings NNS whipping VBG whipsawed VBN whipsaw VB whips VBZ whip VB whirlwinds NNS whirring VBG whisked VBN whispered VBN whispering VBG whispers NNS whistled VBN whistles NNS whistle VBP whistling VBG whites NNS whitewashed VBN whittled VBN whizzes NNS wholesalers NNS Wholesalers NNS wholesaling VBG whooping VBG whoosh VBP widened VBD Widened VBD widened VBN widening VBG widens VBZ widen VB widen VBP widgets NNS widowed VBN widows NNS wielded VBN wielding VBG wields VBZ wield VB wiggled VBD wiggle VBP wiggling VBG wigs NNS Wilfred VBD willies NNS willing VBG wills NNS will VB wimping VBG wimp VB windfalls NNS winding VBG windows NNS windshields NNS winds NNS wind VB wind VBP wineries NNS wines NNS wings NNS winking VBG winners NNS Winners NNS winning VBG Winning VBG wins NNS wins VBZ winters NNS win VB win VBP wiped VBD wiped VBN wipe VB wipe VBP wiping VBG wired VBN wires NNS wiretaps NNS wire VB wisecracks NNS wished VBD wished VBN wishes NNS Wishes NNS wishes VBZ wishing VBG wish-lists NNS wish VB wish VBP witches NNS withdrawals NNS withdrawing VBG withdrawn VBN withdraw VB withdrew VBD withering VBG wither VBP withheld VBD withheld VBN withholdings NNS withholding VBG withhold VB Withhold VB withhold VBP withstanding VBG withstand VB withstood VBD withstood VBN witnessed VBN witnesses NNS Witnesses NNS witnessing VBG Witness VB wives NNS Wives NNS wizards NNS woes NNS woken VBN woke VBD wolves NNS womanizing VBG women NNS Women NNS wonderbars NNS wondered VBD wondering VBG wonders NNS wonders VBZ wonder VB wonder VBP won NNS won VBD won VBN WON VBP woodchucks NNS wood-products NNS woods NNS wooed VBN wooing VBG woo VB worded VBN word-processing NNS words NNS wore VBD workbooks NNS workdays NNS worked VBD worked VBN workers NNS Workers NNS WORKERS NNS workings NNS working VBG Working VBG working VBG|NN workmen NNS worksheets NNS Worksheets NNS works NNS workstations NNS works VBZ work VB Work VB work VBP WORK VBP world-affairs NNS worlds NNS worms NNS worn VBN Worn VBN 
worried VBD worried VBN Worried VBN worriers NNS worries NNS Worries NNS worries VBZ worrying VBG worry VB worry VBP worsened VBD worsened VBN worsening VBG worsen VB wounded VBN wounds NNS wound VB wound VBD wound VBN woven VBN wows VBZ wracked VBD wracked VBN wrack VBP wrangling VBG wrapped VBD wrapped VBN wrappers NNS wrapping VBG wraps VBZ wrap VB wreaked VBD wreaking VBG wreak VB wrecked VBD wrecking VBG wreck VB wrenched VBN wrenching VBG wrested VBN wrestlers NNS wrestles VBZ wrestle VB wrestling VBG wrest VB wriggling VBG wring VB wrists NNS write-downs NNS writedowns NNS write-offs NNS writeoffs NNS writer\/producers NNS writers NNS writes VBZ Writes VBZ write VB write VBP writhing VBG writings NNS writing VBG Writing VBG written VBN Written VBN wronged VBN wrote VBD Wrote VBD wrought VBD wrung VB X-rays NNS yachts NNS yanked VBD yanked VBN yanking VBG yank VB yards NNS yearbooks NNS yearlings NNS years NNS Years NNS YEARS NNS yelled VBD yelling VBG yellows NNS yells VBZ yell VB yelped VBD yen NNS yielded VBD yielded VBN yielding VBG yields NNS Yields NNS yields VBZ yield VB yield VBP youngsters NNS Younkers NNS youths NNS yuppies NNS zappers NNS zapping VBG zeroing VBG zeros NNS zero VB zero VBP zigzags NNS zig-zag VBP zip VB zip VBP zlotys NNS zombies NNS zones NNS Zones NNS zoning VBG zoomed VBD zoomed VBN zoom VB ================================================ FILE: ccgbank/data/wsj-nns-vb-stems ================================================ 12-inch 15 16-year-old 1850 1890 18-year-old 1900 1920 1930 1940 1950 1960 1970 1980 1990 19-year-old 204 20 20-stock 3090 '30 30 323 '40 '50 '60 '70 747-400 757-200 75-cents-an-hour '80 80 8300 '90 a330-300 abacus abandon abandon abandon abandon abandon abandon abandon abate abate abate abate abate abdicate abet abet abide ability abolish abolish abolish abolish abort abortion-rights abortion-rights abortion abound abound abound abound abrasive abridge absence absent absent absolve absolve absorb absorb absorb absorber absorb absorb absorb absorb abstain abstention abstract abuse abuse abuse abuse abuse academic accede accede accelerate accelerate accelerate accelerate accelerate accelerate accent acceptance acceptance accept accept accept accept accept accept accessory accessory access accident acclaim accolade accommodate accommodate accommodate accommodate accommodation accomodate accompany accompany accompany accompany accompany accomplish accomplish accomplish accomplishment accomplish accord accord accord accord accord accord accountant account account account account account account account account accrete accrual accrue accrue accrue accrue accrue accumulate accumulate accumulate accusation accuse accuse accuser accuse accuse accuse accuse accustom ace achieve achieve achievement achieve achieve achieve achieve ache acidify acid acknowledge acknowledge acknowledge acknowledge acknowledge acknowledge acorn acquaint acquiesce acquiesce acquire acquire acquire acquirer acquirer acquire acquire acquire acquisition acquisition acquit acquit acre across-the-board-cut act act act act action activate activate active activist activist activity actor act act actuary act act adapt adapt adapt adapt adapt adapt add add add add addict addict add add addition additive address address address address address address address address add add add add add adept adhere adhere adhesive adjourn adjudicator adjust adjust adjuster adjuster adjust adjustment adjust adjust adjust adjust adman administer administer administer administration administrator 
admire admire admirer admire admission admit admit admit admit admit admit admit admit admonish adolescent adopt adopt adopt adopt adopt adopt adopt adopt adorn adorn adorn adr ad ad ad adult adult advanced-ceramic advance advance advancement advancer advancer advance advance advance advance advance advance advantage advantage adventure adversary advertise advertise advertisement advertiser advertiser advertise advertise advertise advertise advertise advertise advise advise adviser advise advise advise advise advise advisory advocate advocate advocate advocate advocate advocate advocate advocate aerobic affair affected affect affect affect affection affect affect affect affidavit affiliate affiliate affiliate affiliate affinity affirm affirm afflict afflict afflict afford afford afford aftereffect aftereffect after-hour afternoon aftershock aftershock aftershock age agency agenda agent agent age age aggravate aggravate aggravate aggravate aggravate aggravate aggregate age agitate agonize agree agree agree agree agreement agree agree agree agree agriproduct aichus aid aid aid aide aide aid aid aid aid ail ailment ail aim aim aim aim aim aim aim aim aim aim aircraft air air airfield air airlift airlift airliner airline airline airman airplane airport air air air air airwave aisle be be alarm alarm alarm alarm alarm albanian album alchemist alcoholic alert alert alert alert alienate alienate alienate alienate alien align align allay allay allegation allege allege allege allege allege allege allege allergy alleviate alleviate alley alliance ally ally ally alligator allocate allocate allocate allocate allocate allocation allotment allowance allow allow allow allow allow allow allow allow allow alloy allrightnik allude allusion ally ally alter alter alter alternate alternate alternate alternative alter alter alter aluminum-maker alumnus amah amalgamate amalgamation amass amass amass amass amass amateur amaze amaze ambassador ambiguity ambition amble ambush amend amend amend amend amendment amend amenity amenity a-man americana americanize american american amortize amount amount amount amount amount amount amount amphobile amplify amplifier amplify amplify amp amuse amusement amuse be be be anachronism analysis analysis analyst analyst analyze analyze analyze analyze analyze anchor anchor anchor anchor and anemia anemic angel anger anger anger angle animal animal animate animosity ankle anne anniversary announce announce announce announcement announce announce announce announce annoy annoy annualize annualize annuity annuity annuity anoint anomaly answer answer answer answer answer answer answer answer antagonist antagonize anteater antecedent antelope anther anti-abortionist antibody anticipate anticipate anticipate anticipate anticipate anticipate anticipate antic anti-hero anti-infective antipathy anti-programmer antiquity ant anxiety apartment apology apologist apologize apologize apologize apologize appal apparition appeal appeal appeal appeal appeal appeal appeal appearance appear appear appear appear appear appear appear appear appease appease appendage append append applaud applaud applaud applaud applaud apple appliance applicant application application apply apply apply apply apply apply apply apply appoint appoint appoint appointee appointment appoint appraisal appraise appraiser appraise appreciate appreciate appreciate appreciate appreciate appreciate apprehension apprise approach approach approach approach approach approach approach appropriate appropriate appropriation 
appropriation appropriator approval approve approve approve approve approve approve approve approximate arab arbitrager arbitrager arbitrageur arbitrage arbitrage arbitrate arbitrate arborist arb arb arcade arch architect architect arch area area arena be be be argue argue argue argue argue argue argue argument arise arise arise arise arise arise arkansa armadillo arm arm army armpit arm arm arm aroma arise arouse arouse arouse arouse arouse arpeggio arraignment arrange arrange arrangement arrangement arrange arrange arrange arrears arrest arrest arrest arrest arrest arrival arrive arrive arrive arrive arrive arrive arrive arrow arsenal artery article articulate artifact artillerist artist artist art art artwork asahus ascend ascertain ascribe ascribe ash ashland ashtray ask ask ask ask ask ask ask ask ask aspect aspen aspersion aspiration aspire aspire aspire aspire assail assail assassinate assassinate assassinate assassinate assassination assault assault assemblage assemble assemble assemble assemble assembly assemble assert assert assert assertion assert assert assert ass assess assess assess assessment assess assess asset asset asset assign assignment assign assign assign assimilate assistant assist assist assist assist assist associate associate associate associate associate association assuage assume assume assume assume assume assume assume assume assumption assurance assure assure assure assure assure asteroid astound astronaut eat eat athlete athletics atlanti atoll atom atone atrocity attach attach attach attach attach attach attack attack attacker attack attack attack attack attack attack attain attain attarck attempt attempt attempt attempt attempt attempt attempt attempt attendant attendant attend attend attendee attend attend attend attend attention attest attest attic attitude attitude attorney attorney attract attract attract attract attraction attract attract attract attribute attribute attribute attribute attribute attribute attribute auction auction auction auction auction audience audiocassette audiophile audit audit audit audition auditor auditor audit audit audit augment auspices author authority authority authorization authorize authorize authorize authorize authorize author aution auto-emission autograph autograph autograph auto\/homeowners automaker automate automate automate automate automobile automobile auto-sale auto autumn avenge avenue average average average average average average average aver avert avert avert avert aviator avoid avoid avoid avoid avoid avoid avoid avoid await await await await await awaken award award award award award award awake axiom axle b-2 babelist baby baby backdate back-date back back back backer backfire backfire backfire backfire backfire backflip background back backlog backlot backpacker backpack backpedal backslap back back back back-up back back bacterium badge bag bail bailiff bail bailout bail bakery baker balance balance-of-payment balance balance balance balance balance balcony bale balk balk balk balk balk ballerina ballet balloon balloon balloon balloonist balloon balloon balloon ballot ballot ballpark ballplayer ball ballyhoo bamboozle banana bandage band bandy band band bang bang banish banish banker banker banker bank bankroll bankroll bankroll bankruptcy bankrupt bankrupt bank bank bank bank bank ban ban banner ban banshee ban ban ban ban barb barber barb bargain bargain bargain-hunter bargain-hunt bargain bargain bargain barge barge bare bark barnacle barn baron bar bar bar barrel barrel barrel barricade barrier 
bar bar bar bar bartender barter bar bar baseball base base base basement basis base base base bash bash bash basics base basket basket bastion batch bath bathroom bath bat battalion bat bat batter batter battery batter batter bat battle battle battle battlegroup battlements battle battle battle battle battle bay beach bead beam beam beam beanball bean bearing bear bear bear bear bear bear beastie beast beat beat beat beat beat beat beat become become become become become become become become become bedevil bedevil bedfellow bedfellow bedpan bedroom bed beef beef beef beef be be be beep beep beer bee befall befall befall befriend befuddle begin beggar beg beg beginning begin begin begin begin begin begin beget beg begin beg behave behave behave behave behave behavior behead behemoth being be be be be beleaguer belfry belie belief belie belie believe believe believe believe believe believe belittle belly bellow bellringer bell bellwether belly-flop belong belongings belong belong belong belong belt belt bemoan bench benchmark bend bend benefactor beneficiary beneficiary benefit benefit benefit benefit benefit benefit benefit benefit benefit bequeath bequest berate berry beset besiege best best bestir bestow beta betray bet bet betters better bet bet bet bet bet be be be be beverage beware beware beware bewilder bewilder bewitch bias bias bible bicker bicker bicycle bidder bid bid bid bid bid bid bid bid bifurcate biker biker bike bike bilge bilk bill bill billing billing bill billion billion billow bill bill bill bill bill bill binder bind binge binoculars bin bioengineer biographer biologist biomedical-product biopsy bird bird birthday birth biscuit bishop bite bite bite bite bit bite bite blab black blacken blacklist blackmail blackmailer blackmail blackmail blackout black black blackstone blade blame blame blame blame blame blame blame blame blanding blanket blanket blanket blare blast blast blast blast blaze blaze bleacher bleed bleed bleed blemish blend blend blend bless blessing bless blow blindfold blind blini blink blinkers blink blink blip blip blip blitz block block block block block block block block blonde bloody bloodlet blood bloom blossom blossom blossom blot blow blow blow blow bludgeon bludgeon blueblood blue-chip blues blues blunder blunder blunder blunt blunt blunt blur blur blurt blurt blur bmw board boardroom boardroom board board boast boast boast boast boast boater boat boat bodega bode bode body bodyworker boe bog bog bog boil boiler boil boil boil bolster bolster bolster bolster bolster bolster bolster bolt bolt bombard bombard bombard bomb bomb bomber bombing bomblet bomb bomb bond bond-holder bondholder bondholding bond bond bond bone bone bonnet bonus bonus boo book book booker booking booking book booklet book book bookstore book boom boom boomer boom boom boom boost boost booster boost boost boost boost boost boost boost booth booth bootleg boot booze border border border bore bear bore bear bear bear borrow borrow borrow borrower borrower borrowing borrow borrow borrow borrow borrow boss boss botch bother bother bother bother bother bottle bottleneck bottler bottle bottle bottle bottom bottom bottom bottom bottom buy buy bounce bounce bounce bounce bounce bounce boundary bound bound bound bind bourbon bourse boutique bout bouygue bow bow bow bowl bowl bowl bow bow box boycott boycott boyfriend boy boy brace brace brace brace brag brag braid brain brake brake brake branch branch branch branch branch brandish brandish brand brand brand brassiere brat brave brave 
brave brawl brazen brazilian breach breach breach breach bread breakdown breaker break break break break breakthrough break break breast breathe breathe breathe breath breed breed breeder breeder breed breed breed breeze brethren brew brewery brewery brewer brew brew bribe bribe bribe bribe bribe brick bridge bridge bridge briefcase brief brief briefing briefs briefs brief brigade brighten brighten brighten brim bring bring bring bring bristle bristle bristle british broadcaster broadcaster broadcast broadcast broadcast broadcast broadcast broadcast broadcast broadcast broaden broaden broaden broaden broaden brochure break brokerage broker-dealer broker broker broker broker break bronco bronc brother bring bring browbeat browse browse bruise bruise bruise brushback brush brush brush brush brush brussels bubble bubble buccaneer buck buck bucket buck buck buckle buckle buckle buck buck buck buck buddy bud budge budge budget budgeteer budget budge bud buffer buffet buffet buffet buffet buff buff bug bug bug builder builder building-material building-product building building build build build build build build build build build bulb bulge bulkhead bulldoze bulldozer bulletin bullet bullhorn bully bully bull bull bully bumble bumble bumble bump bump bump bump bum bunch bundle bundle bundle bungle bun buoy buoy buoy buoy buoy buoy burble burden burden burden burden burden bureaucracy bureaucrat bureaucrat bureau burgeon burger burglary burglarize burg burial bury bury bury burn burn burn burn burnish burnout burn burn burn burn burn burst burst burst burst burst bury bury bury bus bushel bush busy business-communication business business business-machine businessman businessman business-partner busload bust bust bust bust butler butterfly button buttress buttress buttress butt buy-back buy-back buyer buyer buyer buying buy buy buy-out buyout buy buy buy buy buy buy buzz buzz buzz buzzword bylaw byline bypass bystander cabin-crew cabinet cable cab cache cadet cadge cafe cafeteria cake calamity calavera calculate calculate calculate calculate calculate calculate calculate calculate calculation calculator calibrate californian call call call call call call caller caller call call call call call call call call call calm calm calm calorie caltran calf camera come camouflage campaign campaign campaign campaign camp camper camp campus campus canal cancel cancel cancel cancel cancellation cancel cancel cancer cancer-suppressor candidate candy candle can can can canvas canvass canyon capability capacity capacitor capital-asset capital-drain capital-gain capitalgain capital-goods capitalist capitalize capitalize capitalize capitalize capitalize capital capital-to-asset capita capitulate cap cap cap cap capsule captain caption captivate captive capture capture capture capture capture cap cap carat carbide-product car-dealer cardholder cardinal card care care careen careen careen career career care care care caricature caricature carillon care carnivore carpenter carpetbagger carpet carpet carp carry carry carry carrier carrier carry carry-forward carryforward carry carry carry carry car car cart cart carton cartoonist cartoon cartridge cart cart carve carve carver carve carve carve carve cascade cascade caseload case case case cash cash cash cash casing casino casino casket cassette castigate castigate castigate casting cast castle cast cast cast cast cast casualty cataclysm catalog catalog catapult catapult cataract catastrophe catcher catch catch catch catch catch category categorize cater cater cater 
cater cater catfish catheter cathode cat cattle cattle catch catch cause cause cause cause cause cause cause caution caution caution caution caution caveat cave cave c.d.s cd cease cease cease cease cease cede cede cede cede ceiling celebrate celebrate celebrate celebrate celebrate celebration celebrity cellar cellist cell celluloid cement cement cement-maker cement censor censor censure centenarian center center center center center center center centimeter centralize centralize cent cent century centurion ceo ceramic cereal ceremony certificate certificate certificate certify certify cfc chafe chafe chain chain chain chair chair chairman chair chair chalk chalk challenge challenge challenger challenge challenge challenge challenge challenge challenge chamber chamber champagne champion champion champion championship champion champion champ chance chance chandelier change change change change change change change change change change change change channel channel channel channel chan chant chant chant chant chap chapter characteristic characterize characterize characterize characterize characterize characterize character character charge charge charge-off charge charge charge charge charge charge charge charity charity charlatan charlestonian char charter charter chart chart chart chart chase chaser chase chase chassis chasten chasten chastise chastise chastise chat chat chauffeur chauffeur cheapen cheat cheater cheat cheat cheat checkbook check check check checkpoint check check check check check cheer cheer cheer cheerleader cheer cheer cheese chef chelicerate chemical chemical chemical-weapon chemist cherish cherish cherry cherub chew chew chew chew chew chicken chicken chide chide chide chief child child child chill chill chill chime chimney chimpanzee chinese chip chip chip chisel chlorofluorocarbon choice choice choke choke choke choke choose choose choose choose choose chop chop chop chopstick chop chord choreographer chore chore chortle chorus choose choose christian chromosome chronicle chronicle chuck chuckle chuckle chuckle chug chum chunk church church-goer churn churn cial cigarette cigar cinch cincinnatus circle circle circle circuit circular circulate circulate circulate circulate circulate circulation circumstance circumvent circumvent circumvent citation cite cite Cite cite cite cite citicorp city city cite Cite citizen citizen citizen civic civilian civil-rights clothe clothe claimant claimant claim claim claim claim claim claim claim claim claim claim clamber clamor clampdown clamp clamp clamp clang clank clarification clarify clarify clarify clarify clash clash clash clash class class class classic classic classification classify classify classify classify classmate classroom clause claw clean clean cleaner clean-fuel clean clean cleanse cleanse cleanser cleanse cleanse clean clean clearance clear clear clear clear clear clear clear clergy cleric clerk click clientele client client climb climb climber climb climb climb clinch clinch cling cling clinical-product clinic clinker clip clip clipping clip clip clique cloak clobber clobber clobber clock clock clock clog clog clone clone close close close close close close close close close close closing close close closure clothe clothes clothier cloud cloud cloud cloud cloud clown club club club club club clue clump cluster cluster clutch clutch clutter clutter c'mon coach coach co-anchor coast coaster coast coat coating coat coat coattail co-author co-author coax coax cobble cobble cobb co-chairman cockatoo cockroach 
cocktail coconut coddle code co-defendant code-name code code co-developer codify codify co-edit coerce co-exist coextrude coffer co-found co-founder cognoscenti cohere cohort co-host coincide coincide coincide coin coin cola cold-cut coli collaborate collaborate collaborate collaborate collaborator collage collapse collapse collapse collapse collapse collapse collar collateralize colleague colleague collect collect collectible collectible collect collection collective collectivizer collector collector collect collect collect college college college-sport colloquy colonialist colony colonist color-code color-code color color color columnist column co-manage co-manage combatant combate combat combat comb combination combine combine combine combine combine combine combine comb combine comedy comestible come come comet come come come come come come comfort comfort comfort come come command commander command commando command command commemorate commemorate commemorate commence commence commend commend commentary commentator commentator comment comment comment comment comment comment comment comment commercialize commercialize commercial commercial commission commission commissioner commission commission commission commission commission commitment commit commit commit committee committee committee committe commit commit commit commodity commodity commoditize communicate communicate communicate communicate communication communique communist communist community commuter commuter commute commute compact company company company companion compaq compare compare compare compare compare compare compare compare compare compare comparison compatriot compel compel compel compel compensate compensate compensate compensation compete compete compete Compete compete compete compete Compete compete competition competitor competitor compile compile compile compile compile complain complain complain complain complaint complaint complain complain complement complement complete complete complete complete complete completion complex complexity complicate complicate complicate complicate complication comply comply compliment compliment comply comply comply component compose composer composite composition compound compound compound compound compound compound compress compressor comprise comprise comprise comprise comprise compromise compromise compromise compromise compromise compromise compulsion computation computerize computerize computerize computer-product computer-service computer computer computer computer-system compute compute compute comrade conceal conceal conceal concede concede concede concede concede concede conceive conceive conceive conceive concentrate concentrate concentrate concentrate concentrate concentrate concentration conception concept concern concern concern concern concern concern concern concern concern concerto concert concession conclude conclude conclude conclude conclude conclude conclusion concoct concoction concoct concur concur condemn condemn condemn condemn condemn condenser condition conditioner condition condominium condom condone condone condo conduct conduct conduct conduct conduct conduct conduit conduit cone conferee conferee conference confer confer confer confer confer confess confess confess confess confession confidant confide confide confide configuration confine confines confirm confirm confirm confirm confirm confirm confirm confiscate confiscate confiscate confiscate conflict conflict conflict conflict conform conform conform conform confrontation confrontation 
confront confront confront confront confront confront confuse confuse confuse confuse confuse confusion conglomerate congratulate congratulate congratulate congressman congressman conjure conjure connect connect connection connect connect connect connotation connote conquer conscript consent consent consent consent consent consequence conservationist conservationist conservative conservative conservative conserve consideration consider consider consider consider consider consider consider consider consider consign consign consist consist consist consist consist consist consist con console console consolidate consolidate consolidate consolidate consolidate consolidation consonant consort conspiracy conspirator conspire conspire conspire constituency constituent constitute constitute constitute constitute constrain constrain constraint constrain constrict constrictor construct construct construction construct construe construe construe construe consultant consultant consultation consult consult consult consume consume consumer-electronics consumer-goods consumer-product consumer consumer consume consume consummate contact contact contact contact contact contain contain container contain contain contain contain contaminate cont'd. contemplate contemplate contemplate contemplate contemplate contemplate contemporary contemporize contend contend contender contend contend contend contend contention contents content contestant contest contest contest contest contest continental contingency continue continue continue continue continue continue continue continue contraceptive contract-drill contract contract contract contract contraction contractor contractor contract contract contract contract contract contradict contradiction contradict contradict contra contrast contrast contrast contrast contrast contrast contrast contravene contribue contribute contribute contribute contribute contribute contribute contribute contribution contributor control control controller control control control control control controversy con convenant convene convene convene convene convene convene conventional-arm conventioner convention-goer convention converge conversation conversion convert convert converter convertible convert convert convert convert convert convey convey convey convey convict convict conviction convict convict convince convince convince convince convince convince convince convolute convolution convoy convulsion cookbook cook cookie cook cook cook coolant cool cool cool cool cool cool cool cooperate cooperate cooperate cooperate cooperate cooperative coordinate coordinate coordinate coordinate coor co-payment cope copy copy copier copy co-pilot cope co-president co-produce copycat copy copyright copyright copy copy copy core corkscrew corner corner cornerstone corner cornice corporate-earnings corporate corporation corporation corral correct correct correct correction correct correct correct correlate correspond correspondent correspond correspond corridor corroborate corvette cosmetic cosmetic cosmology co-sponsor co-sponsor co-sponsor costa cost cost cost cost costume costume cost cost cost cost cot cottage couch couch cough cough cough cough cough councilor council counsel counsel counselor counsel counsel count count countenance counteract counteract counterattack counterbalance counterbidder counterbid counter-claim counterclaim counter counter counter counter countermeasure counterpart counterprogram counter countersue countersue countersue countervail counter counter county count count 
country country countryman count count count count count couple couple couple couplet coup-maker coupon coupon coup courier course course court court courtesy courthouse court courtroom court court court court court cousin covenant coverage cover cover covering cover cover cover covert cover cover covet covet coward cowboy cowboy cower co-worker cow cpa crab crack crack crack crack crackle crack crack crack craft craft craft craftsman craft cram cram cramp cram crane crane crank crank crank crank crash crash crash crash crash crash crash crater crate crave crawl crawl crawl crawl creak cream cream create create Create create create create create create create creation creator creator creature credentials credit-datum credit credit credit creditor creditor credit-rating credit credit credit credit creep crematorium creep creep crest crevasse crevice crew cry crime criminalize criminal crimp crimp cringe cripple cripple cripple crisis crisis crisscross criss-cross criterion criticism criticize criticize criticize criticize criticize criticize critic critic croak croissant crony crook croon croon crop crop crop crop crop cross-breed cross cross cross cross cross-pollinate crossroads cross-shareholding cross cross crouch crouch crowd crowd crowd crowd crowd crow crown crow cr crude cruiser cruise cruise crumble crumble crumble crumble crumble crumple crumple crunch crunch crush crush crush crush cry cryptographer crystal cry cry cub cuckoo cue cuff cull culminate culminate culminate culminate culprit cultivate cultivate cultivate culture cup curator curb curb curb curb curb curb curb cure cure cure cure curl curl currency current curry curse curse curtail curtail curtail curtail curtail curtain cushion cushion cushion customer customer customize custom custom cutback cutout cut cut cutter cutting-tool cut cut cut cut cut cut cut cut cuvee cvb cycad cycad cycle cyclical cycle cyclist c-yield czar dabble dabble dabble dab daily dalliance damage damage damages damage damage damage damage damp damp dampen dampen damp damp damp dam dancer dance dance dance danger danger dangle dangle dare dare dare darling dash dash dash dash database datum datum datum datum date date date date date date daughter daunt daunt dawdle dawn dawn day. day day dazzle deactivate deadbeat deadline deadlock dead deafen dealership dealer dealer dealings deal deal dealmaker deal deal deal deal deal deal death debacle debate debate debate debate debate debenture debtholder debtor debt debunk debut debut decade decade decapitalize decease deceive deceive deceive decelerate decelerate decentralize decentralize decertify decide decide decide decide decide decide decide decide decimate decision-maker decision decision deck deckhand deck deck declaration declare declare declare declare declare declare declare declassify decline decline decliner decliner decline decline decline decline decline decline decline decontaminate decorate decorate decorator decoy decrease decrease decrease decrease decrease decrease decrease decree decribe decry decry decry dec. 
dedicate dedicate deduce deduct deductible deduct deduction deduction deduct deed deem deem de-emphasize de-emphasize deem deem deepen deepen deer deer default default defaulter default default default default defeat defeat defeat defeat defeat defeat defect defect defection defection defect defect defendant defendant defend defend defender defender defend defend defend defend defend defense-electronics defens defer defer defer deficiency deficit defy defy define define define define define define definition deflate deflate deflate deflator deflect deflect deflect deform defraud defraud defraud defraud defunct defuse defuse defy defy degenerate degenerate degenerate degree delay delay delaying delay delay delay delay delay delay delegate delegate delegate delegate delete delete delete deletion deliberate deliberate deliberation delight delight delight delight delight deli delinquency delinquent delist deliver deliver delivery delivery deliver deliver deliver deliver deliver delouse deluge deluge delve delve demagogue demand demand demand demand demand demand demand demean demeanor demilitarize demobilize demobilize democracy democratize democratize democrat demographic demographic demolish demolish demolish demolish demonize demon demonstrate demonstrate demonstrate demonstrate demonstrate demonstrate demonstration demonstration demonstrator demote demur denationalize denial deny deny deny deny denims denizen denominate denomination denounce denounce denounce denounce denounce dentist dent denuclearize denude deny deny deny depart depart depart department departure depart depart depend depend dependent depend depend depend depend depend depict depict depict depict depict depict deplete deplete deplete deplete deplore deplore deploy deploy deport depose deposit deposit deposit deposition depositor deposit deposit deposit deposit depot depreciate depredation depress depress depress depress depression depress depress deprivation deprive deprive deprive deprive deprive deprive deprogramming depths deputy deputy derail derail derail deregulate deregulate deride deride derivative derive derive derive derive derive descendant descend descend descend descent describe describe describe describe describe describe describe description desert desert desert deserve deserve deserve deserve designate designate designate designate designation design design designee designer design design design design design desire desire desire desire desist desk despair despise despise despot destabilize destabilize destination destination destine destroy destroy destroy destroy destroy destroy detach detach detail detail detail detail detain detain detain detect detect detect detective detector detect detect detergent deteriorate deteriorate deteriorate deteriorate deteriorate deteriorate determine determine determine determine determine determine determine deter deterrent deter deter deter deter detest dethrone detractor detract detract devaluation devalue devalue devastate devastate devastate develop develop develop developer developer develop develop development development develop develop develop develop deviate deviation device devil devise devise devise devise devise devote devote devotee devotee devote devote devote devour devour dewater diabetic diagnose diagnose diagnose diagnostic diagram dialect dial dial dial dial diamond diaper diary dibenzofuran dice dicker dictate dictate dictate dictate dictate dictate dictatorship dictator do do die die die die die die-hard diehard diesel die die die die die die differ 
difference difference differential differentiate differentiate differ differ differ differ difficulty digest digest digest dig digit dignify dignitary dig dig dig dig dilute dilute dilute dilute dilute dimension dime diminish diminish diminish diminish diminish diminish dim dine dine diner dine dinner dinosaur dioxin diplomat dip dip dip dip dip dip direct direct direct direction directive directory director director direct direct dirk disability disable disabled-worker disable disadvantage disagree disagree disagreement disagree disagree disagree disallow disappear disappear disappear disappear disappear disappoint disappoint disappointment disappointment disappointment disappoint disapprove disapprove disapprove disapprove disarm disarm disassemble disassociate disaster disaster disavow disband disband disband disburse disbursement discard discard discard discern discern discharge discharge discharge disciple discipline discipline discipline disclaim disclose disclose disclose disclose disclose disclosure disclosure discolor discomfit disconnect disconnect discontinue discontinue discontinue disco disco discount discount discount discount discount discount discount discourage discourage discourage discourage discourage discourage discover discover discovery discover discover discover discover discredit discredit discrepancy discrepancy discrepency discriminate disc discuss discuss discuss discuss discussion discuss discuss disdain disdain disease disembark disenchant disengage disgorge disgruntle disguise disguise disguise disgust disgust dish dish dishwasher disincline disinfectant disintegrate disintegrate disk dislike dislike dislike dislocation dismantle dismantle dismantle dismantle dismay dismember dismiss dismiss dismiss dismiss dismiss dismiss dismiss disobey disorder disparage disparage disparage disparity dispatch dispatch dispatcher dispatch dispatch dispatch dispel dispel dispense dispense dispense dispense dispersant disperse disperse displace displace display display display display display display display displease disposable disposal dispose dispose dispose dispose dispose disposition disprove dispute dispute dispute dispute dispute dispute disqualify disqualify disregard disregard disrupt disrupt disrupt disruption disrupt dissatisfy dissect dissect disseminate disseminate disseminate dissent dissent dissenter dissent dissident dissident dissipate dissipate dissipate dissociate dissociate dissolve dissolve dissolve dissolve dissuade distance distance distance distil distiller distil distinction distinguish distinguish distinguish distort distortion distort distort distort distract distract distract distraction distract distribute distribute distribute distribute distribute distribute distribute distribute distribution distributor district districts\/states disturbance disturb disturb disturb disturb dither dive diverge diverge diversife diversification diversify diversify diversify diversify diversion diver divert divert divert divert divest divest divest divestiture divest divest dive dive dive divide divide dividend divide divide divide divide divide divide dive division divorce divorce divulge divvy divvy dizzy docket dock-sider doctor doctor doctor doctor doctrine docudrama documentary document document document document document document dodge dodge do do dog dog dog do-gooder dog dog dog dog do do doldrums dole dole dole dole dollar doll dolphin dome dominate dominate dominate dominate dominate dominate donate donate donate donate donate donate donation donation do 
don don donor don don don't don doom doom doom doomsayer doom doorman door dose dossier dote doth dot dot double-cross double double double double double double double doubt doubter doubt doubt doubt doubt doubt do do do do dovetail dive down downgrade downgrade downgrade downgrade downgrade downpayment downplay downsize downsize downsize downs downturn down dozen dozen draft draft draft draftsman draft drag drag drag drag drag drag drag drain drain drain drain dramatization dram drape drape drawback drawing draw draw draw draw draw draw draw dread dream dream dream dream dream dream dream dream dream dream dress dress dress dress dress dress dress draw dry dry drift drift drift drift drill drill driller drill drill drill drink drink drink drink drip drive driver drive drive drive drive drive drive drool drool drop droplet dropout drop drop dropper drop drop drop drop drop drought drove drive drown drown drown drug-sale drug drug drugstore drum drum drum dry dry d dub dub dub dub duck duckling duck duck duck duck duct dud duel duel due due duffer dig dull dummy dump dump dump dump dump dump dump dune dupe duplicate duplicate duplicate duplication durable-goods durable dust dust duty 'd dwarf dwarf dwarf dwarf dweller dwelling dwindle dwindle dye dye die dynamics dynamo earmark earmark earmark earmark earn earn earner earnig earnings earnings earnings earn earn earn earn earn earring ear earthling earthquake earthworm ease ease ease ease ease ease easterner east east eat eater eat eat eat eat eavesdrop ebb ebb eccentric echelon echo echo echo echo echo echo echo eclair eclipse eclipse economics economy economist economist economize edge edge edge edge edge edition editorial editorial editor edit educate educate educate education educator educator effect effect effect efficiency effort effort egg ego eject eke eke elaborate elaborate elaborate elapse elbow elder elect elect elect election election election electrical-product electrify electrochemical electrode electrogalvanize electrogalvanize electromagnet electronic-datum electronics electronic-system electro-optics elect elect element element elephant elevate elevate elevation elevator eliminate eliminate eliminate eliminate eliminate eliminate eliminate eliminate elite elite elitist elongate elude elude emasculate embargo embargo embargo embark embark embark embark embarrass embarrass embarrass embarrass embassy embattle embed embed embellish embezzle embezzle emblem embody embody embody embolden embolden embrace embrace embrace embrace embrace embroil emerge emerge emergency emerge emerge emerge emerge emerge emigrate emigrate emigre emissary emission emit emote emote emotion empathize emphasize emphasize emphasize emphasize emphasize emphasize emphaticize empire employ employee employee employee employee employer employer employ employ employ employ empower empower empower empower empty empty empty empty emulate emulate emulate enable enable enable enable enable enable enact enact enact enact encapsulate encase encircle enclose enclose encompass encompass encompass encounter encounter encounter encounter encounter encourage encourage encourage encourage encourage encourage encourage encourage encourage encroach encrust encrypt encumber endangered-species endanger endanger endeavor endeavor endeavor end end end ending end end endorse endorse endorsement endorser endorse endorse endorse endorse endow endow endow end end end end end-tail endure endure endure endure endure endure end end end enemy energy energize energy-service enforce 
enforcer enforcer enforce enforce enforce enforce engage engage engagement engage engage engage engage engineer engineer engineer engineer engineer engine english engrave engulf engulf enhance enhance enhancement enhance enhance enhance enjoin enjoin enjoin enjoy enjoy enjoy enjoy enjoy enjoy enjoy enlarge enlarger enlarge enlarge enlighten enlighten enlighten enlist enlist enlist enliven ennumerate enrage enrich enrich enrol enrol enrollee enrollment enroll enroll ensconce ensemble ensnarl ensue ensue ensue ensure ensure ensure ensure entail entail entail entangle enter enter enter enterprise enter entertain entertain entertainer entertain entertain enter enter enter enthusiasm enthusiast enthusiast entice entice entice entice entity entitle entitle entitlement entitle entitle entitle entitle entomb entrance entrant entrench entrench entrepreneur entrepreneur entry entrust entrust entrust entwine envelope environmentalist environmentalist environment envisage envisage envision envision envision envision envy eon epileptic episode epitomize equal equal equal equal equal equal equate equate equate equestrian equip equip Equip equip equip equip equity equity equivalent equivalent eradicate erase erase erase erase erase erase erasure erect erect erect erode erode erode erode erode erode erode erratum err err error err err err err erupt erupt erupt erupt escalate escalate escalate escalate escalator escape escape escape escape eschew escort escort escrow espouse esp essay essential establish establish establish establish establish establish establishment establish establish establsh estimate estimate estimate estimate estimate estimate estimate estimate estimate estimate estimator estrange ethic euphemism eurobond eurodebenture eurodollar eurodollar euroissue euronote european evacuate evacuate evade evader evade evade evaluate evaluate evaluate evaluate evaluate evaluate evaluate evaluation evaluation evangelist evaporate evaporate evaporate even evening even event event even everglade evidence evil-doer evince evince eviscerate evoke evoke evoke evolve evolve evolve evolve exacerbate exacerbate exacerbate exacerbate exacerbate exaggerate exaggerate examination examine examine examiner examiner examine examine examine examine examine example example exam excavate excavate excavator exceed exceed exceed exceed exceed exceed excel excel exception except excerpt excerpt excess exchange exchange exchange exchange exchange exchange excise excise excise excite excite exclaim exclude exclude exclude exclude exclude exclude exclude exclude exclude exclusion excorciate excoriate excursion excuse excuse excuse excutive exec execute execute execute execute execute execute execute execution execution executive executive executive executor exemplify exemplify ex-employee exempt exempt exemption exempt exercise exercise exercise exercise exercise exercise exercise exerpt exert exert exert exhale exhaust exhaust exhaust exhaust exhaust exhibit exhibit exhibition exhibitor exhibit exhibit exhibit exhort exile exile exist exist exist exist exist exist exist exit exit exit exit exonerate exonerate exonerate exorcise exorcism expand expand expand expand expand expand expand expansionist expansion expectation expectation expect expect expect expect expect expect expect expect expect expect expedient expedite expedite expedite expel expel expel expend expenditure expenditure expense expense experience experience experience experience experience experience experience experience experiment experiment experiment 
experiment experiment expert expert expiration expire expire expire expire expire expire explain explain explain explain explain explain explain explain explanation explode explode explode explode explode exploit exploiter exploit exploit exploit exploit exploit explore explore explore explore explosion explosion explosive export export exporter export export export export export export expose expose expose expose expose exposure expound express express express express expression express express expunge expunge extend extend extend extend extend extend extension extension exterior extinguish extort extort extort extract extract extract extract extract extradite extradition extrapolate extra extreme extremist extricate extrusion exude exude eyeball eyeball eyebrow eye eyeglass eye eye eye eyewitness eye f16 f-18 f18 fabricate fabricate fabrication fabricator fabric facade face face face facelift face face face face facilitate facilitate facilitate facility facing face face facsimile faction factory factory factor factor factor factor factory-job fact fade fade fade fade fade fad fail fail fail failing fail fail fail failure failure fail fail faint faint fair fajita fake fake fake fall fall fall fall fall fall falsify falsify falsify falter falter falter falter falter fame familiarize family family famine fanatic fancy fang fan fan fan fan fantasy fantasize fantasize fan fare fare fare fare fare fare fare fare farmer farmer farmer farm farm farmstead farm farm farmwife fascinate fascist fashion fashion fashion fastball fasten fastener fatality fatality father father fat fatten fatten fatten fault faultline fault fault fauna favor favor favor favorite favor favor favor favor fawn fax fax fear fear fear fear fear fear fear fear feast feast feather feat feature feature feature feature feature feature feature feed feed feed feedlot feedlot feed feed feed feeler feeling feeling feel feel feel feel feel feel fee fee foot fella fell fell fellow fall fall felony felon feel feel female feminist fence fend fend fend fend ferret ferret ferry ferry ferry fertilize fertilizer fertilize fester festival festivity festoon festoon fetch fetch fetch fetch fetch fetus feud feud fiberglass fiber ficial fidget fiefdom field field field field field field fighter-bomber fighter fight fight fight fight fight fight figure figure figure figure figure figure figure figure figure filbert filch file file filer file file file file filings file filipino fill fill fill fill fill fill fill film film film film film-maker film filter filter filter filter filter finagle finalist finalize finalize finalize final finance finance finances finances finance finance financial-crime financial-service financier financing finance finding find find find find find find find fine-art fine fine fine finesse fine-tune fine finger finger fingerling fingerprint finger finger fine finish finish finish finish finish finish finish finish fireball fire fire firefighter firehoop fireman fireplace fireproof fire fire fire fire firework firing fire firm firm firm firm firm firm firm fishery fisherman fish fissure fist fit fit fit fit fit fix fix fix fix fixture fix fizz fizzle fizzle flabbergast flag flag flame flame flank flap flap flare flare flare flashback flash flash flash flash flash flashlight flash flash flat flatten flatten flatten flatten flaunt flaunt flavor flaw flaw flaw flay flea fledge flee flee fleece flee fleet fleet flee fleshpot fly flex flicker flick fly flier fly fly flight-attendant flight flight flinch fling fling flip flip 
flip flip flirt flirt flirt float float float float float float float flock flock flock flog flood flood flood flood floor flop flop floppy flora flotation flounder flounder flounder flourish flourish flourish flourish flout flow flower flower flow fly flow flow flow flow fluctuate fluctuate fluctuate fluctuate fluctuate fluctuate fluctuation fluid flunk flunk fluoropolymer flush flush fly fly fly fly foam focus focus focus focus focus focus focus foe foe fog foil foil foil foil foil fold fold folder fold fold fold folk folk folly follow follow follower follow follow follow follow follow follow foment food-fish food-service food foodstuff fool fool fool fool foothill foothill foot footnote footnote footstep foot foot foray forbid forbid forbid forbid force force force force force force force force force forecaster forecast forecast forecast forecast forecast forecast forecast forecast foreclose foreclose foreclose foreclose foreclosure foreclosure forefather foreigner foreigner forensic forerunner foresee foresee foresee foresee foresee foreshadow forest-product forest-product forest forfeiture forfeit forge forgery forget forget forget forget forget forge forging forge forgive forgive forgive forget forget forget forgo fork fork forklift fork formality formalize format form form form form form form formula formulate formulate formulate formulate formulation form form forsake fortify fortune forum forward fossil foster foster foster foster fight fight foul foundation found found found founder founder founder founder found found find find fountain four-fifth fox fraction fraction fracture fracture fragment fragment fragment frailty frame framer frame frame frame franchise franchisee franchisee franchiser franchise franchise franchise franchise franciscan franc fraternity fraud fray fray freak freak freedom free free free free freeholder free free free free freeway freezer freeze freeze freeze freighter freight french frequency frequent freshman fret fret fret fret fret friction friday fry friendship friend friend frier frieze frigate frighten frighten frighten fringe frippery fritter fritter frock frogman frog frolic frolic frond front froth freeze freeze fruit frustrate frustrate frustrate frustrate frustration fry fudge fudge fuel fuel fuel fuel fuel-service fuel fuel fugitive fujus fulfil fulfil fulfil fulfil fulfill fulfill fulmination fume fume fume fumper functionary function function function function function fundamentalist fundamental fund fund fund fund fund-raiser fundraiser fund-raiser fundraise fund fund fund fund fund fund fungus funnel funnel funnel funnel furlough furlough furnace furnish furnishings furnish furnish furrier furrow fur further further further fuss future future future gadget gag gain gain gain gainer gainer gain gain gain gain gain gain galaxy gallery gallon gallop gallstone galvanize galvanize galvanize gambler gamble gamble game gangbuster gang gangster gape garage gardener gardenette garden garden garden garment garner garner garner garner gas gas gas-gather gasoline gasp gasp gate gather gather gather gathering gather gather gather gather gauge gauge gauge give give gaze gear gear gear gear gear geek geek gemsbok gem gender generalist generalization generalization generalize generate generate generate generate generate generate generation generator gene genre gentlelady gentleman geoscience german german germ gesture gesture get get get get get-together get get get get get geyser ghetto ghostbuster ghostbuster ghost giant giant gift gilt gilt gimmick gird 
gird girl giveaway giveaway giveback give give give give giveth give give give give give give gizmo glamorize glamorize glance glare glass glaze glaze gleam glean glean glide glide glimpse glitch glitterati gloat gloater gloat gloat gloat globalist gloss gloss glove glow glow glue glue glut glut glut glycol gnaw goal goat gobble gobble goblin god go go goings-on go go goldband gold golfer golf goliath go go go gon gon goody goods goodyear gore gore gorilla gossip get get get get get get go go go govern govern govern governmental-affair government-relation government-security government-set governmentset government government governor governor govern govern govern grab grab grab grab grab grab grace grader grade grad graduate graduate graduate graduate graduate graft grain grain grain grammy gram grandchild grandee grandkid grandmaster grandmother grandparent grange grant grant grant grant grant grant grant grape grape graphic graphic graph grapple grapple grapple grasp grasp grasp grassroot gratuity gray graze grazer graze great-grandchild great greenhouse green green greet greet greeting greet greet greet grenade grow grow grow gridlock grievance grill grimace grimace grimace grinder grind grind grind gringo grin grin grin gripe gripe grip grip grip groan grocery grope gross grotto groundbreaker ground ground-handling ground grounds grind grind grind group group group group grouse Grouse grouse grovel grower grow grow grow growl grow grow grow growth growth grow grow gruel grumble grumble guarantee guarantee guarantee guarantee guarantee guarantee guarantee guarantee guard guard guard guard guard guard guard guber-peter guerrilla guess guess guess guess guest guest guide guide guideline guideline guidepost guide guide guide guilder gunboat gunman gunman gun gunner gunsling gun gun guru guru gush gush gut gut guy guy guzzle gymnastics gyrate gyrate gyrate gyration habeas habitat habit hacker hackles hack hack have have have haggle haggling haggle hail hail hail hail half-state hall hallway halogenate halt halt halt halt halt halt halt halve halve half halve halve hamburger hammer hammer hammer ham hamper hamper hamper hamper hamper hamstring hamstring hamstring handbill handbook hand-carry handcuff handcuffs hand hand handicapped handicap handicap handle handle handler handle handle handle handle handpick hand handstand hand hand hang hang hang hang hang happen happen happen happen happening happen happen happen happen harangue harangue harass harass harass harbor harbor harbor harbor harden hardship harm harm harm harm harm harm harness harp harp harp harry harvest harvest harvest hash hassle hasten hasten have have have have have hatch hatch hate hate hate hate hat haul hauler haul haul haunt haunt haunt haunt haunt haunt haven have have have have have have have have hawaius hawker hawk hawk hawk hazard hcfc hdtv headache head head head head head headlight headline headline headline headphone headquarters headrest headset head head head head heal heal health-product heal heap heap heap hear hear hear hearing hearing hear hear hear hear hearten heart hear hear heat heater heat heat heat heat heave heave heaven heave heavyweight heckle hedger hedge hedge hedge hedge hedge heebie-jeeby heed heed heed heel heighborhood heighten heighten heighten height heir hold hold helicopter help help help help help help help help help help help hemorrhage hemorrhage hemorrhoid hen hen herald herald herald herbicide hercule herd herd herniate hero heron hesitate hesitate hesitate hew hew hew hide hideout 
hider hide hide hide hide highland highlight highlight highlight highlight highlight highlight highlight high-rise high hightail hightop highway highway hike hiker hike hill hinder hinder hinder hinder hinder hinge hint hint hinterland hint hint hint hint hint hip hire hire hire hire hire hire hire hire hispanic hiss historian historical-claim historicize hitch hitch hit hit hitter hit hit hit hit hit hit hit hoard hoard hoard hobby hobble hobble hobble hobbyist hobo hoe hog hog hog hoist holder holder holding holding hold hold holdout holdover hold holdup hold hold hold hold hold hold hole hole hole holiday holler homefe homeowner homer homer homer home homicide homosexual homosexual honduran hondura hone hone honorarium honor honor honor honor honor hood hoodwink hook hook hook hook hookup hook hoof hope hope hope hope hope hope hope hope hope Hope hop hopscotch hop horizon hormone horn horoscope horrible horror horse horse horticultural-product hosanna hose hose hose hospitalization hospitalize hospital hospital hostage host hostility host host host host hot-cereal hotel-casino hotelier hotel hotline hound hound hour hour house household house house house house houseware housewife housing hover hover hover howl huckster huddle hug hug hug hulk humanities humanize human-resource human-rights human humble hum hunch hundred hundred hang hang hunker hunt hunter-gatherer hunter hunt hunt hunt hunt hurdle hurl hurl hurl hurricane hurry hurry hurry hurt hurtle hurt hurt hurt hurt hurt hurt husband husband hustler hustle hybrid hyena hype hypermarket hype hypnotize hypocrite hypothesize ice-bagger ideal idea idea identify identify identify identify identify identify identity ideology ideologue ideologue idiot idle idle idle idle ignite ignite ignite ignore ignore ignore ignore ignore ignore ignore Ignore illegality illness ill illuminate illuminate illusion illustrate illustrate illustrate illustrate illustrate illustration image imagine imagine imagine imagine imagine imagine imagine imbalance imitate imitate imitate immerse immigrant immigrate immunity impact impact impact impair impair impart impeach impede impede impede impediment impede impel impend imperative imperfection imperialist imperil impersonation implant implant implant implant implement implement implement implement implement implicate implicate implication imply imply imply implore implore imply imply imply import import import importer import import import import import import impose impose impose impose impose impose impound impound impoverish impress impress impress impressionist impress imprison imprison imprison imprison impropriety improve improve improvement improve improve improve improve improve improvise impugn impulse inaugurate incarcerate incense incentive inch inch inch inch inch incident incision incite incline include include include include include include include include include income incongruity inconsistency incorporate incorporate incorporate incorporate incorporate increase increase increase increase increase increase increase increase increase increase increase increase increase increment incriminate incumbent incur incur incur incur incur indemnify independent index indexer index index index-future index index-option indicate indicate indicate indicate indicate indicate indication indicator index indict indict indictment indict individual individual indoctrinate induce induce inducement induce induce induce indulgence indulge indulge indulge industrialist industrialize industrialize industrial 
industrial industry industry inefficiency inequality inequity infant infant infect infect infect infection inference infer infiltrate infiltrate infiltrate inflame inflate inflate inflate inflate inflate inflation-adjust inflict inflict inflict inflow influence influence influence influence influence influence influence information-service information-system inform inform inform inform inform infraction infringe infringe infringe infringe infringe infuriate infuriate infuse infuse ingest ingot ingrate ingratiate ingredient inhabit inhabit inhabit inherit inherit inherit inherit inhibit inhibit inhibit initial initial initial initiate initiate initiate initiate initiate initiatior initiative initiative inject inject inject injection inject inject inject injunction injure injure injure injury injure injustice ink inmate inning innocent innoculate innovate innovate innovation innovator i inn innuendo input inquire inquiry inquire inroad insect insert insert insert insert insert insider insider insight insinuate insist insist insist insist insist insist insist inspect inspect inspect inspection inspector inspector inspect inspect inspect inspiration inspire inspire inspire inspire inspire installation install install install installment install install instance instigate instill instinct institute institute institute institute institution institution instruct instruct instruct instruction instructor instruct instruct instrumentalist instrument instrument insulate insulate insulate insulate insulin insult insult insurance-claim insure insure insurere insurer insurer insurer insure insure insure insurgent insure integrated-technology integrate integrate integrate intellectual intend intend intend intend intend intensify intensify intensify intensify intention intent interaction intercede intercept intercept interconnect interconnect interest interest interest interest interfere interfere interfere interfere interfere interior-furnishings interior interject interlope intermediary intermix internationalist international-operation interpretation interpret interpret interpret interpret interpret interpret interrogate interrogator interrupt interrupt interruption interrupt intersection intersperse intersperse interstate intertitle intertwine intertwine interval intervene intervene intervene intervene interventionist intervention interview interview interview interview interview interview intimate intimidate intimidate intimidate intimidation intone intrigue intrigue introduce introduce introduce introduce introduce introduce introduce introduction intrude intrusion inundate invade invader invade invade invade invade invalidate invent invent invent invention inventory inventory inventor invent invent invert invest invest investigate investigate investigate investigate investigate investigate investigation investigator investigator invest invest invest investment investor-relation investor investor invest invest invest invitation invite invite invite invite invite invoice invoke invoke invoke invoke invoke involve involve involve involve involve involve involve involve ious ipo ira iris irk irk irk irk irony iron iron irradiate irregularity irritate irritate island island is isolate isolate isolate isolate isolate issue issue issuer issue issue issue issue issue issue issue be be be itemize item item jab jackal jack jack jacket jacket jackhammer jack jack jail jail jail jail jam jam jam jam jan. 
japanese-american japanese japanese jar jar jaunt jaunt jeans jeep jell jeopardize jeopardize jeopardize jeopardize jerk jetliner jet jet jettison jet jeweler jewel jew jiggle jillion jingle jinks jinx jitters job job jockey jock jog join join join join join join joint-implant joint join join join joke joke joke joke jolt jolt jolt jolt jolt jostle journalist journalist journal joust joy judge judgeship judge judge judge judge judge judge judge judgment juggler juggle juggle jug juice jumbo jump jump jump jump jump jump jump jump juncture junior junk-bond junket junk-holder junkholder junk-holder junkie junk jury jurisdiction jurist juror jury-rig justice justify justify justify justify jut jut juxtapose kayo kc-135 keen keeper keep keep keep keep keep keep keep keep ketchup keyboard key key key kickback kick kick kicker kick kick kick kick kick kiddie kid kidnap kidnap kidnapper kidnap kid kid kill kill killer killing kill kill kill kill kill kilobyte kilogram kilometer kindle kind kinfolk kingpin king kiss kiss kit knead knead knee know knight knit knit knit knock knock knock knock knock knot know know known know know know know know know knuckle kowtow krona kudos label label label Label label label label label laboratory laborer labor labor labor lab lace lack lack lackey lack lack lack lady laggard lag lag lag lagoon lag lag lag lag lag lay lay lake lambast lambaste lament lament laminate lamppost lamp land land landfill landholding landing land landlord landowner landowner landscaper landscape landslide land land land lane language languish languish languish languish languish lapse lapse lapse lap laptop laptop lap laser lash lash lash lash last last last last last last latch latch latch latch lathe laud laud laugh laugh laugh laugh laugh laugh launch launch launch launch launch launch launder launder launder launderer launder launder laurel lavish lavish lawbreaker lawmaker lawmaker lawmaker law-making lawn law lawsuit lawsuit lawyer lawyer lawyer laxative layer lay layoff layoff lay lay lay lay lbo leader leader lead lead lead lead lead lead lead lead leaf leaflet leaguer league leak leak leaker leak leak lean lean lean lean lean lean lean leap leap leapfrog leap leap leap leap leap leap learn learn learn learn learn learn learn lease lease lease lease lease lease lease lease leather leaf leave leave leave leave leave leave lecture lecture lecture lecture ledger lead lead lead leech leftist leftovers leave leave leave leave legalize legal-service legion legislate legislate legislator legislator legislature legitimize legitimize leg lemming lemon lender lender lend lend lend lend lend lengthen lengthen lengthen lengthen lengthen length lens lens lend lend leotard lesbian lesion lessen lessen lesser lesson let letter letter let let let let let let let let level level level level level level leverage leverage leverage leverage leverage levy levy liability liaison liaison liar libel liberalization liberalize liberalize liberalize liberalize liberalize liberal liberal liberate libertarian liberty librarian library louse license license licens license license license license lick lid lie lie lie lie lieutenant lie lie lifeguard life lift lift lift lift lift lift lift lift light lighten lighten lighten light light light light like like liken liken liken likes like like like limb limitation limit limit limit limit limit limit limit limit limousine limp linebacker line line liner line line lineup line line linger linger linger linger line linkage link link link link link link link link link 
lion lipoprotein lip lipstick liquefy liquefy liquefy liquefy liquidate liquidate liquidate liquidate liquidate liquid lira list list listen listen listener listener listen listen listen listen listen listing list list list list list lithograph litigant litigator litter litter light light live live live-haul live-hauler life life live live live live live live live live load load loading load load load loafer loan loan loan loan loan loan loathe loathe loathe loaf lobby lobby lobby lobby lobbyist lobby locale locality localize local locate locate locate locate locate location lock lock lock lock lock lock lock locution lodge lodge lodge lodgings log log logger log logistics logo log-roll log log log log long-term long look look lookee-loo look look look look look look look loom loom loom loom loom loom loony loophole loop loosen loosen loosen loose loot loot lop lord lord loser lose lose lose lose lose loss loss loss lose lose lose lotion lot lot lottery loudspeaker loui lounge lovebird love love lover love love love love lower lower lower lower lower lower low-life low loyalty lubricant luck lug lug lug lull lumber luminary lump lump lump lump lunch lunge lunge lung lurch lurch lurch lure lure lure lure lure lure lure lurk luxury lie lyric lyric machete machiguenga machine machine machinist machinist machinist make make make mafia mafioso magazine magazine maggot magician magistrate magnetize magnet magnify magnify magnify magnolia maharaja mail mail mailer mailing mail mailman mail-sort mail mail mainframe main maintain maintain maintain maintain maintain maintain maintain major major maker maker make make make make make make make make make make makin malefactor male malfunction malign mall mammoth manacle manage manage manage management manager manager manage manage manage manage manage mandate mandate mandate mandate mandate mandate maneuver maneuver maneuvering maneuver maneuver maneuver maneuver maneuver manhandle manifestation manifesto manifest maninstay manipulate manipulate manipulate manipulate manipulate manipulation manipulator man mannerism manners man mansion manual manual manuever manufacture manufacture manufacturer manufacturer manufacture manufacture manufacture manufacture man many many map map map map map marathon marble march march marcher march march march march margeote marginalia marginalize margin-call margin margin margin margin markdown mark mark market market marketeer marketer marketer marketing-communication market market-maker marketplace market market market market market mark markka mark mark mark mark mark marquee mar marriage marry marry marry marry marsh marvel marvel marvel mask mask mask mask mason masquerade massacre massage massage massage mass mass masseur masseuse mass-media mass-produce master masterpiece master master match match match match match match match mate materialize materialize materialize materialize materialize material material mate mathematics mate matter matter matter matter matter matter matter mature mature mature mature mature maturity maturity maul maven maximize maximize maxim mayor maze mcdonald meadow meal meander meander meaning mean means means mean mean mean mean mean mean mean measure measure measure measurement measure measure measure measure measure meat meat mechanic mechanism medallion meddle meddle media mediate mediator medical-product medicine meeting meeting meet meet meet meet meet meet meet megabyte mega-crash mega-hit mega-issue mega-merger mega-problem mega-project megaquestion mega-resort megawatt meld meld 
meld mellow melody melt melt melt membership member member memento memoirs memorabilia memorandum memorandum memorialize memory memorize memo memo mend man man mention mention mention mention mention mention mentor menu mercantilist mercedes-benz mercedes merchandise merchandiser merchandise merchant merchant merge merge merger merge merge merge meringue merit merit merit mesh message messenger mess messr. mess metabolize metal metal metal metal-worker metalworker metaphor mete meter methodology method mete metric meet meet mouse mouse micoprocessor microbe microchip microcomputer microeconomic microelectronics microphone microprocessor microwave mid-1940 mid-1960 mid-1970 mid-1980 mid-1990 middleman midsize miff miff mig-29 mighta migrate migration mile milestone military-electronics militate militia milk milk milk mill millionaire million-plus million million mill mimic mimic mimic mimic mince mind mind mind mind minefield mineral miner mine mine mingle miniaturize minicar minicomputer minimill minimize minimize minimize minimum mine miniseries minister ministry minisupercomputer minivan minneapoli minority minor mint mint mint minus minus minute minutiae mips mips mire mirror mirror mirror mirror misadventure miscalculate miscalculate miscarriage misclassify miscreant misdeed misdemeanor misfortune misguide mishandle mishandle misinterpret misinterpret misjudge misjudgment mislay mislead mislead mislead mismeasurement misperception misplace misquote misrepresentation misrepresent misrepresent misrepresent misrepresent misroute miss miss miss missile miss missionary mission misspend misstate misstate misstatement misstate miss miss mistake mistake mistake mistreat mistress mistrial mist misunderstanding misunderstand misunderstand misuse mite mitigate mitigate mitsubishus mitsuus mix mix mix mixer mix mix mixture mix moan moan moan mobilize mobilize mobilize mobilize mock mock model model model model model modem moderate moderate moderate moderate moderate modernize modernize modernize modernize mode modification modification modify modify modify modulate moisturizer mold mold molecule mollify mollify moment monday monetarist moneymaker monitor monitor monitor monitor monitor monitor monitor monitor monkey monolith monologue monopoly monopolize monopolize monopolize monopolize month month moonie moonlight moon mop mop moral mores morning morsel mortgage mortgage mortgage moslem motel mother mother motif motion motion motivate motivate motivate motivate motive motorcycle motorist motorize motor mound mountain mount mount mount mount mount mount mourn mourn mousetrap mouth mouth move move move movement move move move move move move move move movie move move mow muck muddy muddle muffle muff mull mull mull multimedia multinational multiple multiple multiply multiply multiply multiply mumble mummy munch municipality municipal municipal municipal muni muni mural murder murderer murder murder murmur muscle muscle muscle muscovite muse muse museum museum muse mushroom mushroom mushroom musician muster mutate mutate mutation mute mute mutilate mutiny mutter mutt muzzle be mystery myth nab nagging nag nag nail nail nail name-dropper name-drop name name name nameplate name name name name name name narcotic narrative narrow narrow narrow narrow narrow narrow nationalist nationalize nationalize national nation native natural-food natural-resource navy navigate naysayer naysay nazi neanderthal near near near-monopoly near necessitate necessitate necessity neck necktie need need need need need need 
need need need negate negative neglect neglect neglect negotiate negotiate negotiate negotiate negotiate negotiate negotiation negotiation negotiator negotiator neighbhorhood neighborhood neighbor neighbor neighbour neophyte nerd nerd nerve nestle nest net net net net net net network network network network neurologist neutralize neutralize neutron newborn newcast newcomer newscast newsy newsletter news newspaper newspaper newsprint newsstand newsstand news-weekly nibble niche nichola nick nickname nickname nightclub nightmare night nine-month nine-tenth nix nobleman nod no-load nominate nominate nominate nomination nominee non-client non-communist nonconformist non-economist non-lawyer non-lawyer nonoperate nonperformer nonperform nonperform nonrecurr non-seaman nonstop noodle normalize norm norm northrop nose-dive nosedive nose-dive nosedive nose no. notch notch notebook note note noteholder note note note note note note note note notice notice notice notice notice notice notification notify notify notify notify notify notify note note notion no. novel novelty novitiate nov. nozzle nuance nuclear-arm nudge nullify nullify nullify number-cruncher number number number number nurse nursery nurse nurse nurture nurture nurture nurture nut oak oasis oats obey obfuscate object object object objection objection objective object object object obligate obligate obligation obligation oblige oblige oblige obliterate obscure obscure obscure observation observe observe observer observer observe observe observe observe observe obsess obsolete obsolete obstacle obstruct obstruct obstruct obtain obtain obtain obtain obtain obviate occasion occasion occupation occupy occupy occupy occupy occupy occur occur occurrence occur occur occur occur ocean octave octogenarian oddball oddity oddity odds odds offender offend offend offend offensive offer offer offer offer offering offer offer offer offer offer offer offer off-hour offical officer officer office office office-supplies official official official offus offset offset offset offset offset offset offset offshoot offspring ogle ogle oils oils old old-timer olefin olympics omen omission omit omit omit omit omit oncogene oncogene one onlooker onlooker on-ramp ooze ooze open open openend opener opening open open open open open opera operate operate operate operate operate operate operate operate operation operative operator opine opinion-maker opinion opinion opponent opponent opportunist opportunity oppose oppose oppose oppose oppose oppose oppose opt opt optical-product optimist opt option option option option opt opt orange orchardist orchard orchestra orchestrate orchestrate orchestrate orchid ordain order order order order order order order order ordinance organism organization organization organize organize organize organizer organize organize organize organ orient originate originate originate originate originate origination originator origin orkem ornament orphan orphan other other ounce oust oust oust oust oust outage outbid outbid outbid outbreak outcome outdate outdo outdistance outdo outfielder outfit outfit outflank outflow outflow outfly outgain outgrow outgrow outing outlander outlast outlast outlaw outlaw outlaw outlay outlay outleap outlet outline outline outline outline outline outlook outmode outnumber out-of-stater outpace outpace outpace outpace outpace outpace outperform outperform outperform outperform outperform outperform outpost outrage outrank outsell outsell outsell outshine outshine outsider outskirts out-smart outsell outstrip 
outstrip outstrip outstrip outstrip out-trade outweigh outweigh outweigh outweigh oven over-allotment overalls overarch overbid overbid overbought overburden overcome overcharge overcollateralize overcome overcome overcome overcome overcommit overdo overdo overdose overdose overemphasize overflow overhang overhaul overhaul overhaul overheate overheate overlay overlap overlap overlap overlay overlook overlook overlook overlook overlook overlook over-magazine overpay overpay overpay overpay overplant overpower overprice overpurchase overreact overreact overreact overreact overreact override override override overrule overrule overrule overrule overrun oversee oversee oversee oversee overseer oversee oversee overshadow overshadow overshadow oversimplify oversell oversell overstate overstate overstate overstate overstrain oversubscribe overtake overtax overthrow overthrow overthrow overture overturn overturn overturn overturn overuse overvalue overvalue overweight overwhelm overwhelm overwhelm overwhelm overwhelm overwork owe owe owe owe owe owe own own owner owner own own own own own own pace pacemaker pace pacify packaged-goods packaged-goods package package package package package package pack pack packet pack pack pack pack pack pac pac pact paddle paean page pay pay pay pay painewebber pain paint paint painter painting painting paint paint paint paint pair pair pair palazzo pale pale pale pale palm palmtop palmtop palm pal pamper pamphlet pancake pander panelist panel pang panic panic panic panic panjandrum pan pan pan pantage panties pants pan paper-goods paper paper-product papers papers parachute parachute parade parakeet parallel parallel parallel paralyze paralyze paramedic parameter paraphernalia paraphrase parasite parastatal parcel parcel parcel parch pardon pare pare parent parent pare pariba parimutuel pare parish parishioner parity park park park park park park parlor parry parry partake part participant participant participate participate participate participate participate participate participation particulars party party party part partisan partisan partnership partnership partner partner partner part part part pasha passage pass pass pass passenger-kilometer passenger passer-by pass pass pass passion passport pass pass password paste pastel pasteurize pastime pastor patch patch patent patent patent path patient patient patrol patrol patronize patronize patronize patron pattern pattern pattern pause pause pause pause pause pause pave pave pave pave paw pawn pawn payable paycheck payer payer pay pay payment payment payment payoff payout payout payroll pay pay pay pay pay pay pcb pc peacemaker peach peak peak peak peak peak peal peal peanut pearl pear peasant peasant pea peccadillo peck peck peculiarity pedal pedal pedal peddle peddle peddle peddle peddle pedestrian pedigree peek peel peer peer peer peer peer peg peg peg peg peg peg pellet penalize penalize penalize penalty penny pencil pencil pencil pencil pend pend pend penetrate penetrate penetrate penis pen penny pension pen people people people people pepper pepper pepper pepsi perceive perceive perceive perceive percentage perception perch perfect performance perform perform performer performing-art perform perform perform perform perform peril periodical period period peripheral perishable perish perk perk perk permeate permeate permit permit permit permit permit permit permit perpetrate perpetuate perpetuate perpetuate perpetuate persecute persecute persist persist persist persist persist personality personalize 
personalize personnel personnel person persuade persuade persuade persuade persuade persuade pertain perturb peruse peruse pervade pervade perversity pesata peseta peso pessimist pester pesticide petition petition petition petrochemical pet pet pharaoh pharmaceutical pharmaceutical pharmacy pharmacist phase phase phase phase phase phenomenon phillip philosopher philosophy phobia phone phone phone phone phone phone photocopier photocopy photocopy photofinisher photograph photographer photograph photograph photograph photo phrase physician physics piano piaster pick pick picker pick pickin pickle pick pick pickup pick pick pick picture picture picture picture picture picture piece piece piece piece pierce pier pie piggyback piggyback piglet pigment pig pile pile pile pile pile piling pile pile pillar pillory pillory pillowcase pillow pill pilot pilot pimp pinch pinch pinch pinch pine ping ping pin pin pin-point pinpoint pinpoint pin pin pint pin pioneer pioneer pioneer pipe pipe pipeline pipe pirate pirate piroghi pistil pistol piston pitch pitch pitcher pitch pitch pitch pitchman pitch pitch pitfall pitfall pit pit pit pit pit pit pivot pizzas-with-everything pizzeria placate placate place place placement place place place place place place plague plague plague plague plainclothes plain plain plaintiff plaintiff plane planet plank plan plan planner planner plan plan plan plan plan plan plan plantation plant plant plant plant plant plant plan plan plastic plastic plate platform platitude plaudit play play player player playground play play playoff play play play play play plead plead pleading plead plead pleasantry please please please please please please please plea pleasure plea pledge pledge pledge pledge pledge pliers ply plight plod plot plot plotter plot plot plow plow plow plow ploy ploy pluck pluck plug plug plug plug plug plummet plummet plummet plummet plummet plunge plunge plunge plunge plunge plunge plunge plunk plus plus ply poacher poach pocket pocket pocket pocket pocket pockmark point point pointer point point point point point poise poison poison poke poke poke polarize pole pole police police police police policy policy police policyholder policy-maker policy-make polish polish polish polish politician politician politicize politico-plaintiff politics politics politics poll poll pollen-produce pollinate pollinate pollinate pollinate poll poll pollster poll pollutant polluter pollute pollute poll pol poltergeist polymer polyol polyp polyrhythm ponder ponder ponder pond pony pony pontificate pony pool pool pool pool pop pop popularize popularize popularize populate populate populate population pop pop porcelain porch pore pore pore pork-barreler portable portend portend portfolio portfolio port portion portrait portrayal portray portray portray portray portray portray port port pose pose pose pose pose pose position position position position position possess possess possess possess possession possess possess possibility postcard post post post post poster posting post postmark postmark postpone postpone postpone postpone postpone post post post posture post potable potato potentate potentiality pothole pot pot pouch pounce pound pound pound pound pour pour pour pour pour pour powder power powerhouse power power practice practice practice practice practice practitioner pragmatist prairie praise praise praise praise praise praise praise prance prayer pray preach preach preach pre-approve preapprove prearrange precaution precede precedent precede precede precede precinct 
precious-metal precious-metal precipice precipitate precipitate preclude preclude preclude predate predator predecessor predetermine predicate predict\/advocate predict predict predict predict prediction prediction predict predict predict predispose pre-empt pre-empt preference prefer prefer prefer prefer prefer prefer prefer prejudice prejudice premiere premiere premiere premiere premise premium preoccupy prepay prepay preparation preparative prepare prepare preparer prepare prepare prepare prepare prepay prepayment prepayment prepay prep pre-register pre-register prerogative presage presage preschooler prescribe prescribe prescribe prescribe prescription presentation present present presenter present present present present present present preserve preserve preserve preserve preserve preside preside president president preside preside pre-sign press press press press press pressure pressure pressure pressure pressure pressure pressure press press press presume presume presume pretend pretend pretend pretension pre-test pre-try prevail prevail prevail prevail prevail prevail prevent prevent prevent prevent prevent prevent prevent prevent preview preview prey price price price-earnings price\/earnings price price price price price price pricing price prick priest prime primitive prince principal principle print print printer print print printout print print print print priority prisoner prison privatize privatize privatize privilege prize prize probability probe probe probe probe probe problematic problem procedure proceed proceed proceedings proceedings proceed proceeds proceeds proceed proceed proceed process process process process process process processor process process proclaim proclaim proclaim proclaim proclaim proclamation procure prod prod prod produce produce producer producer produce produce produce produce produce production product product prod profess profess profess professional professional profession professor profess proffer proffer profferr profile profile profile profit profit profiteer profiteer profit profit-sharing profit profit profit profit profit-taking profitt profit profit progenitor prognosticator program programmer program program program program program progress progress progress progress progression progressive progress prohibit prohibit prohibit prohibition prohibit prohibit prohibit project project project project project projection projector project project project project project proliferate proliferate proliferate prolong prolong promise promise promise promise promise promise promise promise promote promote promoter promote promote promote promote promotion prompt prompt prompt prompt prompt prompt promulgate prong pronounce pronounce pronouncement pronounce proof-of-purchase proofread propagandist propagandize propagandize propel propel propel propel property proponent proponent proportion proposal proposal proposal propose propose propose propose propose propose proposition prop prop prop proprietorship proprietor propse prop prop proscribe proscribe prosecute prosecute prosecute prosecute prosecution prosecutor prosecutor prosecutor pro pro prospects prospects prospects prospectus prosper prosper prostitute protect protect protect protection protector protect protect protect protege protein protein protest protest protester protest protestor protest protest protest protocol prototype prove prove prove proverb prove prove prove prove provide provide provide provider provide provide provide provide provide provide province prove provision provision 
provoke provoke provoke provoke proxy prune prune pseudo-lobbyist psychic psychologist psychologist publication publicize publicize public-relation public-works public-works publish publish publish publisher publisher publish publish publish pub puffer pull-back pullback pull pull pull pullout pull pull pull pull pull pulverize pummel pummel pummel pump pump pump pump pump pump punch punch puncher punch punch punch punch pundit punish punish punish pun punt pupil puppet puppy purchase purchase purchaser purchase purchase purchase purchase purchase purchase purge purge purge purge purge purist pur-poise purport purport purpose purr purse-snatching purse pursue pursue pursuer pursue pursue pursue pursue pursuit push push pusher push push push push push push put put put put put put put put put put put put put puzzle puzzle puzzle puzzle pyramid pyramid quack quack quadruped quadruple quadruple quadruple quadruple quake qualification qualify qualify qualify qualify qualify qualify quality qualm quantify quantify quantity quarrel quarrel quarterback quarter quartet quash quash quash queer quell query query query question question question question question question question question question queue queue quibble quicken quiet quiet quiet quiet quip quip quip quip quirk quit quit quit quit quit quit quiver quiz quota-cheater quota quotation quote quote quote quote quote quote quote quote race racehorse race racetrack race race rack rack racketeer racket rack rack radar-elude radical radio radio rafter rage rage rage rage rage raid raid raider raid raid raid railbike railcar railing railroad rail rail railway rain rain rain raise raise raiser raise raise raise raise raise raise raise rake rake rally rally rally rally rally rally rally rally rally ramble ramification ram rampart ramp ramp rancher ranch rand range range ranger range range range range range ring rank rank ranking rank rankle rank rank rank rank run rape rape rapeseed rape rapist raptor rarefy ratchet rate rate rate ratepayer rate rate rate rate rate rate ratify ratify ratify ratify rating rating rate rationalization rationalize rationalize ration ratio ratio rat rattle rattle rattle rattle rattle rat ravage ravages ravages rave rave ravine raw-material ray raze raze reach reach reach reach reach reach reach reach reach react react react reaction reactivate reactivate reactor react react react reader reader ready reading read read readmit read read read read read read read read reaffirm reaffirm reaffirm reaffirm reaganaut realestate realign realign realignment realign realist reality realize realize realize realize realize realize reallocate reallocate realm ream reap reap reap reappoint reapportion reappraise reappraise reappraise reap reap reard rear rear rearm rearrange rearrange reason reason reason reason reason reason reason reassert reassert reassert reassess reassess reassign reassign reassignment reassume reassurance reassure reassure reassure reassure reauthorize reawaken rebate rebel rebel rebound rebound rebound rebound rebound rebound rebound rebind rebuff rebuff rebuild rebuild rebuild rebuild rebuke rebut rebut rebut recalculate recalculate recalculation recall recall recall recall recall recall recall recall recall recant recant recapitalization recapitalize recapture recede recede receipt receivable receive receive receiver receive receive receive receive recentralize receptionist receptor recess recession recharge recipe recipient recite recite reckon reckon reckon reckon reckon reclaim reclaim reclaim reclaim 
reclaim reclassify recline recognize recognize recognize recognize recognize recognize recommendation recommendaton recommend recommend recommend recommend recommend recommend reconcile reconcile reconcile reconnect reconsider reconsider reconstruct reconstruct reconstruct reconstruct record record recorder recording record record record record record record recount recount recount recount recoup recoup recoup recover recover recovery recover recover recover recover recover recraft re-creaction recreate re-create re-creation recruit recruit recruit recruit recruit recruit recruit rectangle rectify rectify recuperate recur recuse recycle recycle recycle recycle redden redeem redeem redeem redefine redefine redefine redemption redeploy redesign redesign redesign redesign redevelop red-flag redial redirect rediscover redistribute redistribute redline redo redouble redound redo redraw redress red red reduce reduce reduce reduce reduce reduce reduce reduction reef re-elect re-elect reel reel reel re-emerge re-emphasize re-enact re-enactment re-enactment re-enter re-enter re-enter re-establish re-establish reestablish re-evaluate re-evaluate re-evaluate re-examine reexamine re-export refashion referee reference referral refer refer refer refer refer refer refer refile refinance refinance refinance refinance refine refinery refiner refiner refine refine refit reflect reflect reflect reflect reflect reflect reflect refocus refocus refocus refocus refocus reform reformer reform reform reformulate reform refrain refrain refresh refrigerator refuel refugee refund refund refund refund refund refurbish refurbish refurbish refuse refuse refuser refuse refuse refuse refuse refute refute regain regain regain regain regain regain regain regard regard regard regard regard regard regard regard regenerate region register register register register register register registrant registration regret regret regret regret regret regret regroup regroup regulate regulate regulate regulate regulation regulation regulator regulator regulator regumm rehabilitate rehabilitate rehash reign reign reignite reignite reignite reignite reimburse reimburse reimbursement reimburse reimburse reimburse reimpose reincorporate reincorporate reindict reinforce reinforce reinforcement reinforce reinforce reinforce reinforce reinforce rein rein reinstall reinstate reinstate reinstate reinstate reinstitute reinsurer reintegrate reintroduce rein reinvent reinvent reinvest reinvest reinvest reinvest reinvest reinvigorate reinvigorate reinvigorate reiterate reiterate reiterate reit reject reject reject rejection reject reject reject rejoice rejoin rejoin rejoin rejoin rejuvenate rekindle rekindle rekindle rekindle relabel relate relate relate relate relate relate relate relationship relation relation relative relaunch relaunch relax relax relax relax relax relay release release release release release release release release relegate relent relent relent relic rely rely rely rely relieve relieve relieve religion relinquish relinquish relinquish relinquish relish relish relish relive relocate relocate relocate relocate relocation rely rely rely rely remake remain remain remain remain remains remains remain remain remain remake remand remark remark remarketer remarketing remark remark remark remedy remedy remedy remember remember remember remember remember remember remember remic remic remind remind reminder remind remind remind remind remittance remnant remodel remodel remora remove remove remove remove remove remove remunerate rename 
rename rename render render rendering render render rendezvous renege renege renegotiate renegotiate renegotiate renewal renew renew renew renew renew renew renounce renounce renounce renovate renovate renovate renown rental rent renter rent rent rent rent rent reoffer reopen reopen reopen reopen reopen reorganize reorganize reorganize reorient repackage repackage repackage repay repay repaint repair repair repair repair repair repair repair reparation repass repatriate repay repayment repay repeal repeal repeal repeat repeat repeater repeat repeat repeat repeat repel repel repercussion replace replace replacement replace replace replace replace replace replaster replay replenish replenish replicate replicate replicate reply reply reply reply reply reply repond report report reporter report report report report report report report report report repose reposition repository repossess representation representative representative represent represent represent representive represent represent represent repress repress reprice reprint reprint reprint reprint reprisal reproduce reproduce reproduce reprove rep republican republic repudiate repurchase repurchase repurchase repurchase repurchase repurchase repurchase reputation repute request request request request request request require require requirement require require require require require requisition reroute reroute rerun resale reschedule reschedule reschedule rescind rescind rescind rescission rescue rescuer rescue rescue research researcher researcher researcher research research research reseller resell resell resell resell resemblance resemble resemble resemble resemble resent resent resent reservation reserve reserve reserve reserve reserve reserve reserve reserve reserve reservoir reset reshape reshape reshape reshape reshuffle reshuffle reshuffling reshuffle reside residence resident resident reside reside reside residue resignation resignation resign resign resign resign resign resin resist resist resist resist resist resist resist resell resell resell resolution resolve resolve resolve resolve resonate resonate resonate resort resort resort resort resource respect respect respects respect respect respond respond respondent respond respond respond respond respond response response responsibility restart restart restarter restart restart restate restate restate restate restaurant restaurant rest restore restore restore restore restrain restrain restrain restraint restrain restrict restrict restrict restriction restrict restrict restrict restrict restructure restructure restructure restructure restructure restructuring restructure rest rest rest rest rest restyle resubmit result result result result result result result result result resume resume resume resume resume resume resume re-supply resurface resurface resurge resurrect resurrect resurrect resurrect resurrect resuscitate resuscitate retailer retailer retail retail-sale retail retail retail retain retain retain retain retain retain retake retaliate retardant retard rethink rethink retire retire retire retiree retirement retire retire retire retire retool retool retort retort retrace retract retrain retreat retreat retreat retreat retreat retrench retrieve retrieve retrieve retrieve retrofit retry return return return return return return return return reunion reunite reunite reuse revalue revamp revamp revamp revamp be reveal reveal reveal reveal reveal reveal reveal revelation reveler revel revel revel revenue revenue reverberate reverberate reverberate reverberation 
reversal reverse reverse reverse reverse reverse reverse reverse revert revert review review review review review review review review revise revise revise revise revise revise revise revisionist revision revisit revisit revisit revitalize revitalize revival revival revive revive revive revive revive revoke revoke revoke revolutionary revolutionize revolutionize revolve revolve revolve rev reward reward reward reward reward reward reward rework rewrite rewrite rewrite rhyme rhyme rhyme ribby ribbon rib rican riches rider ride ride ride ridge ridicule ridicule ride rid rid rid rid rid riff rifle rig rig right rights rights rights right-to-lifer right-winger rigor rig rile rile rim ringer ringer ring ring ring ring ring riot ripen ripen ripoff rip rip ripple ripple ripple rise rise rise rise rise rise rise risk risk risk risk risk risk risk rite ritual rival rivalry rival rival rival rival river rivet rivet rivet rivet roadblock road road roadway roam roam roar roar rob robbery robber robber rob robe robot rock rock rocker rocket rocket rocket rock rock rock rodent ride ride rod rogue roil roil role rollback roll roll roll roller roll rollover roll roll roll roll romance romanticize romp romp roofer roof rooftop roommate room roost root rooter root root root root rope rosarian rose rise rise rotate rotate rotate rot rot rough roughneck round round round round round roustabout route route route routine route row row row royalty rub rubber-neck rubber-stamp rubdown rubfest rubin ruble rub rub rub rub ruffle ruffle rug ruin ruin ruin ruin ruin ruin rule rule ruler rule rule rule rule rule rule ruling rule rumble rumble rumble rumbling rumble ruminate ruminate rumination rumor rumor rumor rumor rumor ring run-in runner runner-up run run run run runup run run run run runway rupture rupture rupture rush rush rush rush rush rush rust rusticate rustler rustling rv saatchus saber sabotage sack sacking sack sack sacrifice sacrifice sacrifice sacrifice saddle saddle safeguard safeguard safeguard safeguard safety sage sag sag sag sag sago say say say say say sail sail sailor sail saint salary salary salaryman salesman sale sale sale salespeople salespeople salicylate salt salute salute salvage salvage salve samaritan samovar sample sample sanction sanction sanction sanction sandinista sand sandwich sandwich sandwich sandwich sing sanitationist sanitize sanitize sink san sap sap sap satellite satisfy satisfy satisfy satisfy satisfy satisfy satisfy satisfy saturate saturate saturate sit sit saucer sauce sauna save save savers\/investors saver save save save save savings savings save savor savor savor savor see say say say say say say say say scab scalawag scale scale scale scale scale scalp scam scammer scamper scam scandalize scandal scanner scan scan scan scan scape scare scare scare scare scare scare scarf scare scare scar scar scatter scavenger scenario scenario scene scent schedule schedule schedule schedule schedule schemer scheme scheme schmumper scholarship scholar schoolboy schoolchildren schoolmate school school schoolteacher school science scientist scientist scime scime scoff scoff scoff scold scold scoop scoop scoop scoop scoop scoot score score score score score score scorn scorn scotch scotch scourge scour scour scout scout scowl scowl scramble scramble scramble scramble scramble scrape scrap scrap scrap scrap scrap scrap scrap scrap scratch scratch scratch scratch scream scream scream scream scream screech screech screen screening screen screen screen screen screenwriter screw screw screw 
scribble scribbler scribble scribe scrimp scrimp script scriptwriter scrounge scrounge scrubber scrutinize scrutinize scrutinize scrutinize sculptor sculpture scurry scurry scurry scuttle scuttle scuttle sealant seal seal seal seal seal seaman search search search search search search sear sear sea seasoning season seat seat seat seat seat secede second-guess second-guess seconds seconds secretary secret section sector secure secure secure secure secure securite security security sedan seduce seduce seed seed see see seeker seek seek seek seek seek seek seek seem seem seem seem seem seem see see seep seesaw see see seethe see see see see segment segment segregate segregate seize seize seize seize seize seize seizure select select select selection selection select select select self-destruct self-insure self-reinsure self-starter seller sell sell sell-off sell-off selloff sell sell sell sell sell semantics semester semi-celebrity semiconductor semifinish semifinish seminar senator senator sender send send send send send senior senior sense sens sense sense sensibility sense sensitive sensitivity sensitize sen. sensor sentence sentence sentence sentence sentencing sentence sentence sentiment send send separate separate separate separate separate sequel sequester sequester sequester sequin sergeant serial series series servant serve serve serve server serve serve serve service service service service service service serve session setback set set setter setting set set settle settle settlement settlement settler settle settle settle settle settle set set set set sever sever sever sever sevice sewer sew sex shack shades shadow shadow shadow shag shake shake shake shake shake shake shake shape shape shape shape shape shape shard sharecropper share share shareholder shareholder shareholding share share share share share share share shark sharpen sharpen sharpen shatter shatter shatter shatter shave shave shave shave shave shear sheath shed shed shed shed shed shed sheet sheik shell shell shell shell shelter shelter shelter shelter shelve shelf shepherd sheriff shy shy shield shield shield shield shield shift shift shift shift shift shift shift shilling shill shimmer shine shine shine shine shin shipbuilder shipment shipment ship ship shipper shipper ship shipset ship ship ship shipyard shirk shirk shirt shiver shiver shock shock shock shock shock shoehorn shoe-horn shoelace shoemake shoe shake shake shooting shoot shoot shoot shoot shoot shopkeeper shop shop shopper shopper shopper shop shop shop shop shop shore shore shore shear shear shortage shortchange short-circuit shortcoming short shorten shorten shorten short short short-seller shorts short shot shoot shoot shoulder shoulder shoulder shoulder shout shout shout shout shout shove shovel shove shove shove showcase show show shower shower showgirl showing show show show showroom showroom show show-stopper show show show show show shrink shred shriek shrine shrink shrink shrink shrink shrivel shroud shrub shrub shrug shrug shrug shrug shrink shudder shuffle shuffle shun shun shun shun shun shutdown shut shut shutter shutter shut shuttle shuttle shuttle shuttle shut shut shut shy shy shy sibling side side sideline sideline sideline side side sidestep sidestep sidestep sidestep sidestep sidetrack side side sidewalk sift sift sift sigh sigh sigh sigh sighting sight signal signal SIGNAL signal signal signal signal signal signal signal signatory signature signboard sign sign sign signify signify signify sign sign sign sign sign silence silence 
silence silence silt similarity simmer simmer simplicity simplify simplify simplify simplify simplify simulate simulate simulation simulator singer sing singin single single singles single single single sing sing sing sink sink sink sin siphon siphon siphon siphon sip sip siren sister sitcom site sit sit sit sit situate situation sit sit sit six-pack sixty sixty size size size sizzle sizzle skateboard skeptic skeptic sketch sketch sketch skew skid skid skid skid skid skid skid skier sky ski skill skill skimmer skimp skim skin skip skipper skip skip skip skip skirmish skirmish skirt skirt skirt ski ski skr1.5 skr205 skr20 skr225 skr29 skull skyrocket skyrocket skyrocket slab slacken slacken slacken slacks slack slay slam-dunk slam slam slap slap slap slap slap slash slash slash slash slash slash slate slate slate slat slaughter slaying slay sleep sleep sleep sleep sleep sleep sleeve sleep slice slice slice slide-pack slide slide slide slide slide slide slide slim slim slinger sling slip slip slip slip slip slip slither slither slit sliver slogan slog slog slope slope slosh slot slough slowdown slow slow slow slow slow slow s&ls slump slump slump slump slump slump slum slur smack smack smart smash smash smash smash smell smell smell smell smell smidgin smile smile smile smile smile smile smoke smoker smoker smokescreen smoke smoke smoke smoke smolder smolder smooth smooth smother smother smother smuggle snafus snag snag snake snake snake snap snap snap snapshot snap snap snare snarl snatch snatch snatch sneak sneaker sneak sneak sniff sniff sniff sniff snipe snippet snivel ' snoop snooze snore snort snowball snowball snowbird snub snub soak soak soap soapsud soar soar soar soar soar soar soar sober sob socall social-affair socialist socialist socialize socialize society sociologist sock sock sock soda sofa soft-drink soften soften soften soften soften softie soil soil-nutrient soil soiree soldier soldier sell sell solicitation solicit solicit solicit solicitor solicit solicit solidify solidify solidify solution solve solve solve solvent solve solve solve solve song songster songwriter son soothe soothe sophisticate sop sorehead sort sort sort sort sort seek seek soulmate soul sound sound sounding sound sound sound sound sound sound sound soup source source source sour sour sour soviet soviet sow sow sow sow sow soybean soybean soybean spaceship space spackle spaghetti span span span span spare spare spare spare spare spare spare spark spark spark sparkplug spark spark spark spark spar spar spasm spa spawn spawn spawn spawn speaker speaker speak speak speak speak speak spearhead spearhead spearhead specialist specialist specialize specialize specialize specialize specialize special specialty specialty-chemical specialty-metal species specification specification specifics specify specify specify specify specify specimen spec spectator speculate speculate speculate speculate speculate speculation speculator speculator speed speed speech speed speed speed speed speed speed spell spell spell spell spell spender spend spend spend spend spendthrift spend spend spend spend spend spew spew spice spider spy spy spy spigot spike spill spill spill spill spill spin spinoff spin spin spiral spiral spiral spirit spirit spirit splash splint splits splits split split split split spoil spoil speak spokesman spokesman spoke spokesperson speak sponsor sponsor sponsor sponsor sponsor sponsor sponsor sponsor spook spook spook spook spook spoonbill spoonful spore sport sporting-goods sport sportsman sport sport 
sport spotlight spot spot spot spot spot spot spot spot spot spouse spout spring sprawl spray spray spread spreadsheet spread spread spread spread spread spread spread spring spring spring sprinkle sprinkler sprinkle sprinkle spritzer sprout sprout sprout spruce spruce spring spud spin spin spurn spurn spurn spurn spurn spurn spur spur spur spur spur spurt spurt spurt spurt spurt spur sputter sputter spy spy squabble squad squall squander squander square square square square square squat squeak squeegee squeeze squeeze squeeze squeeze squeeze squeeze squelch squelch squint squint squirm stab stab stab stabilize stabilize stabilize stabilize stabilize stabilize stabilize stack stack stack stack stadium stadium staff staffer staffer staff staff staff stage stage stage stage stage stage stage stagewhisper stagger stagger stage stagnate stagnate staid stain stain staircase stair stake stake stake stalk stalk stall stall stall stall stall stall stall stalwart stampede stampede stamping stamp stamp stamp stanch standardize standard standby stand stand stand stand stand stand stand stand staple stare stare stare stare star star star start start start starter start start startle start start start start-up start-up start start star starve starve starve starve stash stash state state statement statement statesman state state state state state state state station station statistician statistics statistics statue statute staunch stave stay stay stay stay stay stay stay steady steady steak steal steal steal steal steam steam steam steelmaker steelmaker steel steelworker steer steer steer steer steer stem stem stem stem stem stem step step step step step step step step step step stereo stereotype sterile sterilize sterilize sterilize steroid steward stew sticker stick stick stick stick stick stick stifle stifle stifle still-rage stilt stimulate stimulate stimulate stimulator stimulus sting stingray stink stint stipend stipulate stipulate stipulate stir stirring stir stirrup stir stir stir stir stitch stitch stockbroker stockbroker stockbuild stock stockholder stockholder stockholding stock-index-future stock-index stock stock-option stockpile stockpile stock stock stock stock stock stockyard stoke stoke stoke steal steal stomach stomach stomp stomp stone stone stonewall stand stand stooge stool stoppage stop stop stop stop stop stop stop stop store store storefront store store store store store story store storm storm storm storyteller stow straddle strafe straighten straighten strain strainer strain strain strain strain strait strand strand strand strangle strangle strap strap stratagem strategy strategy strategist strategist strawberry stray stray stray streak stream stream streamline streamline streamline stream street streetspeak strengthen strengthen strengthen strengthen strengthen strengthen strength stress stress stress stress stress stressor stress stress stretch stretch stretch stretch stretch stretch stretch stretch strew stricken stride striker strike strike strike strike strike strike strike string string stripe strip strip strip strip strip strip strip strive strive strive strive stride stroke stroke stroke stroke stroll stroll stroll stronghold strive strike strike strike structure structure structure structure structure struggle struggle struggle struggle struggle struggle struggle stub stick stick stud student-athlete studentle student student student study study study study study studio studio study study study stuff stuff stuff stuff stumble stumble stumble stumble stumble sting sting 
stun stun stun stunt style style style stymie subcommittee subcompact subconference subcontractor subcontractor subcontract subdue subgroup subject subject subject subject subject sublet submarine sub-market submit submit submit submit submit submit subordinate subordinate subordinate subordinate subordinate subpoena subpoena subpoena subscribe subscriber subscriber subscribe subscribe subscribe subscribe subscribe subscription sub-segment subside subside subside subsidiary subsidy subsidy subsidize subsidize subsidize subsidize subsidize subskill substance substantiate substation substitute substitute substitute substitute substitute substitute substract subsume subtilis subtitle subtract subtract subtract subtract sub-underwriter sub-underwrite suburb subversive subvert subvert subvert subway succeed succeed succeed succeed succeed succeed succeed success successor succumb succumb succumb sucker suck sue sue sue sue suffer suffer suffer suffer suffer suffer suffer suffice suffice suffice suggest suggest suggest suggestion suggest suggest suggest sue suit suite suitor suitor suit suit suit suit summary summarize summarize summarize summarize sum summer summon summon summon summon sum sum sum sunday sunflower sunglass sing sink sink sunset supercede supercede supercomputer superconcentrate superconcentrate superconductor superconductor superimpose superintendent superior supermarket supermarket superpower superpremium supersede supersede supersede superstar supervise supervise supervise supervise supervise supervise supervisor supplement supplement supply supply supplier supplier supplies supply supply supply supply support support supporter supporter support support support support support support suppose suppose suppose suppose suppose suppressant suppress suppress suppress surface surface surface surface surface surface surfer surge surge surge surge surge surge surge surmount surpass surpass surpass surpass surpass surplus surprise surprise surprise surprise surprise surprise surrender surrender surrender surrender surrender surround surround surround surround surround surtax survey survey survey survey survey survey survey survey survive survive survive survive survive survive survive survive survive survivor suspect suspect suspect suspect suspect suspect suspect suspend suspend suspend suspend suspension suspicion sustain sustain sustain sustain sustain suture suvivor be s be swallow swallow swallow swamp swamp swamp swan swap swap swap swap swap swarm swathe sway sway sway sway swear swear swear sweat sweater sweat sweatshirt sweat sweat sweat swede sweeper sweep sweep sweepstakes sweep sweep sweeten sweeten sweetener sweeten sweet swell swell swell swell swell swell swell sweep sweep swerve swim swim swindle swine swing swing swing swing swing switch switch switcher switch switch switch switch switch switch swivel Swivel swear swear swing swing swing sycophant symbolize symbolize symbol sympathy sympathizer sympathize symposium symptom sym synchronize synchronize synchronize syndciate syndicate syndicate syndicate syndicate syndicate syndication synergy syngery synonym synthesizer synthesize synthetic system system tabac table tablespoon tablet tabloid taboo tab tack tack tackle tackle tackle tackle tackle taco tactic tag tag tag tail tailor tailor tail taint take take takeover taker take take taketh take take take take takings take take talent tale tale talk talk talk talk talk talk talk talk talk talk tally tally tally tame tame tamper tamper tamper tampon tandy tangle tangle 
tango tank tanker tanker tank tank tank tan tan tantalize tape tape Tape taper taper taper taper tape tapestry tape tape taping tape tap tap tap tap tap tap tap target target target target target target target target tariff tarnish tarnish tarnish tar tartan task tassel taste taste taste taste taste teach teach taunt tax-deduction tax tax tax tax tax tax-exempt tax-exempt taxpayer taxpayer taxpayer tax-reduce tax tax-writer t-bill teacher teach teach teach teach teach team team teammate team team team tear tear tear tease teaspoon technical-service technician technique technocrat technology teem teem teen-ager teenager teens teens teeter tooth telecine telecommunications telecommunications telecuss telegraph telegraph telemarketer telephone telephone-operation telephone telephone telephone telesystem telesystem televise television telex teller tell tell tell tell tell temblor temperature temper temp tempt tempt tempt tempt tempt tenant tend tend tendency tender tender tender tender tender tender tender tend tend tend tend tenet tension ten tenth term term terminal terminal terminate terminate terminate terminate termination term term term terrify terrify territory terrorist testament test-drive test-drive test test tester test-fire testify testify testify testify testify testify test test test test test test test tether textbook textile text thank thank thanks thanks thanks thank thank thank that theater theft theft theme theme theologian theory theorist theorist theorize theory-teach therapy therapist thermometer the the thief thief thing thing think think think think think think thin thin third thirty thirty thistle thoroughbred thought think think thousand thousand thrash thrash thrash thread threaten threaten threaten threaten threaten threaten threat three-fourth three-quarter three-seventh throw thrift thrift thrill thrill thrill thrive thrive thrive thrive thrive thrive throat thrower throw throw throw throw throw throw throw thrust thrust thrust thrust thug thumb thumb thumb thumb thunder thwart thwart thwart thwart tick ticket ticket ticket tick tick tick tidbit tide tie tie tie-in tie tie tie-up tie tiger tighten tighten tighten tighten tighten tile tilt tilt tilt timberland timber time time time-hotel time-share time time time time timpani tinge tinge tinker tinker tinker tin tip tip tipster tiptoe tiptoe tiptoe tip tire tire-kicker tire tire tissue titan title title title title toast toddler toehold toe toil toiletries toil toil toil tell tell tolerate toll toll tomato tone tone tongue tonnage ton ton take tool toot topic top top top top topple topple topple top top top top torch torch tear tory tormentor torment torment tornado tear torpedo torpedo tort torture torture torture toss toss tosser toss toss total total total total total total total total total tote tote tote totter totter touch touch touch touch touch touch touch toughen toughen tough tour tour tourist tournament tour tour tour tout tout tout tout tout towel tower tower tower townhouse township town tow toy toy trace trace trace trace trace trace track track track track track track track tractor tract trade trade trade trademark trade-off tradeoff trader trader trade trade trade trade trade trade trade traditionalist tradition traduce traduce trafficker tragedy trail trail trailer trail trail trail trail trail train train trainer train train train train train train traipse trait tramp trample trample transact transact transaction transaction transact transcribe transcript transfer transfer transfer transfer 
transfer transfer transform transform transform transform transform transfusion transistor translate translate translate translate translate translation transmission transmit transmit transmit transmit transmogrify transmogrify transplant transplant transplant transplant transport transport transport transport transport transport transport transport transvestite trap trappings trap trap trash trash trauma traumatize traumatize travail travel travel traveler traveler travel Travel travelogue travels travel travel travel travel tray treadmill tread tread treasurer treasure treasure treasury treasury treat treat treat treat treatise treatment treat treat treat treat treble tree trek tremble tremor trench trend trend-setter trend trend trespass trespass trial triangle tribe tribute trickle trickle trickle trick trick try try try try trigger trigger trigger trigger trigger trigger trigger trillion trill trimester trim trim trim trim trim trim triple triple triple triple triple trip trip triumph trivialize trivia troops troops tro trot trot trot trouble trouble trouble troublemaker trouble trouble trouble trouble trough trough troupe trousers trout trucker truck-part truck truck truck trudge trump trumpet trumpet trumpet trundle trunk trust trust trustee trustee trust trust trust trust trust trust truth try try tryout try try try try t-shirt t-shirt tube tub tuck tuck tuck tuck tug tug tuition tuition tumble tumble tumble tumble tumble tumble tumor tumor-suppressor tune tune tune tune tune tunnel turban turbine turboprop turmoil turnaround turn turn turn turn turn-on turn turn turn turn turn turn turtle tusk tutor tutorial tutor tuxedo tv tweezers twenty twiddle twin twist twist twist twist twist twitch two-hundredth two-seventh two-third twothird two-third tycoon tie type typeface type typewriter typhoon typify typify ufo ulcer ultimatum umbrella un-advertiser unban unblock uncertainty unchlorinate uncle unconsolidate unconsolidate uncover uncover uncover uncover underclass undercut undercut undercut undercut undercut underestimate underestimate underfund undergird undergo undergo undergo undergo underlie underline underline underlie underlie undermine undermine undermine undermine undermine underperform underperformer underperform underperform underperform underpin underpin underpin underprice underreact underscore underscore underscore underscore underscore underscore underscore undersell understaff understand\/adopt understand understand understand understand understate understate understate understate understand understand undertake undertake undertaking undertake undertone undertake underutilize undervalue undervalue underweight undergo underwhelm underwiter underwriter underwriter underwrite underwrite underwrite underwriting underwrite underwrite underwrite underwrite undo undo undo undulate undulate unearth unexecute unfaze unfocus unfold unfold unfold unfold unhinge unhock unhusk unify uniform unify unify unify unionist unionize union union unite Unite unite unite unitholder unitholder unit unit university university unknown unleash unleash unleash unleash unleash unleash unload unload unload unload unload unlock unlock unlock unmask unmask unmaterialize unnerve unnerve unplug unpolarize unravel unravel unravel unravel unroll unroll unseat unseat unsettle unsettle unveil unveil unveil unveil unveil unveil unwaver unwind unwind update update update update upgrade upgrade upgrade upgrade upgrade upgrade upheaval uphold uphold uphold uphold uphold uphold up up uproot ups-and-downs upset 
upset upset upset upset upset up uptick ural urge urge urge urge urge urge urging urge use use Use user user use use use use use usher usher usher usher usher use Use usurp u.s. utility utility utilize utmost utopian utterance utter utter uvb vacancy vacancy vacate vacate vacate vacate vacationer vacation vacation vaccine vacillate vacuum vagabond vagary validate valuation value value value value value value value value valve vandalize vanish vanish vanish vanish vanish vanish vanity van vapor variable variation vary vary vary variety vary vary vary vary vase vassal vault vault vcr veer veer vegetable vegetable vegetarian vehicle veil veil vend vendor vent ventilate ventilate venture venture venture vent venue verdict verge verify verify version vessel vest vestment vest veteran veterinarian veto veto veto veto veto have have vex viaduct vibrate vicar vice vicissitude vicitim victimize victim victim victory victory videocassette videodisk video videotape videotape vietnamese view view viewer viewer viewing view viewpoint view view view view view vignette villager village village villain vindicate vineyard vintage vinyl-product violate violate violate violate violate violate violation virgin virtue virtuoso visage visionary vision visit visit visit visit visitor visitor visit visit visit visit visualize visualize visual vitiate voice voice voice voice voice voice voice voice void void volume volunteer volunteer volunteer volunteer vomit vote vote vote-getter voter voter vote vote vote vote vote voucher vow vow vowel vowel vow vow vow vie waddle wade wad wafer waffle waffle waft wage wages wage wage wagon wag wail wail wait wait waiter wait wait wait wait wait waive waive waiver waiver waive waive waive wake wake walk walkie-talkie walk walk walk walkout walk walk walk walk walk walk wallcovering wallet wallop wallow wall wall wander wander wander wander wane wane wane wane wane wane wane want want want want want want want ward warden ward warehouse wares warhead warm warm warm warm warm warn warn warn warn warner warning warn warn warn warn warrant warranty warrant warrant warrant warrant warren war warrior war war war wart war wash wash wash wash wash waste waste waste waste waste waste be be be watchdog watch watch watcher watch watch watch watch watch watch watch water waterfall water water water waterworks watt wave wavelength waver waver wave wave wax wax way way weaken weaken weaken weaken weaken weaken weaken weaken weakness wean weapon wear wear wear wear wear wear weasle weather weather weave weave web wed wedding wedge wedge wedge wednesday wed wed weed weekday weekend weekly weeknight week weeper weep weigh weigh weigh weigh weigh weight weighting weight weight weigh weigh welcome welcome welcome welcome welcome welcome welcome weld well well-state well-wisher go go weep be be be be we whack whack whack whale wheelbase wheel wheel wheeze when-issue while whimper whim whip whipping whip whipsaw whipsaw whip whip whirlwind whir whisk whisper whisper whisper whistle whistle whistle whistle whites whitewash whittle whizz wholesaler wholesaler wholesale whoop whoosh widen widen widen widen widen widen widen widget widow widow wield wield wield wield wiggle wiggle wiggle wig wilfr willie will will will wimp wimp windfall wind window windshield wind wind wind winery wine wing wink winner winner win win win win winter win win wipe wipe wipe wipe wipe wire wire wiretap wire wisecrack wish wish wish wish wish wish wish-list wish wish witch withdrawal withdraw withdraw withdraw withdraw 
wither wither withhold withhold withholding withhold withhold withhold withhold withstand withstand withstand withstand witness witness witness witness witness wife wife wizard woe wake wake wolf womanize woman woman wonderbar wonder wonder wonder wonder wonder wonder won win win win woodchuck wood-product wood woo woo woo word word-processing word wear workbook workday work work worker worker worker workings work work work workman worksheet worksheet work workstation work work work work work world-affair world worm wear wear worry worry worry worrier worry worry worry worry worry worry worsen worsen worsen worsen wound wound wind wind wind weave wow wrack wrack wrack wrangle wrap wrap wrapper wrap wrap wrap wreak wreak wreak wreck wreck wreck wrench wrench wrest wrestler wrestle wrestle wrestle wrest wriggle wring wrist write-downs writedown write-off writeoff writer\/producers writer write write write write writhe writings write write write write wrong write write work wring x-ray yacht yank yank yank yank yard yearbook yearling year year year yell yell yellow yell yell yelp yen yield yield yield yield yield yield yield yield youngster younker youth yuppie zapper zap zero zero zero zero zigzag zig-zag zip zip zloty zombie zone zone zone zoom zoom zoom ================================================ FILE: ccgbank/data/wsj_0595Corrected.auto ================================================ ID=wsj_0595.1 PARSER=GOLD NUMPARSE=1 ( ( () ) ( ( () () ) ( () ( () ( ( () ) ( () ( () ) ) ) ) ) ) ) ID=wsj_0595.2 PARSER=GOLD NUMPARSE=1 ( ( ( () ( ( () ) ( ( () () ) ( ( () () ) ) ) ) ) ( () ( () ( () ( () ( () ( () ( ( () ) ( ( () ( ( () ( () () ) ) ( ( () () ) ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.3 PARSER=GOLD NUMPARSE=1 ( ( ( ( () ( () ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ( () ( ( () ( () () ) ) ( () ( ( ( () () ) ) ( () ( ( ( ( ( ( () () ) () ) () ) ) ( () ( () ) ) ) ( () ( ( () () ) ( () ( ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.4 PARSER=GOLD NUMPARSE=1 ( ( ( () () ) ( () ( ( ( () ( () () ) ) ( () ( () ( ( () ( () () ) ) ) ) ) ) ( () ( ( () ( () () ) ) ( ( ( () ) () ) ) ) ) ) ) ) () ) ID=wsj_0595.5 PARSER=GOLD NUMPARSE=1 ( ( ( () () ) ( () ( ( () ) ( () ( ( () ( () ( () () ) ) ) ( () ( () ( ( ( ( () () ) () ) () ) ( () ( ( ( ( ( () () ) () ) ) ( () ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ( () ( () ( ( ( () () ) ) ( () ( ( () ) ( () ( ( ( () ( () ) ) ) ( () ( ( ( () () ) ) ( () ( ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.6 PARSER=GOLD NUMPARSE=1 ( ( () ( () ( ( () ) ( () ( ( () () ) ) ) ) ) ) ( ( ( () ( () ) ) ( () ( ( () ) ( () ( () ) ) ) ) ) () ) ) ID=wsj_0595.7 PARSER=GOLD NUMPARSE=1 ( ( () ( ( () () ) ( () ( () ( ( () ( ( ( ( () ) () ) ( () () ) ) ( ( () ( ( ( () () ) () ) () ) ) () ) ) ) ( () ( ( ( ( () () ) () ) ( () () ) ) ( () ( ( () ( ( () ( ( ( () () ) () ) () ) ) ( () ( () ( () ( () () ) ) ) ) ) ) ( () ( () ( ( () ( ( ( () () ) () ) () ) ) ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.8 PARSER=GOLD NUMPARSE=1 ( ( ( () () ) ( ( () ( () () ) ) ( () ( ( ( () ( () () ) ) () ) ) ) ) ) () ) ID=wsj_0595.9 PARSER=GOLD NUMPARSE=1 ( ( ( () ( () () ) ) ( () ( ( () ) ( ( () () ) ( ( ( ( ( () () ) () ) () ) ) ( () ( () ( ( () ) ( () ( ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.10 PARSER=GOLD NUMPARSE=1 ( ( () ( ( ( () () ) ( () ( ( ( () () ) () ) ) ) ) ( () ( ( ( ( ( () () ) () ) () ) ) ( () ( () ( () ( ( () () ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.11 PARSER=GOLD NUMPARSE=1 ( ( () ( ( ( () () ) ( () ( 
( ( ( ( () () ) ) ) () ) () ) ) ) ( () ( ( ( () ) ( () ( ( () () ) ( () ( ( () () ) ) ) ) ) ) ( () ( () ( () ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.13 PARSER=GOLD NUMPARSE=1 ( ( ( () ( () () ) ) ) ( () ( () ( () ( ( () () ) ) ) ) ) ) ID=wsj_0595.14 PARSER=GOLD NUMPARSE=1 ( ( ( ( ( ( () () ) ( () ( () ( () () ) ) ) ) ) ( ( () () ) ( () ( ( () () ) () ) ) ) ) ( () ( ( ( () ( () () ) ) ) () ) ) ) () ) ID=wsj_0595.16 PARSER=GOLD NUMPARSE=1 ( ( ( ( () () ) ) ( ( () () ) ( () ( () ( ( ( ( () ( () ( () ( () () ) ) ) ) ) ( () ( () ( () ( () ( () () ) ) ) ) ) ) ( () ( () ( () ( ( ( () () ) ( () ( () ) ) ) ( () ( () ( () ( ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.17 PARSER=GOLD NUMPARSE=1 ( ( () ( ( ( ( () () ) ) ( () ( ( ( ( () ( () ( () ( () () ) ) ) ) ) ( () ( () ( () () ) ) ) ) ( () ( ( ( () () ) ) ( () ( () ( () () ) ) ) ) ) ) ) ) ( () ( () ( ( ( ( () ( ( () () ) () ) ) ( ( () ( ( ( ( () () ) () ) () ) ) ) () ) ) ( () ( () ) ) ) ( ( ( () () ) ( () ( ( () () ) ) ) ) ( () ( ( ( ( () () ) ( () ( ( () () ) ) ) ) ( () ( () ( () ( () ( () () ) ) ) ) ) ) ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.18 PARSER=GOLD NUMPARSE=1 ( () ( ( () () ) ( ( ( ( ( ( () () ) ( () ( () ) ) ) ( () ( () ( () ( () ( () () ) ) ) ) ) ) ( () ( ( () ( ( () () ) () ) ) ( () ( ( () () ) () ) ) ) ) ) () ) () ) ) ) ID=wsj_0595.19 PARSER=GOLD NUMPARSE=1 ( ( () ( ( ( () ( ( () () ) () ) ) ( ( () ( ( () () ) ) ) () ) ) ( ( () ( ( () () ) ) ) ( () ( ( () () ) ( () ( () ( () ( ( () () ) () ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.20 PARSER=GOLD NUMPARSE=1 ( ( ( ( () () ) ) ( ( ( () ( () ( () ( () ( () () ) ) ) ) ) () ) ( () ( ( () () ) ( () ( ( () () ) ( () ( ( () ) ( () ( ( ( () () ) ) ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.21 PARSER=GOLD NUMPARSE=1 ( ( ( () () ) ( () ( () ( () ( ( ( ( () () ) () ) ) ( ( () ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.22 PARSER=GOLD NUMPARSE=1 ( ( ( () ( () ( () ( () ( () () ) ) ) ) ) ( ( ( ( () () ) () ) () ) ( () ( ( ( ( () ( () ( () () ) ) ) ) ( () ( () ( ( ( () () ) () ) ) ) ) ) ( () ( () ( ( ( () ( () ( () () ) ) ) ) ( ( () ( () () ) ) ( () ( ( () ( ( () () ) () ) ) ( () ( ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.23 PARSER=GOLD NUMPARSE=1 ( ( () ( ( () ( ( ( () ) () ) () ) ) ( ( () ( () ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ( () ( ( () () ) ( () ( () ( ( () ( () () ) ) ( () ( ( ( () () ) ( ( () () ) ) ) ( () ( ( () () ) ( () ( ( () ) ( () ( ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.24 PARSER=GOLD NUMPARSE=1 ( ( ( () ( () () ) ) ) ( () ( ( () () ) ) ) ) ID=wsj_0595.25 PARSER=GOLD NUMPARSE=1 ( ( ( ( () ( () ( () ( () ( () ( () () ) ) ) ) ) ) ( ( ( () ( ( () () ) ) ) ( () ( ( ( ( () () ) ) () ) () ) ) ) ) ) ( () ( ( ( () ( () ( () ( () ( ( () ( () () ) ) ) ) ) ) ) () ) ( () ( () ( ( () ) ( () ( ( () ( () ( () ( () () ) ) ) ) ( () ( ( () () ) ( () ( () ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.26 PARSER=GOLD NUMPARSE=1 ( ( ( ( () ( () () ) ) ) ( () ( () ( () ( ( () ( () () ) ) ( () ( () ( () ( ( () () ) ( () () ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.27 PARSER=GOLD NUMPARSE=1 ( ( ( ( ( () ) () ) ( ( () () ) ( () () ) ) ) ( ( ( () ( () ( () () ) ) ) () ) ( ( ( () () ) ( () ( () () ) ) ) ( () ( ( () () ) ) ) ) ) ) () ) ID=wsj_0595.28 PARSER=GOLD NUMPARSE=1 ( ( () ( () ( ( () ( ( () ( () ( () () ) ) ) ) ) ( () ( () ( ( () () ) ( () ( ( () ( ( ( () ) ( () ( () ) ) ) ( () () ) ) ) ( () ( () ( ( ( () () ) ) ( () ( ( ( () ( () ( () () ) ) ) () ) ) ) ) ) ) ) ) ) ) ) ) ) ) 
() ) ID=wsj_0595.29 PARSER=GOLD NUMPARSE=1 ( ( ( ( () ) ( () ( ( () ( () () ) ) ( () ( () ( () ( () ( () ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ( () ( () ( ( ( ( () ( () () ) ) ) ( () ( () ( ( () ( () () ) ) ( () () ) ) ) ) ) ( () ( ( ( ( () () ) () ) ( () ( ( () ( () () ) ) ) ) ) ( () ( () ( ( ( () ( ( () ( () () ) ) ( () () ) ) ) ( () ( ( ( () ) () ) () ) ) ) ( () ( () ( ( () () ) ( () ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.30 PARSER=GOLD NUMPARSE=1 ( ( () ( ( () () ) ( ( ( () () ) ( () ( () () ) ) ) ( () ( ( () ( () ( () () ) ) ) ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.31 PARSER=GOLD NUMPARSE=1 ( ( ( () ( () ) ) ( () ( ( ( ( () () ) ) ( () ( ( () ( ( () ( () () ) ) ( () () ) ) ) ( () ( () ( () ( () () ) ) ) ) ) ) ) ( () ( ( () ( () ( () () ) ) ) ( ( ( () ( () ) ) ( () ( ( () ( () ( ( () () ) ) ) ) ( () ( () ( () ( ( () () ) ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.32 PARSER=GOLD NUMPARSE=1 ( ( () ( () ( () ( ( ( ( ( () () ) ) ( () ( () ( () ( () () ) ) ) ) ) ( () ( () ( () ( () ( () () ) ) ) ) ) ) ( () ( ( ( () () ) ) ( () ( () ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.33 PARSER=GOLD NUMPARSE=1 ( ( ( ( () ( () ( () ( ( () ( () () ) ) ( () ( () ( () ( () ( ( () ) ( ( () () ) ( () ( ( () ( () ( () ) ) ) ( () ( () ) ) ) ) ) ) ) ) ) ) ) ) ) ) ) ( () ( ( ( ( () () ) ) ( () ( () ( () () ) ) ) ) ( ( () () ) ( () ( () ( ( ( ( ( () () ) () ) ) ( () ( ( () ( () ( () () ) ) ) ) ) ) ( () ( () ( () () ) ) ) ) ) ) ) ) ) ) () ) ID=wsj_0595.34 PARSER=GOLD NUMPARSE=1 ( ( () ( () () ) ) ) ID=wsj_0595.35 PARSER=GOLD NUMPARSE=1 ( ( ( ( ( ( ( () ( () () ) ) ( () ( () () ) ) ) ) ( () ( ( ( () () ) ( () ( () ( () ( () ) ) ) ) ) ( () ( ( () () ) ) ) ) ) ) ( () ( ( ( () () ) ( () ( ( () () ) ) ) ) () ) ) ) () ) () ) ID=wsj_0595.36 PARSER=GOLD NUMPARSE=1 ( ( ( ( ( () () ) ) ( () ( ( () ( () ( () () ) ) ) ( () ( () ( ( () () ) ( () ( ( () () ) ) ) ) ) ) ) ) ) ( () ( ( () ( () ( () () ) ) ) () ) ) ) () ) ID=wsj_0595.37 PARSER=GOLD NUMPARSE=1 ( ( () ( () ( ( ( () () ) ( () ( ( ( () () ) ) ( () ( ( () () ) ( () ( () ( () () ) ) ) ) ) ) ) ) ( () ( ( ( ( () ( () ( ( () () ) ) ) ) ( () ( () ( () ( () () ) ) ) ) ) () ) ( ( () ( () ( () () ) ) ) ( () ( () ( ( () ( ( ( ( () () ) () ) ) ( () ( () ( () () ) ) ) ) ) ( () ( ( ( () () ) ) ( () ( () ) ) ) ) ) ) ) ) ) ) ) ) ) () ) ================================================ FILE: ccgbank/extract/add-chunks.xsl ================================================ ================================================ FILE: ccgbank/extract/convert-to-graph.xsl ================================================ ================================================ FILE: ccgbank/extract/convert-to-hlds.xsl ================================================ Error: node with id = should be a reference (with idref) ================================================ FILE: ccgbank/extract/grammar.xml ================================================ ================================================ FILE: ccgbank/extract/raise-nodes.xsl ================================================ Next ================================================ FILE: ccgbank/models/hypertagger/ht-prior.flm ================================================ ## A prior probability model that estimates p(supertag | word, pos) ## with smoothed back-off (a "soft tagging dictionary" if you will). 1 ## lexical category ("supertag") (T) given POS tag (P) and word (W). 
T : 2 P(0) W(0) t_p0w0.count t_p0w0.lm 3 P0,W0 W0 wbdiscount gtmin 15 P0 P0 wbdiscount 0 0 wbdiscount ================================================ FILE: ccgbank/models/hypertagger/ht.config ================================================ # location of FLM file and vocab file for prior model #priorModel=ht-prior.flm #priorModelVocab=vocab.st # maxent model (trained using ZL's toolkit) for hypertagging maxentModel=ht.mod # beam width for beta-best search through the tags returned by the maxent model betas = 0.075 0.03 0.01 0.003 0.001 0.0003 0.0001 0.00001 # if using tagging dictionaries (instead of prior models) specify the dictionary threshold # ("K" value from Clark 2002) dictk=10 # specify tagging dictionaries. these are only used if prior models are NOT specified! #wDict=word.dict.min10 #posDict=pos.dict.min10 # filename for configuration of POS tagger posConfig=pos.config # arg names and short names argnames = Arg0:A0 Arg1:A1 Arg1a:A1a Arg1b:A1b Arg2:A2 Arg2a:A2a Arg2b:A2b Arg3:A3 Arg4:A4 Arg5:A5 ================================================ FILE: ccgbank/models/hypertagger/ht2.config ================================================ # location of FLM file and vocab file for prior model #priorModel=stprior.flm #priorModelVocab=vocab.st # maxent model (trained using ZL's toolkit) for hypertagging maxentModel=ht.mod # second-pass model maxentModel2=ht2.mod # beam width for beta-best search through the tags returned by the maxent model # these will be applied to both hypertagging models betas = 0.075 0.03 0.01 0.003 0.001 0.0003 0.0001 0.00001 # if using tagging dictionaries (instead of prior models) specify the dictionary threshold # ("K" value from Clark 2002) #dictk=10 # specify tagging dictionaries. these are only used if prior models are NOT specified! #wDict=word.dict.min10 #posDict=pos.dict.min10 # filename for configuration of POS tagger posConfig=pos.config # arg names and short names argnames = Arg0:A0 Arg1:A1 Arg1a:A1a Arg1b:A1b Arg2:A2 Arg2a:A2a Arg2b:A2b Arg3:A3 Arg4:A4 Arg5:A5 ================================================ FILE: ccgbank/models/hypertagger/ht2.train.config ================================================ # config file for generating events during realizer training # # a single beta level (not the most strict one) is used, rather # than adaptive backoff (todo: try reverse adaptation) # maxent model (trained using ZL's toolkit) for hypertagging maxentModel=ht.mod # second-pass model maxentModel2=ht2.mod # beam width for beta-best search through the tags returned by the maxent model # these will be applied to both hypertagging models betas = 0.001 # filename for configuration of POS tagger posConfig=pos.config ================================================ FILE: ccgbank/models/hypertagger/pos.config ================================================ # this model gives priors on POS tags. #priorModel=posprior.flm #priorModelVocab=vocab.pos # this is a Zhang Le-style MEM. maxentModel=pos.mod # you can also configure the tagger's beta-value here using the key "beta" ================================================ FILE: ccgbank/models/hypertagger/posprior.flm ================================================ ## A prior probability model that estimates p(pos | word) ## with smoothed back-off (a "soft tagging dictionary" if you will). 1 ## POS tag (P) given word (W) with a back-off to the prior on the POS itself. 
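The `betas` lines in ht.config and ht2.config above configure a beta-best beam over the tag distribution the maxent model returns for each word: a tag survives only if its probability is within a factor beta of the most probable tag, and the listed beta values are tried strictest-first, loosening the beam only when the previous pass fails downstream. A minimal sketch of that pruning rule follows; the function names, the dictionary-shaped per-word distributions, and the `accepts` callback are illustrative assumptions, not OpenCCG's actual API.

def beta_best(tag_probs, beta):
    # Keep every tag whose probability is within a factor beta of the most
    # probable tag; beta=1 reduces to single-best tagging.
    best = max(tag_probs.values())
    return [tag for tag, p in tag_probs.items() if p >= beta * best]

def multitag(per_word_probs, betas, accepts):
    # Betas are tried strictest-first (the largest value keeps the fewest tags);
    # the beam is loosened only when the downstream consumer rejects the result.
    assignment = []
    for beta in betas:
        assignment = [beta_best(probs, beta) for probs in per_word_probs]
        if accepts(assignment):
            break
    return assignment

Under this rule the lists above, which run from 0.075 down to 0.00001, move from aggressive pruning toward keeping nearly the whole tag distribution.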
P : 1 W(0) p_w0.count p_w0.lm 2 W0 W0 wbdiscount gtmin 1 0 0 wbdiscount gtmin 1 ================================================ FILE: ccgbank/models/hypertagger/vocab.flm ================================================ ## flm file for determining the vocab 3 ## word (W) unigram W : 0 w.count w.lm 1 0 0 ## supertag (T) unigram T : 0 t.count t.lm 1 0 0 ## pos tag (P) unigram P : 0 p.count p.lm 1 0 0 ================================================ FILE: ccgbank/models/parser/binary.flm ================================================ ## binary step of Hockenmaier's HWDep generative syntactic model ## with added pos tags 6 ## expansion (E) given parent cat (P), lexcat parent (CP), head postag parent (T) and headword parent (W) E : 4 P(0) CP(0) T(0) W(0) e_p0cp0t0w0.count e_p0cp0t0w0.lm 5 P0,CP0,T0,W0 W0 wbdiscount gtmin 10 P0,CP0,T0 T0 wbdiscount P0,CP0 CP0 wbdiscount P0 P0 wbdiscount 0 0 wbdiscount ## head (H) given expansion (E), parent cat (P) and lexcat parent (CP) H : 3 E(0) P(0) CP(0) h_e0p0cp0.count h_e0p0cp0.lm 4 E0,P0,CP0 CP0 wbdiscount E0,P0 P0 wbdiscount E0 E0 wbdiscount 0 0 wbdiscount ## sibling (S) given expansion (E), parent cat (P), head cat (H), lexcat parent (CP), head postag parent and headword parent (W) S : 6 E(0) P(0) H(0) CP(0) T(0) W(0) s_e0p0h0cp0t0w0.count s_e0p0h0cp0t0w0.lm 7 E0,P0,H0,CP0,T0,W0 W0 wbdiscount gtmin 10 E0,P0,H0,CP0,T0 T0 wbdiscount E0,P0,H0,CP0 CP0 wbdiscount E0,P0,H0 H0 wbdiscount E0,P0 P0 wbdiscount E0 E0 wbdiscount 0 0 wbdiscount ## lexcat sibling (CS) given expansion (E), sibling (S), parent cat (P) and head cat (H) CS : 4 E(0) S(0) P(0) H(0) cs_e0s0p0h0.count cs_e0s0p0h0.lm 5 E0,S0,P0,H0 H0 wbdiscount E0,S0,P0 P0 wbdiscount E0,S0 S0 wbdiscount E0 E0 wbdiscount 0 0 wbdiscount ## head postag sibling (TS) given lexcat sibling (CS), sibling (S), parent cat (P), head cat (H), head postag parent (T) and headword parent (W) TS : 6 CS(0) S(0) P(0) H(0) T(0) W(0) ts_cs0s0p0h0t0w0.count ts_cs0s0p0h0t0w0.lm 7 CS0,S0,P0,H0,T0,W0 W0 wbdiscount gtmin 10 CS0,S0,P0,H0,T0 T0 wbdiscount CS0,S0,P0,H0 H0 wbdiscount CS0,S0,P0 P0 wbdiscount CS0,S0 S0 wbdiscount CS0 CS0 wbdiscount 0 0 wbdiscount ## headword sibling (WS) given lexcat sibling (CS), sibling (S), parent cat (P), head cat (H), head postag parent (T), headword parent (W) and head postag sibling (TS) WS : 7 CS(0) S(0) P(0) H(0) T(0) W(0) TS(0) ws_cs0s0p0h0t0w0ts0.count ws_cs0s0p0h0t0w0ts0.lm 8 CS0,S0,P0,H0,T0,W0,TS0 TS0 wbdiscount gtmin 5 CS0,S0,P0,H0,T0,W0 W0 wbdiscount gtmin 10 CS0,S0,P0,H0,T0 T0 wbdiscount gtmin 5 CS0,S0,P0,H0 H0 wbdiscount CS0,S0,P0 P0 wbdiscount CS0,S0 S0 wbdiscount CS0 CS0 wbdiscount 0 0 wbdiscount ================================================ FILE: ccgbank/models/parser/gen-events.prefs ================================================ ================================================ FILE: ccgbank/models/parser/leaf.flm ================================================ ## leaf step of Hockenmaier's HWDep generative syntactic model ## with added pos tags 1 ## expansion (E) given parent cat (P), lexcat parent (CP), head postag parent (T) and headword parent (W) E : 4 P(0) CP(0) T(0) W(0) e_p0cp0t0w0.count e_p0cp0t0w0.lm 5 P0,CP0,T0,W0 W0 wbdiscount gtmin 10 P0,CP0,T0 T0 wbdiscount P0,CP0 CP0 wbdiscount P0 P0 wbdiscount 0 0 wbdiscount ================================================ FILE: ccgbank/models/parser/model.init ================================================ 1 genlogprob 1 ================================================ FILE: ccgbank/models/parser/parse.prefs 
================================================ ================================================ FILE: ccgbank/models/parser/top.flm ================================================ ## top step of Hockenmaier's HWDep generative syntactic model ## with added pos tags 4 ## head (H) given expansion (E), parent cat (P) and lexcat parent (CP) ## nb: E, P and CP should all equal when invoked H : 3 E(0) P(0) CP(0) h_e0p0cp0.count h_e0p0cp0.lm 4 E0,P0,CP0 CP0 wbdiscount E0,P0 P0 wbdiscount E0 E0 wbdiscount 0 0 wbdiscount ## lexcat top (CT) given parent cat (P) CT : 1 P(0) ct_p0.count ct_p0.lm 2 P0 P0 wbdiscount 0 0 wbdiscount ## head postag top (TT) given lexcat parent (CP) TT : 1 CP(0) tt_cp0.count tt_cp0.lm 2 CP0 CP0 wbdiscount 0 0 wbdiscount ## headword top (WT) given lexcat parent (CP), head postag top (TT) WT : 2 CP(0) TT(0) wt_cp0tt0.count wt_cp0tt0.lm 3 CP0,TT0 TT0 wbdiscount gtmin 5 CP0 CP0 wbdiscount 0 0 wbdiscount ================================================ FILE: ccgbank/models/parser/unary.flm ================================================ ## unary step of Hockenmaier's HWDep generative syntactic model ## with added pos tags 2 ## expansion (E) given parent cat (P), lexcat parent (CP), head postag parent (T) and headword parent (W) E : 4 P(0) CP(0) T(0) W(0) e_p0cp0t0w0.count e_p0cp0t0w0.lm 5 P0,CP0,T0,W0 W0 wbdiscount gtmin 10 P0,CP0,T0 T0 wbdiscount P0,CP0 CP0 wbdiscount P0 P0 wbdiscount 0 0 wbdiscount ## head (H) given expansion (E), parent cat (P) and lexcat parent (CP) H : 3 E(0) P(0) CP(0) h_e0p0cp0.count h_e0p0cp0.lm 4 E0,P0,CP0 CP0 wbdiscount E0,P0 P0 wbdiscount E0 E0 wbdiscount 0 0 wbdiscount ================================================ FILE: ccgbank/models/parser/vocab.flm ================================================ ## flm file for determining the vocab 13 ## headword (W) unigram W : 0 w.count w.lm 1 0 0 ## headword top (WT) WT : 0 wt.count wt.lm 1 0 0 ## headword sibling (WS) WS : 0 ws.count ws.lm 1 0 0 ## expansion (E) E : 0 e.count e.lm 1 0 0 ## head (H) cat H : 0 h.count h.lm 1 0 0 ## parent (P) cat P : 0 p.count p.lm 1 0 0 ## lexcat parent (CP) CP : 0 cp.count cp.lm 1 0 0 ## lexcat top (CT) CT : 0 ct.count ct.lm 1 0 0 ## head postag top (TT) TT : 0 tt.count tt.lm 1 0 0 ## head postag parent (T) T : 0 t.count t.lm 1 0 0 ## sibling (S) cat S : 0 s.count s.lm 1 0 0 ## lexcat sibling (CS) CS : 0 cs.count cs.lm 1 0 0 ## head postag sibling (TS) TS : 0 ts.count ts.lm 1 0 0 ================================================ FILE: ccgbank/models/realizer/alph.init ================================================ 6 genlogprob 1 $ngram0 1 $ngram1 1 $ngram2 1 $ngram3 1 $deplen 1 ================================================ FILE: ccgbank/models/realizer/gen-events.prefs ================================================ ================================================ FILE: ccgbank/models/realizer/model.init ================================================ 2 genlogprob 1 $ngram0 1 ================================================ FILE: ccgbank/models/realizer/rz-test.prefs ================================================ ================================================ FILE: ccgbank/models/realizer/stp3.flm ================================================ ## Supertags FLM ## Supertag (T) based on POS tags (P), plus POS trigram 2 ## POS trigram P : 2 P(-1) P(-2) p_p1p2.count p_p1p2.lm 3 P1,P2 P2 kndiscount P1 P1 kndiscount 0 0 ndiscount ## 3gram with prev two POSs T : 3 P(0) P(-1) P(-2) t_p0p1p2.count t_p0p1p2.lm 4 P0,P1,P2 P2 kndiscount P0,P1 P1 kndiscount P0 P0 
kndiscount 0 0 kndiscount ================================================ FILE: ccgbank/models/supertagger/pos.config ================================================ # an example POS tagger config file (D.N. Mehay) # change to suit your needs (e.g., replace the following paths # with paths that point to the relevant files). # "basic" = Ratnaparkhi-style features, inter alia # there is nothing else in OpenCCG, at the moment. taggerType=basic # this model gives priors on POS tags. priorModel=posprior.flm priorModelVocab=vocab.pos # this is a Zhang Le-style MEM. maxentModel=pos.mod # this last must be an ARPA-formatted n-gram model over POS tags (with and ) # (7- to 9-grams work well, without much memory usage). sequenceModel=pos.lm # tagging beam width (beta=1 means "single-best", i.e., unitagging) beta=0.044 #beta=0.1 #beta=1.0 # tagging algorithm. (choose from 'forward' and 'forward-backward') # the former is faster, the latter is better. taggingAlgorithm=forward-backward # self-explanatory (will potentially print certain errors and warnings, if set to 'true') verbose=true ================================================ FILE: ccgbank/models/supertagger/posprior.flm ================================================ ## A prior probability model that estimates p(pos | word) ## with smoothed back-off (a "soft tagging dictionary" if you will). 1 ## POS tag (P) given word (W) with a back-off to the prior on the POS itself. P : 1 W(0) p_w0.count p_w0.lm 2 W0 W0 wbdiscount gtmin 1 0 0 wbdiscount gtmin 1 ================================================ FILE: ccgbank/models/supertagger/st.config ================================================ # an example supertagger config file (D.N. Mehay) # change to suit your needs (e.g., replace the following paths # with paths that point to the relevant files). # this is a comment ("basic" means C,C & Vadas (2006)-style features). # there is nothing else in OpenCCG, at the moment. taggerType=basic # this model gives priors on supertags. #priorModel=stprior.flm #priorModelVocab=vocab.st # this is a Zhang Le-style MEM. maxentModel=st.mod # this last must be an ARPA-formatted n-gram model over supertags (with and ) # (7- to 9-grams work well, without much memory usage). sequenceModel=st.lm # tagging beam widths (first try beta1, then beta2 if that doesn't give a parse, etc.). # give as many as you want, but keep in mind that your parser will try them all out # (which may make it try to parse in vain -- i.e., when it just doesn't have the # categories to do it). #betas = 0.024 0.003875 0.001225 0.0005377 0.000275 0.0000925 0.00004 #betas = 0.075 0.03 0.01 0.005 0.001 0.0005 betas = 0.075 0.03 0.01 0.003 0.001 0.0003 0.0001 0.00001 # if not using the prior model (above), you must give 'K' values (see Clark and Curran (2007)). # the first one is for all beta values but the last. the second one is for # the last. firstK=20 lastK=100 # also, if you're not using the prior models, you need to specify a word-keyed tagging # dictionary (this interacts with the 'K' values) and a POS-keyed tagging dictionary. wDict=word.dict.min10 posDict=pos.dict.min10 # use automatic POS features? (even if you are automatically POS tagging, # but only using single-best, set this to false). autoPOS=true # if autoPOS=true, you need to specify a POS config file. posConfig=pos.config # tagging algorithm. (choose from 'forward' and 'forward-backward') # the former is faster, the latter is better. 
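Read together, the comments in the parser's binary.flm above list the conditional distributions that make up one binary expansion step of Hockenmaier's HWDep model. Taking them as a single generative step, the step probability factors as below, with variable names exactly as in the FLM comments; this is only a restatement of those comments in equation form, not something taken from the OpenCCG sources.

\[
\begin{aligned}
P(\mathrm{binary\ step}) = {} & P(E \mid P, CP, T, W) \cdot P(H \mid E, P, CP) \cdot P(S \mid E, P, H, CP, T, W) \\
 & {} \cdot P(CS \mid E, S, P, H) \cdot P(TS \mid CS, S, P, H, T, W) \cdot P(WS \mid CS, S, P, H, T, W, TS)
\end{aligned}
\]

Each factor backs off through the context variables listed after it in the FLM file, with Witten-Bell discounting (the wbdiscount entries) at every level.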
taggingAlgorithm=forward-backward # self-explanatory (will print certain errors and warnings, if set to 'true') verbose=true # option to include gold tags #includeGold=true # POS-specific relaxation #betaMultipliers=, 0.01 : 0.01 . 0.1 CC 0.1 betaMultipliers=, 0.01 : 0.01 . 0.1 ================================================ FILE: ccgbank/models/supertagger/st.config.train ================================================ # an example supertagger config file (D.N. Mehay) # change to suit your needs (e.g., replace the following paths # with paths that point to the relevant files). # this is a comment ("basic" means C,C & Vadas (2006)-style features). # there is nothing else in OpenCCG, at the moment. taggerType=basic # this model gives priors on supertags. #priorModel=stprior.flm #priorModelVocab=vocab.st # this is a Zhang Le-style MEM. maxentModel=st.mod # this last must be an ARPA-formatted n-gram model over supertags (with and ) # (7- to 9-grams work well, without much memory usage). sequenceModel=st.lm # tagging beam widths (first try beta1, then beta2 if that doesn't give a parse, etc.). # give as many as you want, but keep in mind that your parser will try them all out # (which may make it try to parse in vain -- i.e., when it just doesn't have the # categories to do it). #betas = 0.024 0.003875 0.001225 0.0005377 0.000275 0.0000925 0.00004 #betas = 0.075 0.03 0.01 0.005 0.001 0.0005 betas = 0.075 0.03 0.01 # if not using the prior model (above), you must give 'K' values (see Clark and Curran (2007)). # the first one is for all beta values but the last. the second one is for # the last. firstK=20 lastK=100 # also, if you're not using the prior models, you need to specify a word-keyed tagging # dictionary (this interacts with the 'K' values) and a POS-keyed tagging dictionary. wDict=word.dict.min10 posDict=pos.dict.min10 # use automatic POS features? (even if you are automatically POS tagging, # but only using single-best, set this to false). autoPOS=false # if autoPOS=true, you need to specify a POS config file. posConfig=pos.config # tagging algorithm. (choose from 'forward' and 'forward-backward') # the former is faster, the latter is better. taggingAlgorithm=forward-backward # self-explanatory (will print certain errors and warnings, if set to 'true') verbose=true # option to include gold tags includeGold=true # POS-specific relaxation betaMultipliers=, 0.01 : 0.01 . 0.1 CC 0.1 ================================================ FILE: ccgbank/models/supertagger/st.noprior.config ================================================ # an example supertagger config file (D.N. Mehay) # change to suit your needs (e.g., replace the following paths # with paths that point to the relevant files). # this is a comment ("basic" means C,C & Vadas (2006)-style features). # there is nothing else in OpenCCG, at the moment. taggerType=basic # this model gives priors on supertags. use it. # nb: testing out no prior model here ... #priorModel=stprior.flm #priorModelVocab=vocab.st # this is a Zhang Le-style MEM. #maxentModel=st.mod maxentModel=st.noprior.mod # this last must be an ARPA-formatted n-gram model over supertags (with and ) # (7- to 9-grams work well, without much memory usage). sequenceModel=st.lm # tagging beam widths (first try beta1, then beta2 if that doesn't give a parse, etc.). # give as many as you want, but keep in mind that your parser will try them all out # (which may make it try to parse in vain -- i.e., when it just doesn't have the # categories to do it). 
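The dictk, firstK/lastK, wDict and posDict settings above follow the tagging-dictionary scheme of Clark (2002) and Clark and Curran (2007): frequent words are restricted to the categories the word-keyed dictionary lists for them, while rare words fall back to the POS-keyed dictionary. The sketch below shows that lookup under those assumptions; the data structures and function name are illustrative, not OpenCCG's internals.

def candidate_tags(word, pos, freq, word_dict, pos_dict, k):
    # Words seen at least k times in training keep only the categories the
    # word-keyed dictionary lists for them; rare or unseen words fall back
    # to the categories listed for their POS tag.
    if freq.get(word, 0) >= k and word in word_dict:
        return word_dict[word]
    return pos_dict.get(pos, set())

The firstK/lastK pair simply selects which cutoff is in force: firstK at every beta level except the last, lastK at the last (loosest) level, as the comments above state.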
#betas = 0.024 0.003875 0.001225 0.0005377 0.000275 0.0000925 0.00004 #betas = 0.075 0.03 0.01 0.005 0.001 0.0005 betas = 0.075 0.03 0.01 0.003 0.001 0.0003 0.0001 0.00001 # if not using the prior model (above), you must give 'K' values (see Clark and Curran (2007)). # the first one is for all beta values but the last. the second one is for # the last. firstK=20 lastK=100 # also, if you're not using the prior models, you need to specify a word-keyed tagging # dictionary (this interacts with the 'K' values) and a POS-keyed tagging dictionary. wDict=word.dict.min10 posDict=pos.dict.min10 # use automatic POS features? (even if you are automatically POS tagging, # but only using single-best, set this to false). autoPOS=true # if autoPOS=true, you need to specify a POS config file. posConfig=pos.config # tagging algorithm. (choose from 'forward' and 'forward-backward') # the former is faster, the latter is better. taggingAlgorithm=forward-backward # self-explanatory (will print certain errors and warnings, if set to 'true') verbose=true # option to include gold tags #includeGold=true # POS-specific relaxation #betaMultipliers=, 0.01 : 0.01 . 0.1 CC 0.1 betaMultipliers=, 0.01 : 0.01 . 0.1 ================================================ FILE: ccgbank/models/supertagger/stprior.flm ================================================ ## A prior probability model that estimates p(supertag | word, pos) ## with smoothed back-off (a "soft tagging dictionary" if you will). 1 ## lexical category ("supertag") (T) given POS tag (P) and word (W). T : 2 P(0) W(0) t_p0w0.count t_p0w0.lm 3 P0,W0 W0 wbdiscount gtmin 15 P0 P0 wbdiscount 0 0 wbdiscount ================================================ FILE: ccgbank/models/supertagger/vocab.flm ================================================ ## flm file for determining the vocab 3 ## word (W) unigram W : 0 w.count w.lm 1 0 0 ## supertag (T) unigram T : 0 t.count t.lm 1 0 0 ## pos tag (P) unigram P : 0 p.count p.lm 1 0 0 ================================================ FILE: ccgbank/original/models/postagger/pos.config ================================================ # an example POS tagger config file (D.N. Mehay) # change to suit your needs (e.g., replace the following paths # with paths that point to the relevant files). # "basic" = Ratnaparkhi-style features, inter alia # there is nothing else in OpenCCG, at the moment. taggerType=basic # this model gives priors on POS tags. priorModel=posprior.flm priorModelVocab=vocab.pos # this is a Zhang Le-style MEM. maxentModel=pos.mod # this last must be an ARPA-formatted n-gram model over POS tags (with and ) # (7- to 9-grams work well, without much memory usage). sequenceModel=pos.lm # tagging beam width (beta=1 means "single-best", i.e., unitagging) beta=0.044 #beta=0.1 #beta=1.0 # tagging algorithm. (choose from 'forward' and 'forward-backward') # the former is faster, the latter is better. taggingAlgorithm=forward-backward # self-explanatory (will potentially print certain errors and warnings, if set to 'true') verbose=true ================================================ FILE: ccgbank/original/models/postagger/posprior.flm ================================================ ## A prior probability model that estimates p(pos | word) ## with smoothed back-off (a "soft tagging dictionary" if you will). 1 ## POS tag (P) given word (W) with a back-off to the prior on the POS itself. 
P : 1 W(0) p_w0.count p_w0.lm 2 W0 W0 wbdiscount gtmin 1 0 0 wbdiscount gtmin 1 ================================================ FILE: ccgbank/original/models/supertagger/st.config ================================================ # an example supertagger config file (D.N. Mehay) # change to suit your needs (e.g., replace the following paths # with paths that point to the relevant files). # this is a comment ("basic" means C,C & Vadas (2006)-style features). # there is nothing else in OpenCCG, at the moment. taggerType=basic # this model gives priors on supertags. #priorModel=stprior.flm #priorModelVocab=vocab.st # this is a Zhang Le-style MEM. maxentModel=st.mod # this last must be an ARPA-formatted n-gram model over supertags (with and ) # (7- to 9-grams work well, without much memory usage). sequenceModel=st.lm # tagging beam widths (first try beta1, then beta2 if that doesn't give a parse, etc.). # give as many as you want, but keep in mind that your parser will try them all out # (which may make it try to parse in vain -- i.e., when it just doesn't have the # categories to do it). betas = 0.024 0.003875 0.001225 0.0005377 0.000275 0.0000925 0.00004 #betas = 0.075 0.03 0.01 0.005 0.001 # if not using the prior model (above), you must give 'K' values (see Clark and Curran (2007)). # the first one is for all beta values but the last. the second one is for # the last. firstK=20 lastK=100 # also, if you're not using the prior models, you need to specify a word-keyed tagging # dictionary (this interacts with the 'K' values) and a POS-keyed tagging dictionary. wDict=word.dict.min10 posDict=pos.dict.min10 # use automatic POS features? (even if you are automatically POS tagging, # but only using single-best, set this to false). autoPOS=true # if autoPOS=true, you need to specify a POS config file. posConfig=../postagger/pos.config # tagging algorithm. (choose from 'forward' and 'forward-backward') # the former is faster, the latter is better. taggingAlgorithm=forward-backward # self-explanatory (will print certain errors and warnings, if set to 'true') verbose=true ================================================ FILE: ccgbank/original/models/supertagger/stprior.flm ================================================ ## A prior probability model that estimates p(supertag | word, pos) ## with smoothed back-off (a "soft tagging dictionary" if you will). 1 ## lexical category ("supertag") (T) given POS tag (P) and word (W). 
T : 2 P(0) W(0) t_p0w0.count t_p0w0.lm 3 P0,W0 W0 wbdiscount gtmin 15 P0 P0 wbdiscount 0 0 wbdiscount ================================================ FILE: ccgbank/original/models/supertagger/vocab.flm ================================================ ## flm file for determining the vocab 3 ## word (W) unigram W : 0 w.count w.lm 1 0 0 ## supertag (T) unigram T : 0 t.count t.lm 1 0 0 ## pos tag (P) unigram P : 0 p.count p.lm 1 0 0 ================================================ FILE: ccgbank/plugins/MyGenSynScorer.java ================================================ package plugins; import opennlp.ccg.synsem.*; import java.io.*; public class MyGenSynScorer extends GenerativeSyntacticModel { static String modeldir = getModelDir(); static String getModelDir() { String retval = System.getProperty("gensyn.model.dir", "models/parser"); if (!retval.endsWith("/")) retval += "/"; return retval; } public MyGenSynScorer() throws IOException { super(modeldir+"top.flm", modeldir+"leaf.flm", modeldir+"unary.flm", modeldir+"binary.flm"); } } ================================================ FILE: ccgbank/plugins/MyNgramCombo.java ================================================ package plugins; import opennlp.ccg.ngrams.*; import java.io.*; import java.util.*; import java.lang.Thread; public class MyNgramCombo extends LinearNgramScorerCombo { static String bigWordsLM() { String retval = System.getProperty("big.words.lm", "models/realizer/gigaword4.5g.kenlm.bin"); if (new File(retval).exists()) return retval; System.out.println("Reusing trigram model as a stand-in for the big LM"); return null; } static String wordsLM() { return System.getProperty("words.lm", "models/realizer/train.3bo"); } static String wordsSCLM() { return System.getProperty("words.sc.lm", "models/realizer/train-sc.3bo"); } static String stposFLM() { return System.getProperty("stpos.flm", "models/realizer/stp3.flm"); } // map to keep track of trigram model for reuse static Map lmMap = new IdentityHashMap(5); // return big lm, while setting trigram model if using it as a stand-in static NgramScorer getBigLM() throws IOException { String biglm = bigWordsLM(); if (biglm != null) return new KenNgramModel(5, biglm, false, true, true, '_', false); NgramScorer retval = new StandardNgramModel(3, wordsLM()); lmMap.put(Thread.currentThread(), retval); return retval; } // return trigram lm, reusing existing one if present static NgramScorer getWordsLM() throws IOException { NgramScorer retval = lmMap.get(Thread.currentThread()); if (retval != null) { lmMap.remove(Thread.currentThread()); return retval; } return new StandardNgramModel(3, wordsLM()); } public MyNgramCombo() throws IOException { super(new NgramScorer[] { getBigLM(), getWordsLM(), new StandardNgramModel(3, wordsSCLM(), true), new FactoredNgramModelFamily(stposFLM()) }); } } ================================================ FILE: ccgbank/plugins/MyNgramGenSynProduct.java ================================================ package plugins; import opennlp.ccg.ngrams.*; import opennlp.ccg.synsem.*; import java.io.*; public class MyNgramGenSynProduct extends SignScorerProduct { public MyNgramGenSynProduct() throws IOException { super(new SignScorer[] { new MyGenSynScorer(), new MyNgramCombo() }); } } ================================================ FILE: ccgbank/plugins/MyNgramPrecisionBaselineGenInterp.java ================================================ package plugins; import opennlp.ccg.ngrams.*; import opennlp.ccg.synsem.*; import java.io.*; public class 
MyNgramPrecisionBaselineGenInterp extends SignScorerInterpolation implements SelfParaphraseBiaser { static String[] targets = { "e plurubus unum" }; NgramPrecisionModel selfBiaser; public MyNgramPrecisionBaselineGenInterp() throws IOException { super( new SignScorer[] { new NgramPrecisionModel(targets), new MyNgramGenSynProduct() }, new double[] { 100.0, 1.0 } ); selfBiaser = (NgramPrecisionModel) models[0]; } public void setTargets(String[] targets) { selfBiaser.setTargets(targets); } } ================================================ FILE: ccgbank/plugins/MyNgramPrecisionPerceptronInterp.java ================================================ package plugins; import opennlp.ccg.ngrams.*; import opennlp.ccg.synsem.*; import java.io.*; public class MyNgramPrecisionPerceptronInterp extends SignScorerInterpolation implements SelfParaphraseBiaser { static String[] targets = { "lee said brianna had dragged food , toys and other things into the bedroom .", "lee , 33 , said the girl had dragged the food , toys and other things into her mother 's bedroom ." //"charles o. prince , 53 , was named as mr. weill 's successor .", //"mr. weill 's longtime confidant , charles o. prince , 53 , was named as his successor ." }; NgramPrecisionModel selfBiaser; public MyNgramPrecisionPerceptronInterp() throws IOException { super( new SignScorer[] { new NgramPrecisionModel(targets), new MyRealizerPerceptronScorer() }, //new double[] { 100.0, 1.0 } new double[] { 10000.0, 1.0 } ); selfBiaser = (NgramPrecisionModel) models[0]; } public void setTargets(String[] targets) { selfBiaser.setTargets(targets); } } ================================================ FILE: ccgbank/plugins/MyParserPerceptronScorer.java ================================================ package plugins; import opennlp.ccg.synsem.*; import opennlp.ccg.perceptron.*; import opennlp.ccg.hylo.*; import java.io.*; public class MyParserPerceptronScorer extends ReRankingPerceptronScorer { static String modeldir = getModelDir(); static String getModelDir() { String retval = System.getProperty("parser.models.dir", "models/parser"); if (!retval.endsWith("/")) retval += "/"; return retval; } static String modelname = getModelName(); static String getModelName() { return System.getProperty("parser.model.name", "model.gz"); } public MyParserPerceptronScorer() throws IOException { super( //new ComposedFeatureExtractor(new MyGenSynScorer(), new SyntacticFeatureExtractor()), new ComposedFeatureExtractor(new MyGenSynScorer(), new MySynSemFeatureExtractor()), new Model(modeldir + modelname) ); } protected SignScorer getBaseScorer(FeatureExtractor featureExtractor) { return (SignScorer) ((ComposedFeatureExtractor)featureExtractor).featureExtractors[0]; } } ================================================ FILE: ccgbank/plugins/MyRealizerPerceptronScorer.java ================================================ package plugins; import opennlp.ccg.synsem.*; import opennlp.ccg.perceptron.*; import java.io.*; public class MyRealizerPerceptronScorer extends PerceptronScorer { static String getModelDir() { String retval = System.getProperty("realizer.models.dir", "models/realizer"); if (!retval.endsWith("/")) retval += "/"; return retval; } static String getModelName() { return System.getProperty("realizer.model.name", "model.gz"); } static String getAgrMultiplier() { return System.getProperty("realizer.agr.mult", "10"); } static double calcAgrMultiplier() { try { double mult = Double.parseDouble(getAgrMultiplier()); return mult; } catch (NumberFormatException exc) { 
System.out.println("Ignoring unparseable negative agreement feature weight multiplier: " + getAgrMultiplier()); return 1.0; } } static double agrMult = calcAgrMultiplier(); public MyRealizerPerceptronScorer() throws IOException { super( //new ComposedFeatureExtractor(new MyNgramGenSynProduct(), new MySynAgrFeatureExtractor()), new ComposedFeatureExtractor(new MyNgramGenSynProduct(), new MySynSemAgrFeatureExtractor()), new Model(getModelDir() + getModelName(), agreementFilter) ); System.out.println("Loading perceptron model from: " + getModelDir() + getModelName()); System.out.println("Boosting negative agreement and punctuation balancing feature weights by a factor of " + agrMult); } // feature filter for boosting negative agreement feature weights private static Model.FeatureFilter agreementFilter = new Model.FeatureFilter() { /** Returns the modified feature weight for the given feature. */ public double adjustedWeight(String name, double weight) { if (weight >= 0) return weight; if (name.equals("$punct") || name.startsWith("AGR") || isAdjacentPunctFeat(name)) return weight * agrMult; return weight; } }; // returns whether a feature is an adjacent punctuation n-gram feature private static boolean isAdjacentPunctFeat(String name) { String[] keys = name.split(":"); if (keys.length >= 2 && isPunct(keys[0]) && isPunct(keys[1])) return true; if (keys.length >= 4 && keys[0].equals("P") && keys[2].equals("P") && isPunct(keys[1]) && isPunct(keys[3])) return true; return false; } // sentence-boundary markers treated like punctuation private static boolean isPunct(String token) { return token.equals("-") || token.equals("--") || token.equals(",") || token.equals(";") || token.equals(":") || token.equals("!") || token.equals("?") || token.equals(".") || token.equals("...") || token.equals("``") || token.equals("'") || token.equals("''") || token.equals("LRB") || token.equals("RRB"); } } ================================================ FILE: ccgbank/plugins/MySynAgrFeatureExtractor.java ================================================ package plugins; import opennlp.ccg.synsem.*; import opennlp.ccg.perceptron.*; import opennlp.ccg.hylo.*; public class MySynAgrFeatureExtractor extends ComposedFeatureExtractor { public MySynAgrFeatureExtractor() { super(new SyntacticFeatureExtractor(), new EnglishAgreementExtractor()); } } ================================================ FILE: ccgbank/plugins/MySynSemAgrFeatureExtractor.java ================================================ package plugins; import opennlp.ccg.synsem.*; import opennlp.ccg.perceptron.*; import opennlp.ccg.hylo.*; public class MySynSemAgrFeatureExtractor extends ComposedFeatureExtractor { public MySynSemAgrFeatureExtractor() { super(new MySynSemFeatureExtractor(), new EnglishAgreementExtractor()); } } ================================================ FILE: ccgbank/plugins/MySynSemFeatureExtractor.java ================================================ package plugins; import opennlp.ccg.synsem.*; import opennlp.ccg.perceptron.*; import opennlp.ccg.hylo.*; public class MySynSemFeatureExtractor extends ComposedFeatureExtractor { public MySynSemFeatureExtractor() { super(new SyntacticFeatureExtractor(), new LexDepFeatureExtractor()); } } ================================================ FILE: ccgbank/stanford-nlp/classifiers/stanfordner-README ================================================ Place your Stanford NE tagging models here. 
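A note on the plugin classes above: they are thin wrappers that compose OpenCCG scorers and feature extractors, and new combinations can be defined in the same way. The class below is a hypothetical sketch (not part of the release) that interpolates the generative syntactic model with the n-gram combo using the same SignScorerInterpolation pattern as the My*Interp classes above; the class name and the interpolation weights are purely illustrative.

package plugins;

import opennlp.ccg.synsem.*;
import opennlp.ccg.ngrams.*;
import java.io.*;

// Hypothetical plugin: a weighted interpolation of the generative syntactic
// model and the n-gram combo, following the same pattern as the classes above.
public class MyWeightedGenSynNgramInterp extends SignScorerInterpolation {
    public MyWeightedGenSynNgramInterp() throws IOException {
        super(
            new SignScorer[] { new MyGenSynScorer(), new MyNgramCombo() },
            new double[] { 1.0, 2.0 }   // illustrative interpolation weights
        );
    }
}

Like the shipped plugins, such a class would live in the plugins package so the existing build files can compile it alongside the others; how it gets selected at run time depends on the particular build configuration being used.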
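The supertagger configuration files further above describe Clark & Curran-style multitagging: each word keeps every supertag whose probability is within a factor beta of its best tag, the parser retries with progressively smaller beta values when no parse is found, and the firstK/lastK and tagging-dictionary settings restrict the candidate tags when the prior model is not used (see Clark and Curran (2007)). The following is a minimal sketch of the beta cutoff only, not OpenCCG's implementation; all names in it are made up for the example.

import java.util.*;
import java.util.stream.Collectors;

// Minimal illustration of the beta cutoff used by the supertagger configs
// above (not OpenCCG's code).  A word keeps every supertag whose probability
// is within a factor beta of the most probable one; beta=1.0 reduces to
// single-best tagging.  The real tagger additionally restricts candidates via
// the word- and POS-keyed tagging dictionaries when no prior model is used.
public class BetaCutoffSketch {

    static List<String> keepTags(Map<String, Double> tagProbs, double beta) {
        double max = Collections.max(tagProbs.values());
        return tagProbs.entrySet().stream()
            .filter(e -> e.getValue() >= beta * max)
            .sorted(Map.Entry.<String, Double>comparingByValue().reversed())
            .map(Map.Entry::getKey)
            .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        Map<String, Double> probs = new HashMap<>();
        probs.put("np", 0.55); probs.put("n", 0.30);
        probs.put("s/np", 0.10); probs.put("np/n", 0.03);
        // With the first beta in st.config (0.075) three tags survive;
        // the parser falls back to smaller betas if no parse is found.
        System.out.println(keepTags(probs, 0.075));  // [np, n, s/np]
        System.out.println(keepTags(probs, 1.0));    // [np]
    }
}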
================================================ FILE: ccgbank/stanford-nlp/stanfordnlp-README ================================================ Place your Stanford core NLP jar file here (preferably renaming it to 'stanford-core-nlp.jar') and then also place, e.g., NE tagging models under './classifiers'. ================================================ FILE: ccgbank/templates/addFilterLexFeats.xsl ================================================ ================================================ FILE: ccgbank/templates/addStems.xsl ================================================ PERF PROG PASS ================================================ FILE: ccgbank/templates/adjustAppos.xsl ================================================ ================================================ FILE: ccgbank/templates/adjustCandCcats1.xsl ================================================ ================================================ FILE: ccgbank/templates/adjustCats.xsl ================================================ ================================================ FILE: ccgbank/templates/adjustParenthetical.xsl ================================================ ================================================ FILE: ccgbank/templates/adjustReportedSpeech.xsl ================================================ ================================================ FILE: ccgbank/templates/adjustRoles.xsl ================================================ ================================================ FILE: ccgbank/templates/adv-placement.xsl ================================================ ================================================ FILE: ccgbank/templates/agr-macroInsert.xsl ================================================ ================================================ FILE: ccgbank/templates/allotIdLeaf.xsl ================================================ '' ================================================ FILE: ccgbank/templates/allotIdTree.xsl ================================================ ================================================ FILE: ccgbank/templates/allotIndexRel.xsl ================================================ First GenRel GenRel GenRel moodColon ApposRel ApposRel ApposRel ApposRel ApposRel whApposRel emph-intro EmphIntro ElabRel modFeat emph-final EmphFinal interrupt InterruptRel ParenRel colonExp DashInterp EllipsisRel ================================================ FILE: ccgbank/templates/anim-macroInsert.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateAppos-Dash.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateAppos1.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateAppos2.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateAppos3.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateBrackets.xsl ================================================ res arg ================================================ FILE: ccgbank/templates/annotateColons.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateDots.xsl ================================================ 
================================================ FILE: ccgbank/templates/annotateExtraposedAppos.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateNom-AdjConj.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateParentheticals1.xsl ================================================ ADV-PAREN PP-PAREN ================================================ FILE: ccgbank/templates/annotateParentheticals2.xsl ================================================ ================================================ FILE: ccgbank/templates/annotatePlace.xsl ================================================ ================================================ FILE: ccgbank/templates/annotatePrtConjs.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateQuotes.xsl ================================================ '' `` ================================================ FILE: ccgbank/templates/annotateReportedSpeech.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateStrayAppos.xsl ================================================ ================================================ FILE: ccgbank/templates/annotateVPCommas.xsl ================================================ \ / ================================================ FILE: ccgbank/templates/balanceAppos.xsl ================================================ ================================================ FILE: ccgbank/templates/balanceDash-Paren.xsl ================================================ ================================================ FILE: ccgbank/templates/catCheck.xsl ================================================ ================================================ FILE: ccgbank/templates/ccgRules.xsl ================================================ Purpose GenRel ================================================ FILE: ccgbank/templates/changePunct.xsl ================================================ ================================================ FILE: ccgbank/templates/closedCatInsert.xsl ================================================ *NoSem* *NoSem* *NoSem* *NoSem* *NoSem* GenOwn GenRel int excl imp dcl ================================================ FILE: ccgbank/templates/collapseMWUFull.xsl ================================================ ================================================ FILE: ccgbank/templates/collapseMWUPart.xsl ================================================ ================================================ FILE: ccgbank/templates/collapseMWUSharedTask.xsl ================================================ ================================================ FILE: ccgbank/templates/computeCats.xsl ================================================ ( ) $ _ [ ] _ ~ ( ) $ [ ] ================================================ FILE: ccgbank/templates/convTags.xsl ================================================ ================================================ FILE: ccgbank/templates/correctMistakes1.xsl ================================================ ================================================ FILE: ccgbank/templates/correctPPHeads.xsl ================================================ ================================================ FILE: ccgbank/templates/exportToAuto.xsl 
================================================ PARSER=GOLD NUMPARSE=1 Missing head index or length in (<T > ) (<L >) ( ) ( ) $ _ [ ] _ ~ ================================================ FILE: ccgbank/templates/filterLex.xsl ================================================ ================================================ FILE: ccgbank/templates/find-s-back-n.xsl ================================================ ================================================ FILE: ccgbank/templates/genchal11-out.xsl ================================================ ================================================ FILE: ccgbank/templates/inferConjRules.xsl ================================================ ================================================ FILE: ccgbank/templates/insertLF.xsl ================================================ ================================================ FILE: ccgbank/templates/insertOrigPunctsLF.xsl ================================================ *NoSem* ================================================ FILE: ccgbank/templates/insertPTBInfo.xsl ================================================ ================================================ FILE: ccgbank/templates/insertPunctLF-PosMod.xsl ================================================ First GenRel GenRel GenRel moodColon ApposRel ApposRel ApposRel ApposRel ApposRel whApposRel emph-intro EmphIntro ElabRel modFeat emph-final EmphFinal interrupt InterruptRel ParenRel colonExp DashInterp EllipsisRel quote-rel * ================================================ FILE: ccgbank/templates/insertPunctLF.xsl ================================================ First GenRel GenRel GenRel moodColon ApposRel ApposRel ApposRel ApposRel ApposRel whApposRel emph-intro EmphIntro ElabRel modFeat emph-final EmphFinal interrupt InterruptRel ParenRel colonExp DashInterp EllipsisRel * ================================================ FILE: ccgbank/templates/insertQuoteSemClassInfo.xsl ================================================ NAME ================================================ FILE: ccgbank/templates/insertSemFeats.xsl ================================================ ================================================ FILE: ccgbank/templates/introduceMMExtns.xsl ================================================ ================================================ FILE: ccgbank/templates/labelAppos.xsl ================================================ ================================================ FILE: ccgbank/templates/labelConj1.xsl ================================================ ================================================ FILE: ccgbank/templates/labelConj2.xsl ================================================ ================================================ FILE: ccgbank/templates/labelConj3.xsl ================================================ ================================================ FILE: ccgbank/templates/labelPlace1.xsl ================================================ ================================================ FILE: ccgbank/templates/labelPlace2.xsl ================================================ ================================================ FILE: ccgbank/templates/labelPuncts.xsl ================================================ ================================================ FILE: ccgbank/templates/lexExtr.xsl ================================================ ================================================ FILE: ccgbank/templates/macroInsert.xsl ================================================ 
================================================ FILE: ccgbank/templates/macroLexDef.xsl ================================================ ================================================ FILE: ccgbank/templates/markMistakes.xsl ================================================ ================================================ FILE: ccgbank/templates/markUnmatched.xsl ================================================ ================================================ FILE: ccgbank/templates/mergeMorph.xsl ================================================ morph.xml ================================================ FILE: ccgbank/templates/morphExtr.xsl ================================================ ================================================ FILE: ccgbank/templates/normPTBTags.xsl ================================================ X X ================================================ FILE: ccgbank/templates/normPunctPos.xsl ================================================ PUNCT_APPOS np_~1\np_1/s[dcl]_2\(np_3\np_4/np_1) pp[]_~2/np_2 ================================================ FILE: ccgbank/templates/origPunctRules.xsl ================================================ , ================================================ FILE: ccgbank/templates/overtWHLexRels.xsl ================================================ ================================================ FILE: ccgbank/templates/overtWHPronouns.xsl ================================================ ================================================ FILE: ccgbank/templates/phraseExtractor.xsl ================================================ ================================================ FILE: ccgbank/templates/preSentAdj.xsl ================================================ ================================================ FILE: ccgbank/templates/punctLexConjRules.xsl ================================================ args res res args res args args res args res args args args args ================================================ FILE: ccgbank/templates/reinsertPTBInfo.xsl ================================================ ================================================ FILE: ccgbank/templates/repairUnmatched.xsl ================================================ ================================================ FILE: ccgbank/templates/replaceColons.xsl ================================================ ================================================ FILE: ccgbank/templates/rulesExtr.xsl ================================================ ================================================ FILE: ccgbank/templates/sentFinalPuncts.xsl ================================================ ================================================ FILE: ccgbank/templates/trueCaser.xsl ================================================ ================================================ FILE: ccgbank/templates/uncurryBareParse.xsl ================================================ ================================================ FILE: devel/BEN.TODO ================================================ for tuesday 1/9/07: -- test new code in ccg.ply for --prefix and such. first make sure it still works with no --prefix option, producing files with prefixes from the filename and that foo-grammar.xml has the right names in it that it points to. (and no interference from the graphics code.) then test with --prefix and make sure it does what it should (again check grammar.xml), even if --prefix is specified to be blank. -- make sure that the existing test grammars still work. 
(might need to be done after resolving the issue of caps vs. lowercase, since the current grammars already use switch feature vars to lowercase, which will be wrongly interpreted as semantic vars.) check that the testbed still does what it should and gives results it should. -- take a look at the rough grammar; make sure it gets compiled OK by TeX and the results come out. maybe add some more text. -- think about that mod to family, so that a simple family with one entry can appear without extra syntax: family foo: entry of some sort: hybrid logic spec; which is the same as family foo { entry: entry of some sort: hybrid logic spec; } ================================================ FILE: devel/schedule.txt ================================================ CCG editor schedule Step 1 - get a simple editor working; has buttons and/or menu across the top with new/edit/save buttons; for "formatted text" just display text as-is -- add some small buttons at top of editing area for insertion of common CCG structures -- add popup help on these buttons (right click, select "help", or something) that shows in more detail the expected format -- add extra buttons (top or bottom) for particular CCG symbols (e.g. the symbols used to indicate different slash modes); pressing on them inserts the appropriate symbol char(s) and they have help text shown when moving over them More strictly: -- use the PyEdit code from Python Cookbook (not sure about legality; if not, we can just rewrite it, but i don't think it'll be a problem). -- run the editor, examine it in action, figure out how it works by examining the code; in particular, figure out how keystrokes are handled, since i don't see any keystroke code. -- create two separate modules. one just compiles the code, while the other is the GUI. -- one idea for layout: (1) for the GUI, originally follow Wikipedia's model in which the layout view is the "normal" view and editing is performed as a temporary operation with a "preview" that brings up the layout view in a pane above the edit window, so that further editing can be performed, in addition to an "accept" that goes back to the regular layout view only. (2) this one is editor centric. View pops up a "view" [FIX THIS] add a toolbar under the window with textual buttons for Load, New, View; originally display an empty edit screen. New, Load create new top-level windows; make sure that multiple windows work. Load loads an existing file into a new window. View runs the compiler (see below), and displays the error results into a new pane below the edit window. There should be next-error and previous-error commands. This pane can be hidden, but it reappears on the next next/previous-error commands. -- copy all related files from quiche/mangue, get full environment set up locally; also create subversion repository -- add a "compile" button; this runs the CCG2XML compiler and displays any error messages in a separate pane of the window; pane can be viewed or hidden using menu item or button -- then, parse error messages to get the line number; clicking on an error message shows the appropriate line in the text -- improve on display -- run compiler before displaying; using result of parsing, display separate sections corresponding to major sections of text (e.g. 
feature {} declaration, family {} declarations) -- initially, just show actual text corresponding to section -- get working the ability to hide/display a particular section and to edit just that section -- gradually, format into something more intelligent; do this section by section -- we already have some code for formatting CCG categories -- add controls for display of CCG categories -- figure out what to do about macros -- at first, just do whatever works in order to get the editor up and running -- macro definitions themselves should be shown with their raw text; it's not clear we can do much more -- macros can cause arbitrary text to be expanded at a particular position; perhaps in display we show macro-expanded sections in a separate color, and allow the sections to be collapsed back to the original text that led to the expansions, on a macro-by-macro basis (some sort of options menu showing all macros and check marks by them for expansion or not) Step 1: toolbar across top, with buttons "Display", "Edit", "Test" each of these changes the widgets below it, and potentially the menubar above it; each has a second toolbar above a large text widget showing something title bar should show name of file status bar at the bottom, ala emacs shows the current mode (display/edit/test) plus other mode-specific info (e.g. for edit, success or failure of recent compilation) Display: buttons for controlling the display (e.g. presence or absence of features) widget showing formatted display Edit: buttons for editing common CCG structures has a preview button, which compiles the text and then displays a second pane below, showing either the errors from compilation or has a save button, which saves text editor widget Test: FIXME: fill in; should allow for running the CCG interpreter and/or web front end (or something that behaves similarly) implementation: - should allow for multiple top-level windows editing different files; hence do not use global variables for status information, but store inside of a "file" object Display, Edit, Test are subclasses of Frame clicking on Display/Edit/Test toolbar buttons: -- hides the existing frame for this mode and displays the new frame -- should not cause window resize! -- should start in Edit mode -- when switching from edit to display, if text has been modified, we either need to bring up a dialog box asking whether to discard the text, or remember the modified text and display in the status bar that the text has been modified -- in Edit mode, you have Save and Display buttons; Display asks to save Save saves text another row of toolbar buttons below, for inserting common brings up a - get a simple editor working; has buttons and/or menu across the top with new/edit/save buttons; for "formatted text" just display text as-is -- add some small buttons at top of editing area for insertion of common CCG structures -- add popup help on these buttons (right click, select "help", or something) that shows in more detail the expected format -- add extra buttons (top or bottom) for particular CCG symbols (e.g. 
the symbols used to indicate different slash modes); pressing on them inserts the appropriate symbol char(s) and they have help text shown when moving over them

================================================
FILE: docs/build.xml
================================================

================================================
FILE: docs/ccgbank-README
================================================

Introduction
============

This README describes how to use the pre-built English models trained using the CCGbank, as well as how to train these models yourself starting with the CCGbank.

On the realization side, the models make use of all published work on realization ranking (as of March, 2013) -- including discriminatively trained syntactic models, various n-grams, features for syntactic agreement and balanced punctuation, and features for dependency ordering and dependency length minimization -- as well as unpublished improvements to the hypertagger that make use of a two-stage, 'stacked' approach to supertag prediction. For linux, this release also includes support for using a very large 5-gram memory-mapped language model with KenLM.

On the parsing side, a reimplementation of Hockenmaier's generative parse model is used, along with a reimplementation of Curran, Clark & Vadas's supertagger in Java. Note that the supertagger can also be used as a stand-alone tool; see taggers-README for details.

The grammars take advantage of refinements to the CCGbank that make use of Propbank analyses as well as more precise analyses of punctuation. (See references at bottom.)

Since the pre-built English models and CCGbank data for training represent much larger downloads than the OpenCCG core files, they are available as separate downloads (where YYYY-MM-DD represents the date of creation):

  english-models.YYYY-MM-DD.tgz
  ccgbank-data.YYYY-MM-DD.tgz

For linux, the very large KenLM language model, based on 5-grams in the Gigaword-4 corpus, is available as follows:

  gigaword4.5g.kenlm.bin

Using the pre-built English models
==================================

The pre-built statistical models for English allow you to parse novel text in English and to generate English sentences from the (quasi-) logical forms of the resulting parses, thereby producing a variety of grammatical paraphrases. Future releases will contain tools for generating a broader range of paraphrases using disjunctive logical forms, which can handle logical forms with similar, but not identical, structures. It is also possible to use the pre-built models to realize sentences from logical forms constructed programmatically.

Note, however, that in comparison to realization with small, hand-crafted grammars, realization with the broad coverage grammar derived from the CCGbank is much slower (with realization typically taking a few seconds per sentence). For NLG applications, an interesting task for future work would be to automatically shrink the grammar's coverage to what is needed for a specific domain, which should yield considerable improvements in efficiency.

The pre-built statistical models do not require you to have a copy of the CCGbank, and should run across different Java platforms. They do, however, assume the use of the Stanford Core NLP tools for tokenization, named entity tagging and morphological analysis. The Stanford NLP tools are licensed under the full GPL (rather than the LGPL, as with OpenCCG), and thus these tools are only loosely integrated into a chain of command-line tool invocations.
If the GPL is not adequate for your purposes, you'll need to find your own substitute tools. The first step is to make sure you have configured your environment variables and increased your Java memory limit as described in the main README ($OPENCCG_HOME/README). A limit of 2g may be ok, though 4g may work even better; when using the very large KenLM language model, you should use a limit of at least 8g. The next step is to download the current version of english-models.YYYY-MM-DD.tgz, and move it into the ccgbank directory, i.e. $OPENCCG_HOME/ccgbank/, with the undated name english-models.tgz. From this directory, you can unpack the English models archive as shown below: $ mv english-models.YYYY-MM-DD.tgz $OPENCCG_HOME/ccgbank/english-models.tgz $ cd $OPENCCG_HOME/ccgbank $ ccg-build -f build-release.xml extract-models This command uses ant to extract the models in a cross-platform way. As noted in the main README, ccg-build is a simple front end for ant that configures the classpath (and a couple of properties) before invoking ant. The option '-f build-release.xml' simply says to use the build-release.xml build file instead of the default build file. The 'extract-models' target unpacks the archive in a way that makes sure the archive is in the right place before unpacking it. If extracting the models this way yields an error, however, you should use tar (or some other archive extraction tool), eg as follows: $ mv english-models.YYYY-MM-DD.tgz $OPENCCG_HOME/ccgbank/english-models.tgz $ cd $OPENCCG_HOME/ccgbank $ tar xzf english-models.tgz On linux, after downloading the very large language model file, you can install it for use as follows: $ mv gigaword4.5g.kenlm.bin $OPENCCG_HOME/ccgbank/models/realizer/. As noted in the main README, to use the very large LM, you'll also need to set the library load path: $ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$OPENCCG_HOME/lib If the file gigaword4.5g.kenlm.bin is not found, the Treebank-trained trigram model is reused as a stand-in, which will negatively impact realization quality to some extent. In principle it should also be possible to build your own large binary 5-gram model. At present, however, there is no working JNI interface to KenLM for OS X or Windows included. See http://kheafield.com/code/kenlm/ for further information on building large language models and getting KenLM to compile on different platforms, if you would like to try getting the JNI interface working beyond linux. After that, the next step is installing the Stanford Core NLP tools, as described in the section with this name below. At this point you should be ready to try out the models. You can try parsing and realization on a file with a couple of novel sentences containing some named entities that did not exist when the Penn Treebank was created, as well as an adverb that is too infrequent to appear in the training set lexicon, as follows: $ ccg-build -f build-ps.xml test-novel &> logs/log.ps.test.novel & You can follow the progress of the parsing tool chain by looking in the log file. The input file is data/novel/two-sents, and the output files are stored in a newly created directory data/novel/two-sents.dir/. The tool chain does PTB tokenization, truecasing, named entity tagging, POS tagging and stemming in order to create a truecased version of each sentence as well as a morph file that includes all the words in the file. The sentences are then parsed, and the resulting logical forms appear in a testbed file data/novel/two-sents.dir/tb.xml. 
The tool chain takes a while to run (a couple of minutes, perhaps) as it must load several large data models; thus, for more efficient processing, it will make sense to run many more than two sentences in a batch.

Once the parser has run, you can test the realizer on the resulting testbed file as follows:

$ ccg-build -f build-rz.xml test-novel &> logs/log.rz.test.novel &

Again, you can follow the progress by consulting the log file. The realizations should appear in data/novel/two-sents.dir/realize.nbest.

The input file for this test is given by the novel.file property (in build-models.properties). You can override this property on the command line to re-use these build files on your own text, where <file> is the name of your file:

$ ccg-build -Dnovel.file=<file> -f build-ps.xml test-novel &> logs/log.ps.test.novel &
$ ccg-build -Dnovel.file=<file> -f build-rz.xml test-novel &> logs/log.rz.test.novel &

After running these commands, the parses and realizations will be in a new directory <file>.dir/.

Tokenization and Normalization of Novel Input Texts
===================================================

Note that when parsing text, we assume Penn Treebank III (PTB3) tokenization and character escaping conventions (see the PTB3 documentation).*** We assume the following:

-Text is one sentence (or coherent utterance) per line with no intervening blank lines. (This you must do yourself.)

-PTB3 parser-friendly tokenization, e.g.:
   do n't, wo n't, it 's, John 's, etc.
 and not:
   don 't, won 't, it ' s, John ' s, etc.
 (as some tokenization scripts are fond of doing)
 Of course, punctuation should be split off from words, so, e.g.:
   John said ``Hello'' to Mary, who replied ``Hi, John''.
 should be tokenized as:
   John said `` Hello '' to Mary , who replied `` Hi , John '' .

-PTB3 escapes for characters that are meta-symbols in the PTB3, e.g.:
   (, ), { and }
 become:
   -LRB-, -RRB-, -LCB- and -RCB-, respectively.

-"LaTeX"-style double quotation marks:
   `` Hello '' , said John to Mary .
 and not:
   " Hello " , said John to Mary .

-Attributive quotations should be formatted in the "logical" or "British" style and not the "American" style, so, e.g.:
   `` Hello '' , said John to Mary .
 and NOT:
   `` Hello , '' said John to Mary.
 (N.B. this is the only other thing related to sentence segmentation and tokenization that the Stanford PTBTokenizer.java code does not accomplish.)

-Also, Unicode punctuation symbols outside the ASCII range should be converted to their ASCII equivalents, e.g.:
   … should be re-written as ...

-When a sentence-final abbreviation ending with a period/full-stop ends a declarative sentence (i.e., there is therefore no sentence final punctuation mark), the final period/full-stop should not be split off, so that we have, e.g.:
   Many products sold at Smithy's Department Store are not produced in the U.S.
 tokenized as:
   Many products sold at Smithy 's Department Store are not produced in the U.S.
 and not:
   Many products sold at Smithy 's Department Store are not produced in the U.S .

These text transformations will bring novel texts more in line with the texts the parser and realizer were trained on and will improve parsing and realization performance.

Finally, all text passed to the parser is assumed to be encoded as UTF-8 or ASCII (the latter being a subset of UTF-8). Other encodings may cause the parser or realizer to crash in unforeseen ways.

***The Stanford Core NLP tools (which we use, see below) already do most of what this section covers, so if you plan not to use the Stanford tools you will need to find a replacement.
The Stanford PTBTokenizer.java code does all that we mentioned above, except transforming "American"-style quotations to "British/logical"-style quotations, and ensuring that texts are encoded in UTF-8. American-to-logical quotation transformation is performed by the code in:

   $OPENCCG_HOME/ccgbank/bin/american-to-logical-quotes.py

This is invoked by Ant through build-ps.xml. Unicode X-to-UTF-8 conversion can be accomplished with the script:

   $OPENCCG_HOME/ccgbank/bin/toUTF-8.py

(assuming that Python's 'chardet' package has been installed). The first takes text from stdin and pipes text to stdout; the second has various option flags (type 'python toUTF-8.py -h' for more information). Both have only been tested with Python 2.6x.

As for formatting your texts to be one sentence per line, this you must do yourself, as we cannot anticipate what forms of marked up texts will be passed in for parsing.

Building the English models from the CCGbank
============================================

You can train your own English models if you have a licensed copy of the CCGbank. To build the models, you'll need to download and patch Zhang Le's maxent toolkit and install the SRILM language modeling toolkit, assuming their licenses are compatible with your usage. In theory it may be possible to use these tools on different platforms, but in practice it will be much easier to use a linux platform, preferably one with multiple processors and lots of memory.

For installing Zhang Le's maxent toolkit, see the section with this title below. For the SRILM toolkit, follow the installation instructions on the SRILM website, and make sure the SRILM executables are available on your PATH environment variable.

After installing the required toolkits, the next step is to download the current version of ccgbank-data.YYYY-MM-DD.tgz, and move it into the ccgbank directory, i.e. $OPENCCG_HOME/ccgbank/, with the undated name ccgbank-data.tgz. You'll also need to create a symbolic link to your original CCGbank directory from $OPENCCG_HOME/ccgbank/. (Alternatively, you can edit the original.ccgbank.dir property in the build.properties file in the ccgbank directory.) From this directory, you can unpack the data archive as shown below (where <dir> is the path to your original CCGbank directory):

$ mv ccgbank-data.YYYY-MM-DD.tgz $OPENCCG_HOME/ccgbank/ccgbank-data.tgz
$ cd $OPENCCG_HOME/ccgbank
$ ln -s <dir>/ccgbank1.1
$ ccg-build -f build-release.xml extract-data

The ccgbank-data tarball contains a patch file for converting the original CCGbank to the Propbank-enhanced version described in Boxwell and White (2008). The 'extract-data' target in build-release.xml does the patching after unpacking the archive and doing a space-to-newline conversion that enables the patch file to only contain the real differences between these CCGbank versions, given that diff works line-by-line and CCGbank derivations are given one per line. (Note that this conversion also ensures that the CCGbank cannot be recovered from the patch file, thereby avoiding a copyright violation for distributing the CCGbank.)

The ccgbank-data tarball also contains various auxiliary files that make it possible to use the BBN named entity annotations on the Penn Treebank as well as to insert the quotes that were unfortunately removed from the original CCGbank.
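The space-to-newline conversion mentioned above is what lets the line-oriented diff/patch machinery record only the tokens that actually differ between the two CCGbank versions. The sketch below illustrates the basic idea only; the actual scripts are bin/convert-spaces-to-newlines.py and bin/reverse-spaces-to-newlines.py, and the end-of-line marker used here is a made-up placeholder.

import java.io.*;
import java.nio.file.*;

// Illustrative sketch only: write one token per line so that diff/patch work
// at the token level; an end-of-line marker (the hypothetical token "<EOL>")
// records where the original one-derivation-per-line boundaries were, so the
// conversion can be reversed afterwards.
public class SpacesToNewlinesSketch {
    public static void main(String[] args) throws IOException {
        try (BufferedReader r = Files.newBufferedReader(Paths.get(args[0]));
             PrintWriter w = new PrintWriter(Files.newBufferedWriter(Paths.get(args[1])))) {
            String line;
            while ((line = r.readLine()) != null) {
                for (String tok : line.split(" ")) w.println(tok);
                w.println("<EOL>");
            }
        }
    }
}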
Once the Propbank-enhanced version of the CCGbank has been created and the aux files unpacked, the next step is to convert this version of the CCGbank to the one used by OpenCCG, which has a refined treatment of punctuation, refined categories for various function words, collapsed named entities and truecased text. The conversion is done by a series of XSLT transformations, which have the advantage of the declarative use of XPATH matching but unfortunately end up making the conversion quite slow. As such, the best way to do the conversion is to convert the sections in parallel. The bin/convert_all script converts all sections in parallel, as follows: $ cd $OPENCCG_HOME/ccgbank $ bin/convert_all As each section requires up to 1GB of memory to convert, it really only makes sense to convert all sections in parallel on a machine with at least 25GB of memory and multiple processors. The bin/convert_all script is very simple and can easily be edited to run fewer sections in parallel at a time. Running all sections in parallel should take less than an hour. Once the corpus conversion is complete, the next step is to extract grammars for the training, development and test sections of the CCGbank. (For testing, normally only the morph file is used from the dev and test sets, together with the training set grammar.) The grammar extraction process also creates testbed files for these sections, which contain logical forms derived by following the gold-standard derivations. Grammar extraction is done using the 'extract-various' target in the main build file (build.xml, the default): $ cd $OPENCCG_HOME/ccgbank $ ccg-build extract-various &> logs/log.extract.various & Extracting the grammars and creating the logical forms may take up to an hour and a half or so. Following corpus conversion and grammar extraction, the next step is to train the models. Most of the models can be trained using the 'all' target in build-models.xml: $ cd $OPENCCG_HOME/ccgbank $ ccg-build -f build-models.xml all &> logs/log.models.all & This target trains the supertagger and hypertagger as well as the generative parsing and realization models. As these models require several maxent training runs, this step will take a while, for example up to 24 hours depending on the speed of the machine. (In principle these steps could be partly parallelized, but doing so would be nontrivial given the existing dependencies between steps.) The final step is to train the realizer's averaged perceptron model. (It is also possible to train an averaged perceptron model for the parser, but it has not been found to yield significant gains over the generative model, most likely due to the size of the discrimination space.) Note that you should first install the very large language model as described in the section on using the pre-built models, if possible. Training the perceptron model requires generating training events for each training section, which is quite time consuming. Event generation is easily done in parallel, so the perceptron training sequence has been broken up to allow this step to be done separately: $ cd $OPENCCG_HOME/ccgbank $ ccg-build -f build-rz.xml event-gen-prep &> logs/log.rz.event.gen.prep & $ bin/gen_realizer_events_a & $ bin/gen_realizer_events_b & $ bin/gen_realizer_events_c & $ bin/gen_realizer_events_d & $ bin/gen_realizer_events_e & Preparing for event generation is fairly quick; once that's done, the bin/gen_realizer_events* scripts can all be run at the same time. 
These scripts are set up to run five sections at a time; they can be easily edited to run more or fewer in parallel. Event generation may take up to 12 hours, even with running 5 sections in parallel. Once event generation is complete, the actual perceptron training can be run: $ ccg-build -f build-rz.xml train-perceptron &> logs/log.rz.train.perceptron & Perceptron training is apt to take 8 hours. If your machine has 16g of memory available, training can be run in less than half the time by commenting in the "-in_mem" options in build-rz.xml, and editing bin/ccg-env to use a memory limit of 16g. With the models all built, the parser and realizer can be tested on the CCGbank development section: $ cd $OPENCCG_HOME/ccgbank $ ccg-build -f build-ps.xml test &> logs/log.ps.test & $ ccg-build -f build-rz.xml test-perceptron &> logs/log.rz.test.perceptron & If the models have been built correctly, the realization exact matches should be over 46% (using the very large language model), and the parsing unlabeled dependencies f-score should be about 0.89. (Note that with named entities collapsed and some function words not represented in the logical forms, these f-scores are not comparable to the dependency f-scores reported for other CCG parsers on the CCGbank.) Naturally, the models can also be run on novel text, as described in the section on using pre-built models. Viewing CCGbank derivations =========================== Derivations in the original or converted CCGbank can be viewed as trees using ccg-draw-tree, a front-end to the tree-drawing routine in NLTK's Tree class. This tool reads in .auto files, so to view converted CCGbank derivations, you must first export them to .auto format, as shown below: $ cd $OPENCCG_HOME/ccgbank $ ccg-build export-to-auto $ ccg-draw-tree convert/00/wsj_0001.auto wsj_0001.1 The export-to-auto target creates auto files for the current 'sect' and 'file' properties, which can be set in build.properties or provided on the command line (e.g. by -Dfile=*). This example assumes that the first file in Section 00 is included in the files to export in .auto format. Displaying the derivation for the first sentence in the converted CCGbank shows how balanced appositive commas are handled with "Pierre_Vinken , 61 years old ," (as well as a collapsed named entity), and also shows how "as a nonexecutive director" has been converted into an adjunct rather than an argument of "join". Installing the Stanford Core NLP tools ====================================== For novel text, it is useful to perform named entity (NE) recognition and labelling, as well as morphological processing (lemmatization). For this we use the Stanford Core NLP tools, available at: http://www-nlp.stanford.edu/software/corenlp.shtml To install them, simply download the archive from the above link, unpack it, and place the Core NLP JAR file in the 'ccgbank/stanford-nlp' directory, re-naming it 'stanford-core-nlp.jar'. If you wish to use another location or naming convention, you will need to update the properties file (ccgbank/build-ps.properties). 
Installing the Stanford Core NLP tools
======================================

For novel text, it is useful to perform named entity (NE) recognition and labelling, as well as morphological processing (lemmatization). For this we use the Stanford Core NLP tools, available at:

http://www-nlp.stanford.edu/software/corenlp.shtml

To install them, simply download the archive from the above link, unpack it, and place the Core NLP JAR file in the 'ccgbank/stanford-nlp' directory, re-naming it 'stanford-core-nlp.jar'. If you wish to use another location or naming convention, you will need to update the properties file (ccgbank/build-ps.properties).

If you're using a bash shell, you might do the following:

$ mkdir $OPENCCG_HOME/ccgbank/tmp
$ cd $OPENCCG_HOME/ccgbank/tmp
$ wget http://www-nlp.stanford.edu/software/stanford-corenlp-20xx-xx-xx.tgz
$ tar xvfz stanford-corenlp-20xx-xx-xx.tgz
$ cd stanford-corenlp-20xx-xx-xx
$ cp stanford-corenlp-20xx-xx-xx.jar $OPENCCG_HOME/ccgbank/stanford-nlp/stanford-core-nlp.jar

Note you will need to fill in the date, 20xx-xx-xx, with whatever the date of the most recent release is (see the particular file you get from the Stanford website).

To get the NE tagging models out of this download (the Stanford NE tagger combines predictions from multiple models), you will need to locate the 'stanford-corenlp-20xx-xx-xx-models.jar' file (in the same directory as the other JAR files) and un-jar it, like so [assuming you're still in the same directory as above]:

$ jar xf stanford-corenlp-20xx-xx-xx-models.jar
$ cp edu/stanford/nlp/models/ner/* $OPENCCG_HOME/ccgbank/stanford-nlp/classifiers/.
$ rm -rf edu

You should check that the filenames for the NE tagging models copied to $OPENCCG_HOME/ccgbank/stanford-nlp/classifiers match those listed in $OPENCCG_HOME/ccgbank/build-ps.properties as ner.model1, ner.model2 and ner.model3, updating this properties file if necessary. This will set up the only external dependency we have for parsing novel text using 'ccgbank/build-ps.xml'. If you wish to recompile (perhaps after modifying) the application that interfaces with the Stanford NE tagger's API, see the ant build file $OPENCCG_HOME/ccgbank/bin/ner/build-ner-api.xml and its corresponding properties file (in the same location).

Installing Zhang Le's maxent toolkit
====================================

To train the models, you'll need to have Zhang Le's maxent toolkit working with a small patch. Do the following to install the patched version of Zhang Le's toolkit in a directory of your choice:

$ cd
$ wget http://homepages.inf.ed.ac.uk/lzhang10/software/maxent/maxent-20061005.tar.bz2

Unpack the archive and patch the maxent.cpp file (the unpatched version doesn't cover the case where ':' can be part of the feature symbol itself, rather than just being a delimiter that separates string representations of features (contextual predicates, actually) from their real-valued activations):

$ bunzip2 maxent-20061005.tar.bz2
$ tar xf maxent-20061005.tar
$ cd maxent-20061005/src
$ patch maxent.cpp $OPENCCG_HOME/docs/maxent.cpp.patch

Now compile the maxent code:

$ cd
$ cd maxent-20061005
$ make clean all unittest

Test to make sure it (more or less) works (only 7 of the 8 tests seem to pass, but the training seems to work):

$ cd test
$ ./runall.py

Finally, add the 'maxent' binary (under 'maxent-20061005/src/opt') to your PATH environment variable.

References
==========

Michael White and Rajakrishnan Rajkumar. 2012. Minimal Dependency Length in Realization Ranking. In Proc. EMNLP-12.
http://aclweb.org/anthology-new/D/D12/D12-1023.bib

Rajakrishnan Rajkumar and Michael White. 2010. Designing Agreement Features for Realization Ranking. In Proc. of COLING-10.
http://aclweb.org/anthology-new/C/C10/C10-2119.bib

Michael White and Rajakrishnan Rajkumar. 2009. Perceptron Reranking for CCG Realization. In Proc. of the Conference on Empirical Methods in Natural Language Processing (EMNLP 2009).
http://aclweb.org/anthology-new/D/D09/D09-1043.bib

Rajakrishnan Rajkumar, Michael White and Dominic Espinosa. 2009. Exploiting Named Entity Classes in CCG Surface Realization. In Proc.
of Human Language Technologies: The 2009 Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL HLT 2009). http://aclweb.org/anthology-new/N/N09/N09-2041.bib Michael White and Rajakrishnan Rajkumar. 2008. A More Precise Analysis of Punctuation for Broad-Coverage Surface Realization with CCG. In Proc. of the Workshop on Grammar Engineering Across Frameworks (GEAF08). http://aclweb.org/anthology-new/W/W08/W08-1704.bib Dominic Espinosa, Michael White and Dennis Mehay. 2008. Hypertagging: Supertagging for Surface Realization with CCG. In Proceedings of the 46th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies (ACL-08: HLT). http://aclweb.org/anthology-new/P/P08/P08-1022.bib Stephen A. Boxwell and Michael White. 2008. Projecting Propbank Roles onto the CCGbank. In Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC-08). http://www.lrec-conf.org/proceedings/lrec2008/pdf/789_paper.pdf James R. Curran, Stephen Clark and David Vadas. 2006. Multi-Tagging for Lexicalized-Grammar Parsing. In Proc. ACL-06. http://aclweb.org/anthology-new/P/P06/P06-1088.bib Julia Hockenmaier and Mark Steedman. 2002. Generative Models for Statistical Parsing with Combinatory Categorial Grammar. In Proc. ACL-02. http://aclweb.org/anthology-new/P/P02/P02-1043.bib ================================================ FILE: docs/guide/build.xml ================================================ ================================================ FILE: docs/guide/cgloss4e.sty ================================================ % -*- LaTeX -*- % Following borrows from Covington's style files inspired by Midnight by M. % de Groot, adapted to be used with gb4e.sty: examples beginning with \ex can % contain glosses directly. Default is % Linguistic Inquiry style with all lines in \rm; to change a line (eg. to % \it for a particular journal, change the appropriate line: e.g., % \let\eachwordone=\rm in a copy of this file. Note that it will NOT work % to put \it before the line as the words are parsed separately. % Use \singlegloss to force single-spaced glosses even in double-space % environments. Works also in footnotes (^M as delimiter replaced by % \\)---hpk % %%% %%% Sentences with word-by-word glosses %%% % See covingtn.tex for full documentation. Some examples: % % Displayed sentence with gloss and translation: % % \gll Dit is een Nederlands voorbeeld.\\ % This is a Dutch example.\\ % \glt `This is an example in Dutch.' % % Same, using bracketing where words do not correspond one-to-one: % % \gll Dit is een voorbeeldje in het Nederlands.\\ % This is a {little example} in {} Dutch.\\ % \glt `This is a little example in Dutch.' % % If you want to align 3 lines rather than two, use \glll instead of \gll. % % Layout is critical between \gll (or \glll) and \glt (or \gln). % % Thanks to Marcel R. van der Goot for permission to reproduce code. \let\@gsingle=1 \def\singlegloss{\let\@gsingle=1} \def\nosinglegloss{\let\@gsingle=0} \@ifundefined{new@fontshape}% {\def\@selfnt{\ifx\@currsize\normalsize\@normalsize\else\@currsize\fi}} {\def\@selfnt{\selectfont}} \def\gll% % Introduces 2-line text-and-gloss. {\begin{flushleft} \ifx\@gsingle1% conditionally force single spacing (hpk/MC) \vskip\baselineskip\def\baselinestretch{1}% \@selfnt\vskip-\baselineskip\fi% \bgroup \twosent } \def\glll% % Introduces 3-line text-and-gloss. 
{\begin{flushleft} \ifx\@gsingle1% conditionally force single spacing (hpk/MC) \vskip\baselineskip\def\baselinestretch{1}% \@selfnt\vskip-\baselineskip\fi% \bgroup \threesent } \def\glt{\vskip.17\baselineskip} % Introduces a translation \let\trans\glt \def\glend{} % obsolete % Ends the gloss environment. % The following TeX code is adapted, with permission, from: % gloss.tex: Macros for vertically aligning words in consecutive sentences. % Version: 1.0 release: 26 November 1990 % Copyright (c) 1991 Marcel R. van der Goot (marcel@cs.caltech.edu). % Original Midnight/gloss.tex and Midnight/gloss.doc are available from % csvax.cs.caltech.edu [131.215.131.131] in pub/tex % and many other anonymous ftp archives. \newbox\lineone% boxes with words from first line \newbox\linetwo% \newbox\linethree% \newbox\wordone% a word from the first line (hbox) \newbox\wordtwo% \newbox\wordthree% \newbox\gline% the constructed double line (hbox) \newskip\glossglue% extra glue between glossed pairs or triples \glossglue = 0pt plus 2pt minus 1pt % allow stretch/shrink between words %\glossglue = 5pt plus 2pt minus 1pt % allow stretch/shrink between words \newif\ifnotdone \@ifundefined{eachwordone}{\let\eachwordone=\rm}{\relax} \@ifundefined{eachwordtwo}{\let\eachwordtwo=\rm}{\relax} \@ifundefined{eachwordthree}{\let\eachwordthree=\rm}{\relax} \def\lastword#1#2#3% #1 = \each, #2 = line box, #3 = word box {\setbox#2=\vbox{\unvbox#2% \global\setbox#3=\lastbox% }% \ifvoid#3\global\setbox#3=\hbox{#1\strut{} }\fi % extra space following \strut in case #1 needs a space } \def\testdone {\ifdim\ht\lineone=0pt \ifdim\ht\linetwo=0pt \notdonefalse % tricky space after pt \else\notdonetrue \fi \else\notdonetrue \fi } \gdef\getwords(#1,#2)#3 #4\\% #1=linebox, #2=\each, #3=1st word, #4=remainder {\setbox#1=\vbox{\hbox{#2\strut#3 }% adds space \unvbox#1% }% \def\more{#4}% \ifx\more\empty\let\more=\donewords \else\let\more=\getwords \fi \more(#1,#2)#4\\% } \gdef\donewords(#1,#2)\\{}% \gdef\twosent#1\\ #2\\{% #1 = first line, #2 = second line \getwords(\lineone,\eachwordone)#1 \\% \getwords(\linetwo,\eachwordtwo)#2 \\% \loop\lastword{\eachwordone}{\lineone}{\wordone}% \lastword{\eachwordtwo}{\linetwo}{\wordtwo}% \global\setbox\gline=\hbox{\unhbox\gline \hskip\glossglue \vtop{\box\wordone % vtop was vbox \nointerlineskip \box\wordtwo }% }% \testdone \ifnotdone \repeat \egroup % matches \bgroup in \gloss \gl@stop} \gdef\threesent#1\\ #2\\ #3\\{% #1 = first line, #2 = second line, #3 = third \getwords(\lineone,\eachwordone)#1 \\% \getwords(\linetwo,\eachwordtwo)#2 \\% \getwords(\linethree,\eachwordthree)#3 \\% \loop\lastword{\eachwordone}{\lineone}{\wordone}% \lastword{\eachwordtwo}{\linetwo}{\wordtwo}% \lastword{\eachwordthree}{\linethree}{\wordthree}% \global\setbox\gline=\hbox{\unhbox\gline \hskip\glossglue \vtop{\box\wordone % vtop was vbox \nointerlineskip \box\wordtwo \nointerlineskip \box\wordthree }% }% \testdone \ifnotdone \repeat \egroup % matches \bgroup in \gloss \gl@stop} \def\gl@stop{{\hskip -\glossglue}\unhbox\gline\end{flushleft}} \endinput ================================================ FILE: docs/guide/gb4e.sty ================================================ % -*- LaTeX -*- \def\gbVersion{4e} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Version 4export (= v. 4 minus the compatibility code) % Based on hpk's gb.sty, revised for GM syllabus by ct, % and incorporating macros adapted from J.Frampton, M. de Groot en M. % Covington. Full documentation soon to come in gb4doc.tex. 
Bug-reports % and suggestions for improvements, other used features, please! % % Notes: % % Various styles for X-bar levels; can be changed, but note that {picture} % environements (e.g. trees) will then come out wrong and have to be % fixed % % This file allows _ and ^ to be used in ordinary text, hence must be % loaded AFTER any file that uses them in their TeX meaning. Hence % cgloss(n).sty is loaded early in this file. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%% % Format of examples: % %%%%%%%%%%%%%%%%%%%%%%%% % \begin{exe} or \exbegin % (arab.) % \begin{xlist} or \xlist % (1st embedding, alph.) % \begin{xlisti} or \xlisti % (2st embedding, rom.) % \end{xlisti} or \endxlisti % % \end{xlist} or \endxlist % % \end{exe} or \exend % % Other sublist-styles: xlistA (Alph.), xlistI (Rom.), xlistn (arab) % % \ex (produces Number) % \ex (numbered example) % \ex[jdgmt]{sentence} (numbered example with judgement) % % \exi{ident} (produces identifier) % \exi{ident} (example numbered with identifier) % \exi{ident}[jdgmt]{sentence} (dito with judgement) % (\exr, \exp and \sn are defined in terms of \exi) % % \exr{label} (produces cross-referenced Num.) % \exr{label} (cross-referenced example) % \exr{label}[jdgmt]{sentence} (cross-referenced example with judgement) % % \exp{label} (same as % \exp{label} \exr but % \exp{label}[jdgmt]{sentence} with prime) % % \sn (unnumbered example) % \sn[jdgmt]{sentence} (unnumbered example with judgement) % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \@ifundefined{new@fontshape}{\def\reset@font{}\let\mathrm\rm}{} \let\prmbrs=0 \def\primebars{\let\prmbrs=1} \def\obar#1{\ifmmode#1^{0}\else#1$^{0}$\fi} %% FIX \def\mbar#1{\ifmmode#1^{\mathrm{max}}\else#1$^{\mathrm{max}}$\fi} \def\ibar#1{\ifx\prmbrs0% \ifmmode\overline{\mathrm{#1}}\else$\overline{\mbox{#1}}$\fi% \else\ifmmode#1^{'}\else#1$^{'}$\fi\fi} \def\iibar#1{\ifx\prmbrs0% \ifmmode\overline{\overline{\mathrm{#1}}}% \else$\overline{\overline{\mbox{#1}}}$\fi% \else #1P\fi} \def\th{\ifmmode\theta\else$\theta$\fi} \def\al{\ifmmode\alpha\else$\alpha$\fi} \def\be{\ifmmode\beta\else$\beta$\fi} \def\ga{\ifmmode\gamma\else$\gamma$\fi} \def\de{\ifmmode\delta\else$\delta$\fi} \def\spec#1{[Spec,#1]} %Def. of "Specifier of #1" \def\ct#1{{\em #1\/}} %Citation of linguistic material with alternative style: %\def\ct#1{`#1'} \def\tx{\bf} %Introduction of technical terms with alternative style: %\def\tx{\em} \input{cgloss\gbVersion.sty} %%% NEWSTUFF: %\newcommand{\indexgroupmark}[1]{\item{\bf #1}} % ?? -CT % this allows _ to be used in horizontal mode (from J.Frampton): % \catcode`_=\active % \def_#1{\ifmmode\mit{\sb{#1}}\else${}\sb{#1}$\fi} % \catcode`^=\active % \def^#1{\ifmmode\mit{\sp{#1}}\else${}\sp{#1}$\fi} % \def\lb#1{\@ifnextchar [{\@glarph{#1}}{\@bl{#1}}} %\def\@glarph#1[#2]{\ifmmode{[}\sb{{\mathrm{#1}}\sb{#2}}\else% % ${[}\sb{{\mathrm{#1}}\sb{#2}}$\fi} % \def\@bl#1{\ifmmode{[}\sb{\mathrm{#1}}\;\else${[}\sb{\mathrm{#1}}\;$\fi} % \def\rb#1{\@ifnextchar [{\@grarph{#1}}{\@br{#1}}} %\def\@grarph#1[#2]{\ifmmode{]}\sb{{\mathrm{#1}}\sb{#2}}\else% % ${]}\sb{{\mathrm{#1}}\sb{#2}}$\fi} % \def\@br#1{\ifmmode{]}\sb{\mathrm{#1}}\;\else${]}\sb{\mathrm{#1}}\;$\fi} %%% END_NEWSTUFF. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Macros for examples, roughly following Linguistic Inquiry style. % % From here on best not to tamper, else all the examples and cross- % % references will come out scrambled! 
(see also note below) - CT % % Completely rewritten for more robustness and flexibility. (hpk) % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \def\qlist{\begin{list}{\Alph{xnum}.}{\usecounter{xnum}% \setlength{\rightmargin}{\leftmargin}}} \def\endqlist{\end{list}} \newif\if@noftnote\@noftnotetrue \newif\if@xrec\@xrecfalse \@definecounter{fnx} %%%% adapted from latex.tex to get examples in footnotes right \long\def\@footnotetext#1{% \@noftnotefalse\setcounter{fnx}{0}% \insert\footins{\reset@font\footnotesize \interlinepenalty\interfootnotelinepenalty \splittopskip\footnotesep \splitmaxdepth \dp\strutbox \floatingpenalty \@MM \hsize\columnwidth \@parboxrestore \edef\@currentlabel{\csname p@footnote\endcsname\@thefnmark}\@makefntext {\rule{\z@}{\footnotesep}\ignorespaces #1\strut}}\@noftnotetrue} \newcount\@xnumdepth \@xnumdepth = 0 \@definecounter{xnumi} \@definecounter{xnumii} \@definecounter{xnumiii} \@definecounter{xnumiv} \@definecounter{exx} \setcounter{exx}{0} \def\thexnumi{\@xsi{xnumi}} \def\thexnumii{\@xsii{xnumii}} \def\thexnumiii{\@xsiii{xnumiii}} \def\thexnumiv{\@xsiv{xnumiv}} \def\p@xnumii{\thexnumi} \def\p@xnumiii{\thexnumi\thexnumii-} \def\p@xnumiv{\thexnumi\thexnumii-\thexnumiii-} \def\xs@default#1{\csname @@xs#1\endcsname} \def\@@xsi{\let\@xsi\arabic} \def\@@xsii{\let\@xsii\alph} \def\@@xsiii{\let\@xsiii\roman} \def\@@xsiv{\let\@xsi\arabic} \@definecounter{rxnumi} \@definecounter{rxnumii} \@definecounter{rxnumiii} \@definecounter{rxnumiv} \def\save@counters{% \setcounter{rxnumi}{\value{xnumi}}% \setcounter{rxnumii}{\value{xnumii}}% \setcounter{rxnumiii}{\value{xnumiii}}% \setcounter{rxnumiv}{\value{xnumiv}}}% \def\reset@counters{% \setcounter{xnumi}{\value{rxnumi}}% \setcounter{xnumii}{\value{rxnumii}}% \setcounter{xnumiii}{\value{rxnumiii}}% \setcounter{xnumiv}{\value{rxnumiv}}}% \def\exewidth#1{\def\@exwidth{#1}} \exewidth{(234)} \def\exe{\@ifnextchar [{\@exe}{\@exe[\@exwidth]}} \def\@exe[#1]{\ifnum \@xnumdepth >0% \if@xrec\@exrecwarn\fi% \if@noftnote\@exrecwarn\fi% \@xnumdepth0\@listdepth0\@xrectrue% \save@counters% \fi% \advance\@xnumdepth \@ne \@@xsi% \begin{list}{(\thexnumi)}% {\usecounter{xnumi}\@subex{#1}{1em}% \if@noftnote% \setcounter{xnumi}{\value{exx}}% \else% \setcounter{xnumi}{\value{fnx}}% \fi}} \def\endexe{\if@noftnote\setcounter{exx}{\value{xnumi}}% \else\setcounter{fnx}{\value{xnumi}}% \reset@counters\@xrecfalse\fi\end{list}} \def\@exrecwarn{\typeout{*** Recursion on "exe"---your example numbering will probably be screwed up!}} \def\xlist{\@ifnextchar [{\@xlist{}}{\@xlist{}[iv.]}} \def\xlista{\@ifnextchar [{\@xlist{\alph}}{\@xlist{\alph}[m.]}} \def\xlisti{\@ifnextchar [{\@xlist{\roman}}{\@xlist{\roman}[iv.]}} \def\xlistn{\@ifnextchar [{\@xlist{\arabic}}{\@xlist{\arabic}[9.]}} \def\xlistA{\@ifnextchar [{\@xlist{\Alph}}{\@xlist{\Alph}[M.]}} \def\xlistI{\@ifnextchar [{\@xlist{\Roman}}{\@xlist{\Roman}[IV.]}} \def\endxlist{\end{list}} \def\endxlista{\end{list}} \def\endxlistn{\end{list}} \def\endxlistA{\end{list}} \def\endxlistI{\end{list}} \def\endxlisti{\end{list}} %%% a generic sublist-styler \def\@xlist#1[#2]{\ifnum \@xnumdepth >3 \@toodeep\else% \advance\@xnumdepth \@ne% \edef\@xnumctr{xnum\romannumeral\the\@xnumdepth}% \def\@bla{#1} \ifx\@bla\empty\xs@default{\romannumeral\the\@xnumdepth}\else% \expandafter\let\csname @xs\romannumeral\the\@xnumdepth\endcsname#1\fi \begin{list}{\csname the\@xnumctr\endcsname.}% {\usecounter{\@xnumctr}\@subex{#2}{1.5ex}}\fi} \def\@subex#1#2{\settowidth{\labelwidth}{#1}\itemindent\z@\labelsep#2% 
\ifnum\the\@xnumdepth=1\topsep 7\p@ plus2\p@ minus3\p@\else% \topsep 2\p@ plus2\p@\fi\parsep 2\p@ plus\p@ minus\p@% \itemsep \parsep\leftmargin\labelwidth\advance\leftmargin#2\relax} %%% the example-items \def\ex{\@ifnextchar [{\@ex}{\item}} \def\@ex[#1]#2{\item\@exj[#1]{#2}} \def\@exj[#1]#2{\@exjbg{#1} #2 \end{list}} \def\exi#1{\item[#1]\@ifnextchar [{\@exj}{}} \def\judgewidth#1{\def\@jwidth{#1}} \judgewidth{??} \def\@exjbg#1{\begin{list}{#1}{\@subex{\@jwidth}{.5ex}}\item} \def\exr#1{\exi{{(\ref{#1})}}} \def\exp#1{\exi{{(\ref{#1}$'$)}}} \def\sn{\exi{}} \def\bu{\item[$\bullet$]} %%%%%%% \lcomment for breaks in (example-)lists (leaves all counters %%%%%%% as they are) (hpk) \newlength{\lcommentsep} \lcommentsep = 1ex \long\def\lcomment#1% {\vspace{\lcommentsep} \item[]\hspace*{-\leftmargin}% \@tempskipa=\linewidth% \addtolength{\@tempskipa}{\rightmargin}% \addtolength{\@tempskipa}{\leftmargin}% \parbox{\@tempskipa}{#1}% \vspace{\lcommentsep}% } %%%%%% control the alignment of exampleno. and (picture-)example %%%%%% (by Lex Holt ). \def\attop#1{\leavevmode\vtop{\strut\vskip-\baselineskip\vbox{#1}}} \def\atcenter#1{$\vcenter{#1}$} %%%%%% %-------------------Move Arrows (from J.Frampton): \def\leaderfill{\leaders\hrule\hfil} \def\pointerup{\hbox to 0pt{\hss \vbox{\offinterlineskip\vskip-1pt\hbox{\elevenex\char'170}\null}\hss}} \def\pointerdown{\hbox to 0pt{\hss \vtop{\offinterlineskip\null\hbox{\elevenex\char'171}\vskip-1pt}\hss}} \let\pu=\pointerup \let\pd=\pointerdown \let\lf=\leaderfill \def\spacer{\hskip4.5pt} \def\fillright#1{\hfil#1\leaderfill} \def\fillleft#1{\leaderfill#1\hfil} % Changed spelling to \centr, else conflicts with LaTeX \center{} -CT \def\centr#1{\leaderfill#1\leaderfill} \def\link#1{\multispan#1\leaderfill} \def\arrowalign#1{\vtop{\baselineskip=0pt \lineskiplimit=0pt \lineskip=2pt \halign{&##\cr#1}}} %\font\elevenex=cmex10 scaled\magstephalf % just for the arrow! %PS: this may not work on some installations, not sure why. CT %%PPS: (e.g., PCTeX, but it works find works fine with EmTeX) %----------------END Move Arrows \def\pijl{$\rightarrow$\ } % Special accents for Vata & Gbadi; Navajo coming soon, I hope...: %\def\bb#1{$\mathrm{\overline{#1}}$} Following looks better: \def\bb#1{\ifmmode\overline{\mathrm{#1}}\else$\bar{\mathrm{#1}}$\fi} \def\boven#1#2{\raisebox{-0.2pt}{$\stackrel{#1}{\mathrm{#2}}$}} \def\bovenop#1#2{\raisebox{-0.06ex}[0ex][0ex]{$\stackrel{#1}{\mathrm{#2}}$}} \def\vl{\rule{0.05em}{0.30em}} \def\|#1{\ifmmode\vert#1\else\bovenop{\vl}{#1}\fi} ================================================ FILE: docs/guide/guide.tex ================================================ %% %% nb: use pdflatex to create pdf file with hyperlinks %% %% ===================================================================== %% DOCUMENT DATA %% ===================================================================== \documentclass[11pt]{article} \title{Specifying Grammars for OpenCCG: \\ A Rough Guide} \author{Cem Boz\c{s}ahin \and Geert-Jan M. Kruijff \and Michael White} %% ===================================================================== %% PACKAGES %% ===================================================================== \usepackage{openccg} % for hlds/ccg \usepackage{graphicx} % for figs \usepackage{gb4e} % for examples \usepackage[ colorlinks=true, linkcolor=blue, citecolor=blue, urlcolor=blue, pdfstartview=FitH, pdftitle={Specifying Grammars for OpenCCG: A Rough Guide}, pdfauthor={Cem Bozsahin, Geert-Jan M. 
Kruijff and Michael White} ]{hyperref} %% ===================================================================== %% NEW COMMANDS %% ===================================================================== %\newcommand{\occg}{\textsf{OpenCCG}} \newcommand{\occg}{OpenCCG} \newcommand{\tccg}{\textsf{tccg}} %% ===================================================================== %% DOCUMENT BODY %% ===================================================================== \begin{document} \thispagestyle{empty} \maketitle \tableofcontents \listoftables \listoffigures \newpage \section{OpenCCG} \occg\ is an open source natural language processing library written in Java, which provides parsing and realization services based on Mark Steedman's Combinatory Categorial Grammar (CCG) formalism \cite{Steedman:SynProc}. The library makes use of the multi-modal extensions to CCG devised by Jason Baldridge in his dissertation \cite{Baldridge:2002} and in a joint EACL-03 paper with Geert-Jan Kruijff \cite{Baldridge/Kruijff:2003}. For a concise introduction to CCG with these extensions, see \cite{Steedman/Baldridge:2003}. \occg\ grew out of the Grok system developed by Gann Bierner and Jason Baldridge, and has been refined and extended by Michael White, with further contributions from Cem Boz\c{s}ahin, G\"une\c{s} Erkan, Geert-Jan Kruijff, David Reitter and Alexandros Triantafyllidis. Recent development efforts, managed by Michael White, have focused on making the realizer \cite{White/Baldridge:2003,White-RLAC:2004,White-INLG:2004,White-ACLSoft:2005} practical to use in dialogue systems, and improving (somewhat) the grammar development process. You can download and install \occg\ from its website, located at \url{http://openccg.sourceforge.net}. Once you've unpacked the archive, have a look at the \texttt{README} file for installation instructions. \section{About this (rough) guide} This guide is intended to provide a brief introduction to writing grammars for \occg. The system is implemented in Java, but you do not need to know Java. \occg\ provides its own formats for describing grammars, including the combinatory rules, the lexicon (i.e.\ the lexicalized grammar), feature structures, LF, morphology etc. Two formats are available; one is based on XML and one is a higher-level format that looks similar to C or Java. The syntax of the XML-based format is very simple, but at the same time it can be verbose and hard-to-read. The other format, the so-called ``CCG format'' (\texttt{.ccg}), was specifically designed to be written by hand, and has a richer and more concise syntax. It is a ``front-end'' format in that it is converted internally to XML before it is actually used by \occg, using the \texttt{ccg2xml} tool. As a result, the two formats share many conceptual similarities. \textbf{NB:} Note that the XML format is more stable than the \texttt{.ccg} format, and in particular, the way in which unification constraints are specified in the \texttt{.ccg} format is apt to change. This manual was originally created before the CCG format existed. As a result, it is primarily geared towards writing grammars directly in XML. Over time, however, it will be updated to cover the use of the CCG format as well. For the time being, see \texttt{src/ccg2xml/README} for documentation of the \texttt{.ccg} format. \section{Using the XML-based format} In order to write \occg grammars directly in the XML-based format, you should be familiar with XML. 
Actually, all you need to know is that tags can be hierarchically and linearly organized, and that they must be ``closed'' (by \texttt{} or \texttt{/>}) with proper nesting, e.g. \begin{verbatim} Bond James Jimbo Double-Oh (Seven) \end{verbatim} \begin{verbatim} \end{verbatim} All the \occg-defined elements and attributes are listed in the XML schema validation files. For reference documentation, you can have a look at these files, which are located in the \texttt{\$OPENCCG\_HOME/grammars/} directory of your installation. For example, \texttt{categories.xsd} describes the tags that go into \occg\ categories. For more advanced use of \occg, it helps to know \href{http://www.w3.org/Style/XSL/}{XSLT}. \subsection{XML-based grammar architecture in \occg} A run-time grammar for \occg\ typically consists of five primary files, with the following canonical names: \begin{description} \item[\texttt{grammar.xml}] Specifies the name of the grammar, and lists the names of the other files. This file may also specify XSLT transformations to use in converting LFs to/from XML, and/or properties of a custom tokenizer (see \texttt{grammar.xsd} for details). \item[\texttt{lexicon.xml}] Specifies \emph{lexical families}. A lexical family specifies one or more related categories, with their associated feature structures and logical forms. Lexical families are loosely based on the notion of \emph{tree families} in \href{http://www.cis.upenn.edu/~xtag/}{XTAG}. \item[\texttt{morph.xml}] Specifies the \emph{words} of the grammar. Each word is related to a lexical family through the part-of-speech tag of the word. If a family is a closed class, we specify explicitly with a family what words are its members. \item[\texttt{rules.xml}] Specifies which combinatory rules are available to the grammar. For the purpose of this document, we assume that application, type-raising, and composition (harmonic as well as crossed) are available. Unary type changing rules are also placed into \texttt{rules.xml}. \item[\texttt{types.xml} (optional)] Specifies the syntactic and semantic type/sort hierarchies. Unlike HPSG, only atomic types are supported in \occg. Multiple-inheritance is allowed \cite{erkanms03}. \end{description} Standard practice is to store these files in a directory under the \texttt{grammars} directory of the \occg\ distribution. Besides the above files, it is also a good idea to have a \texttt{testbed.xml} file. A testbed is a list of test expressions, where we specify for each expression the number of parses ($\geq 0$) the grammar should yield, and optionally the intended LF. \section{Words and categories} \label{sec:cats} \subsection{Lexical families} Traditionally, the lexicon for a categorial grammar specifies for each word its own category. In \occg, categories are instead organized into lexical \textsl{families}, which are related to whole sets of words. (As mentioned earlier, the idea of families we employ here is loosely based on the notion of \emph{tree families} in XTAG.) This makes it possible to avoid giving the same specification over and over again in a lexicon. The simplest way in which words can be related to families is through their parts of speech: for a word we have to specify its part of speech, and for a family we have to specify the part of speech a word has to have for the family to be applicable. To control the applicability of a family, we can also declare it to be \textsl{closed}. 
A closed family is not applicable to \emph{every} word that has the appropriate part of speech, but only to those words (stems) that are listed with the family as its members. Note that a closed family does not exactly correspond to the notion of a closed class word, as open class words (especially verbs) are often listed as members of closed families, in order to assign them appropriate subcategorization frames. To illustrate, let's look at some examples from the \texttt{tiny} sample grammar. A family is defined within the following element: \begin{verbatim} : : \end{verbatim} \noindent These two families are for nouns and pronominal NPs, as their \textsl{name} attributes indicate; they have parts of speech \texttt{N} and \texttt{Pro}, respectively, given by the \textsl{pos} attribute. The pronominal NP family has \texttt{closed="true"}, indicating that it's a closed family. The members are \texttt{pro1} \ldots \texttt{pro3n}, where \texttt{pro1} is an abstract stem for the first person pronouns \gf{I, we, me, us}, and so on. In each family, we define one or more entries, using an \textsl{entry} element. Each entry defines a category with accompanying feature structure and logical form. Each entry is given a name; we usually give the main entry \texttt{name="Primary"}. An example of a family with multiple (ok, two) entries appears below. The first entry is named \texttt{DTV}, for ditransitive verb, as it specifies a category with two NP complements; the second entry is named \texttt{NP-PPfor}, as it specifies a category with an NP complement followed by a PP complement headed by \gf{for}. In both cases, the extra complement plays the role of Beneficiary in the semantics, motivating the grouping of these two entries into a single family. \begin{small} \begin{verbatim} : : \end{verbatim} \end{small} \subsection{Categories} Within an entry we define a category. A category can either be atomic or complex (i.e.\ a function). The example below illustrates how we specify an atomic category using the \textsl{atomcat} element, giving its label as a value of the attribute \textsl{type}. \begin{verbatim} : \end{verbatim} We can assign a feature structure to an atomic category using the \textsl{fs} element. The \textsl{fs} element has an \textsl{id} attribute so that we can explicitly reference the feature structure, when needed. \begin{verbatim} .. : \end{verbatim} We can add individual features using \textsl{feat} elements. In their simplest form, a feature has an \textsl{attr} specifying the attribute and a \textsl{val} giving the value of the attribute. \begin{verbatim} \end{verbatim} Now, since we don't want all nouns to be singular, we can instead declare the value of the \texttt{num} feature to be a variable, as follows: \begin{verbatim} : : \end{verbatim} \noindent Here the \textsl{featvar} element introduces a variable with \texttt{name="NUM"} as the value of the feature. Note that this feature specification serves as an implicit declaration that nouns have a \texttt{num} feature. As such, it interacts with the \textsl{inheritsFrom} mechanism for default unification, as will be explained below. With basic categories such as this one, it is a good idea to specify all relevant features. An entry can also specify a \emph{complex} category, i.e.\ a function. For that, we use the \textsl{complexcat} element. This element is essentially a list, enumerating the result category and its arguments in the order as given by a Steedman-style category. 
Argument categories may be atomic or complex (i.e., creating a higher-order function); the result category must be atomic (see \cite{Baldridge:2002} for discussion). \begin{table} \begin{center} \begin{tabular}{rcc} Rules & \occg & MMCCG \\ \hline application only & \texttt{*} & $\star$ \\ associative & \verb+^+ & $\diamond$ \\ permutative & \texttt{x} & $\times$ \\ permutative right & \texttt{x>} & $\times\triangleright$\\ permutative left & \texttt{} & $\triangleright$\\ associative permutative left & \texttt{<} & $\triangleleft$\\ all rules & \texttt{.} & $\bullet$ \\ \hline %[.2em] \end{tabular} \end{center} \caption{Slash modes} \label{slash-modes} \end{table} For each argument we give the slash using a \textsl{slash} element that has attributes \textsl{mode} and \textsl{dir} to specify what kind of slash we are dealing with. The available slash modes are given in Table~\ref{slash-modes}. (Note that in XML, the angle brackets \texttt{<} and \texttt{>} must be escaped as \texttt{\&lt;} and \texttt{\&gt;}, respectively.) See \cite{Baldridge:2002}[p.\ 100] and \cite{Baldridge/Kruijff:2003} for discussion of the slash modes in multimodal CCG. Slashes may also have variables over modes, and may be inert, as discussed in \cite{Baldridge:2002}[Ch.\ 8]. \begin{figure} \begin{quote} \begin{verbatim} .. .. .. : \end{verbatim} \end{quote} \[ \cf{s\fsb{1}{}} \bs \cf{np\fsb{2}{nom}} / \cf{np\fsb{3}{acc}} \] \caption{Transitive verb category} \label{tv-cat} \end{figure} Figure~\ref{tv-cat} shows how the category for a transitive verb can be defined; at the bottom of the figure is a more human-friendly notation for the category. The result category is \cf{s}. There are two argument categories, an \cf{np} with accusative case to the right, and an \cf{np} with nominative case to the left. In the human notation, the feature structure id's are shown subscripted in angle brackets, followed by the features themselves. Note that when the intended feature is evident from the feature value, the feature name is left off; also, when the slash mode is consistent with the slash direction (e.g.\ $\triangleright$ and /), the mode is not shown, as in \cite{Baldridge:2002}. \subsection{Words} \label{words} Since pronouns retain case marking in English, the case requirements on the arguments of a transitive verb have the effect of determining which pronouns can appear in which positions. For example, the first person pronoun \gf{I} is allowed in subject position, while \gf{me} is allowed in object position, but not vice-versa. This naturally leads us to how we specify properties of words in the \texttt{morph.xml} file. For each word, we have to give its wordform and part of speech, as follows: \begin{verbatim} \end{verbatim} \noindent If the word's stem differs from its form, the stem must be listed too: \begin{verbatim} \end{verbatim} \begin{figure} %\begin{quote} \begin{small} \begin{verbatim} : \end{verbatim} \end{small} %\end{quote} \[ \begin{array}{rcl} \gf{I} & \vdash & \cf{np\fsb{2}{1st,sg,nom}} \\ \gf{me} & \vdash & \cf{np\fsb{2}{1st,sg,acc}} \\ \gf{we} & \vdash & \cf{np\fsb{2}{1st,pl,nom}} \\ \gf{us} & \vdash & \cf{np\fsb{2}{1st,pl,acc}} \\ \end{array} \] \caption{Case macros} \label{case-macros} \end{figure} To add further information, such as case, we use \textsl{macros}, as illustrated in Figure~\ref{case-macros}. In the figure, the entries for the first person pronouns are given, along with their syntactic macros, specified by the \textsl{macros} attribute.
The case macros, named \texttt{@nom} and \texttt{@acc}, appear next in the figure, defined by the \textsl{macro} elements. These macros set the case feature on the category associated with the word (via its part of speech), by accessing the feature structure with id 2 and setting the value of the \texttt{case} feature to \texttt{nom} and \texttt{acc}, respectively. (The number macros \texttt{@sg} and \texttt{@pl} are analogous.) The effects of the macros are shown at the bottom of the figure, where the word forms for the first person pronouns are paired with their associated categories, which differ in their number and case values. \begin{figure} %\begin{quote} \begin{small} \begin{verbatim} : \end{verbatim} \end{small} %\end{quote} \[ \begin{array}{rcl} \gf{buy} & \vdash & \cf{s\fsb{1}{}} \bs \cf{np\fsb{2}{non\mbox{-}3rd,sg,nom}} / \cf{np\fsb{3}{acc}} \\ \gf{buys} & \vdash & \cf{s\fsb{1}{}} \bs \cf{np\fsb{2}{3rd,sg,nom}} / \cf{np\fsb{3}{acc}} \\ \gf{buy} & \vdash & \cf{s\fsb{1}{}} \bs \cf{np\fsb{2}{pl,nom}} / \cf{np\fsb{3}{acc}} \\ \gf{bought} & \vdash & \cf{s\fsb{1}{}} \bs \cf{np\fsb{2}{nom}} / \cf{np\fsb{3}{acc}} \\ \end{array} \] \caption{Person macros} \label{pers-macros} \end{figure} As another example, Figure~\ref{pers-macros} shows how the person macros are used (together with the number macros) in setting up person and number agreement constraints with various forms of the verb \gf{buy}. Note that the tense macros \texttt{@pres} and \texttt{@past} do not contribute syntactic features; instead they contribute semantic features to the logical form (cf.\ Section~\ref{lfs}). Additionally, note that the macro \texttt{@non-3rd} supplies a syntactic person value that is compatible with both \texttt{1st} and \texttt{2nd}, as specified in \texttt{types.xml} (cf.\ Section~\ref{types}). It is important to note that macro instantiation does not involve unification: macros set feature values regardless of any value that might already be present for the feature in the feature structure. Conceivably, it would be convenient on occasion (though computationally more expensive) to use unification, rather than overwriting, during macro instantiation, but there is no support for doing so at present. \subsection{Unification} \begin{figure} \begin{quote} %\begin{small} \begin{verbatim} .. .. \end{verbatim} %\end{small} \end{quote} \[ \gf{the} ~ \vdash ~ \cf{np\fsb{2}{3rd}}/_{\!\!\diamond}\cf{n\fsb{2}{}} \] \caption{The definite article} \label{def-art} \end{figure} \begin{figure} \begin{center} \deriv{3}{ \gf{the} & \gf{teacher} & \gf{buys} \\ \uline{1} & \uline{1} & \uline{1} \\ \cf{np\fsb{2}{3rd}}/_{\!\!\diamond}\cf{n\fsb{2}{}} & \cf{n\fb{sg}} & \cf{s} \bs \cf{np\fb{3rd,sg,nom}} / \cf{np\fb{acc}} \\ \fapply{2} \\ \cmc{2}{\cf{np\fsb{2}{3rd,sg}}} \\ \ftype{2} \\ \cmc{2}{\cf{s\fsb{1}{}} / \cf{s\fsb{1}{}} \bs \cf{np\fsb{2}{3rd,sg}}} \\ \fcomp{3} \\ \cmc{3}{\cf{s\fsb{1}{}} / \cf{np\fb{acc}}} } \vspace{1cm} \deriv{3}{ \gf{the} & \gf{teachers} & \gf{*buys} \\ \uline{1} & \uline{1} & \uline{1} \\ \cf{np\fsb{2}{3rd}}/_{\!\!\diamond}\cf{n\fsb{2}{}} & \cf{n\fb{pl}} & \cf{s} \bs \cf{np\fb{3rd,sg,nom}} / \cf{np\fb{acc}} \\ \fapply{2} \\ \cmc{2}{\cf{np\fsb{2}{3rd,pl}}} \\ \ftype{2} \\ \cmc{2}{\cf{s\fsb{1}{}} / \cf{s\fsb{1}{}} \bs \cf{np\fsb{2}{3rd,pl}}} \\ \badcomb{3}{{>}\mathbf{B}} \\ \cmc{3}{\cf{s\fsb{1}{}} / \cf{np\fb{acc}}} } \end{center} \caption{Unification and subject-verb agreement} \label{subj-v-agr} \end{figure} Speaking of unification, let's examine the role it plays in enforcing subject-verb agreement. 
The category for the definite article is given in Figure~\ref{def-art}. The definite article is compatible with both singular and plural nouns, but it must retain this number information for subject-verb agreement to work. Propagating number information is accomplished here by setting the feature structure id to be same on both the \cf{np} result category and on the argument category \cf{n} (i.e., we have \texttt{id="2"} in both cases). Figure~\ref{subj-v-agr} provides an illustration, showing how \gf{the teacher} ends up as a singular \cf{np}, while \gf{the teachers} ends up as a plural one.\footnote{Note that feature structure id's are only shown when relevant to unification. Also, in \occg\ derivations, the id's are actually mapped to ``fresh'' ones after lexical lookup, to avoid any accidental coindexations across different lexical items.} Type raising the \cf{np} and forward composing it with \gf{buys} requires it to unify with the backwards \cf{np} argument of the verb, and in particular, requires the number feature to be singular. Since this is only the case with \gf{the teacher}, the last step in the derivation of \gf{the teachers *buys} will be blocked by a unification failure. \begin{figure} %\begin{quote} \begin{small} \begin{verbatim} \end{verbatim} \end{small} %\end{quote} \[ \gf{for} ~ \vdash ~ \cf{pp\fb{for,acc,X}} \, /_{\!\!\triangleleft} \, \cf{n\fb{acc,X}} \] \caption{Default unification with case marking prepositions} \label{prep-nom} \end{figure} Coindexing two feature structures ensures that all their features will take on the same values. There are times, however, when we want two feature structures to just mostly take on the same values, except for one or two particular features. To support such cases, \occg\ includes a limited form of default unification, specified by the \textsl{inheritsFrom} attribute of a feature structure element. The \textsl{inheritsFrom} mechanism is implemented by compiling out the default unification of two feature structures into individual feature equations at the time of lexical lookup. This works as follows. First, any features appearing on the target category, but not the result category, are copied over. Then, for every feature that has been observed in the grammar for the result category---except for any features that already appear there---a feature equation is added, i.e.\ the feature is set to the same variable on both the target and result categories. As an example, Figure~\ref{prep-nom} shows the category for ``case marking'' prepositions, i.e.\ those prepositions which are assumed to play a purely syntactic role. The \cf{pp} result category has a \texttt{lex} feature which is instantiated by the stem of the actual lexical item, as specified by the keyword \texttt{"[*DEFAULT*]"}. Its remaining features are ``inherited from'' the feature structure with id 3, i.e.\ the one for the argument \cf{np}, as specified by \texttt{inheritsFrom="3"}. When the category for a case-marking preposition (such as \gf{for} here) is instantiated, a feature equation is established between the \cf{pp} and \cf{np} categories for the \texttt{index} feature (whose purpose will be discussed in the next section); the value of the \texttt{case} feature is also copied over. 
Thus, in the figure, both the \cf{pp} result category and the \cf{np} argument category have an index variable \texttt{X} (and accusative case), while only the result PP has a \texttt{lex} feature, with \texttt{for} as its value.\footnote{Since the case feature is arguably superfluous on the result PP, one could avoid it by just including an explicit feature equation for the \texttt{index} variable. Generally though, it's simpler and less error-prone to use the \textsl{inheritsFrom} mechanism than to manually include all the relevant feature equations.} \subsection{Set args} \begin{figure} \begin{quote} \begin{verbatim} .. .. .. : \end{verbatim} \end{quote} \[ \gf{nanghuhuli (``catches'')} ~ \vdash ~ \cf{s} \{ / \cf{np\fb{nom}}, /\!_{\times} \cf{np\fb{gen}} \} \] \caption{Tagalog transitive verb with set args} \label{tagalog-set-args} \end{figure} To conclude our discussion of specifying syntactic categories, we should mention the availability of set args and dollar variables in \occg. Set args enable us to define categories that allow arguments to appear in any order, as illustrated in Figure~\ref{tagalog-set-args} for the Tagalog verb \gf{nanghuhuli (``catches'')}. In the figure, both the nominative and genitive arguments must appear to the right of the verb, but their relative order is unconstrained. Note that the nominative argument is given the more powerful associative and permutative slash, while the genitive argument is given the permutative-only slash; see \cite{Baldridge:2002}[Ch.\ 7] for discussion, and \cite{bozsahinsteedman03} for further examples. \subsection{Dollar variables} \begin{figure} \begin{quote} \begin{verbatim} .. .. .. \end{verbatim} \end{quote} \[ \gf{and} ~ \vdash ~ \cf{s}\$_{1} \bs_{\star} \cf{s}\$_{1} /\!_{\star} \cf{s}\$_{1} \] \caption{Category with dollar variables for sentential coordination} \label{sent-coord-dollar} \end{figure} Dollar variables range over a stack of arguments, and can be useful in defining categories for conjunctions, type-raised categories for quantifiers, and categories for unary rules (cf.\ \cite{Baldridge:2002}[Ch.\ 8]). Figure~\ref{sent-coord-dollar} shows the category for \gf{and} that allows a range of clausal categories to be coordinated---e.g., transitive verbs, verb phrases, or subject-verb constituents, in right-node raising---as discussed in \cite{White-RLAC:2004}. \section{Logical forms} \label{lfs} \subsection{Hybrid logic dependency semantics} To associate meanings with categories, we need to take care of two things: the structure of the meaning (logical form) itself, and the relation between the category and that meaning. Usually the latter comes down to specifying how the meanings of arguments are to be fit into the logical form. As logical forms we use \emph{hybrid logical terms} that specify semantic dependency structures; for details, see \cite{Kruijff:2001,Baldridge/Kruijff:2002,White/Baldridge:2003,White-RLAC:2004}. (If you're wondering where the $\lambda$'s have gone, see Section~\ref{tko-lambdas}.) We give the logical form of a category using the \textsl{lf} element: \begin{verbatim} : .. \end{verbatim} \noindent The \textsl{lf} element must always appear at the end of a category specification (whether atomic or complex). The simplest logical form is of the form $@_{X} \phi$, with $\phi$ a proposition. We interpret $X$ as the \emph{discourse referent} of the proposition. 
For the proposition itself, we can follow linguistic tradition and use a word's stem to represent its meaning, except that we'll use boldface rather than prime notation (i.e., we'll represent the meaning of \gf{word} as \C{word} rather than $\mathit{word}'$). We achieve exactly this effect using the keyword \texttt{"[*DEFAULT*]"}, as shown below: \begin{verbatim} \end{verbatim} \noindent The \textsl{satop} element introduces a satisfaction operator @, along with a nominal variable, or \textsl{nomvar}, $X$. (In the grammar, we use logic variables rather than concrete instantiations for nominals; during parsing or realization, \occg\ instantiates these variables dynamically.) Nominals can have types (or sorts) associated with them, as will be explained further in Section~\ref{types}; here, $X$ is allowed to be any subtype of semantic object. The \textsl{prop} element specifies the proposition. \begin{figure} \begin{small} \begin{verbatim} [lexicon.xml] [morph.xml] : \end{verbatim} \end{small} \[ \gf{flower} ~ \vdash ~ \cf{n\fsb{2}{sg,X\!:\con{thing}}} ~ : ~ @_{X\!:\con{thing}}(\C{flower} \wedge \modp{num}\con{sg}) \] \caption{Noun with logical form} \label{noun-lf} \end{figure} \subsection{The syntax-semantics interface} To establish the interface between syntactic structure, as defined by the category, and the logical form, we co-index categories with nominals in the logical form, by adding an attribute \texttt{index} to the feature structure of each category and giving that attribute the corresponding nominal as value. To illustrate, Figure~\ref{noun-lf} (top) shows the complete category definition for nouns. Note how the feature structure includes a feature named \texttt{index}, whose value is a logical form just consisting of the nominal variable $X$---that is, a variable with the same name as the one introduced in the satisfaction operator of the LF further down.\footnote{It suffices to put all the semantic type restrictions on the nominals in the LF; i.e., it's not necessary to put them on the \texttt{index} variables as well.} In the middle of Figure~\ref{noun-lf}, the entry for the noun \gf{flower} in \texttt{morph.xml} is shown. This entry includes \texttt{thing} as the semantic class (i.e.\ its semantic type), which is unified with the type of the nominal head $X$ during lexical instantiation. The entry also includes the macros \texttt{@sg} and \texttt{@sg-X}, where the former adds the syntactic feature of singular number, while the latter adds the semantic feature of singular number on the nominal $X$ (as shown below the entry). At the bottom of the figure is the complete category that results from lexical lookup and instantiation of \gf{flower}. Note that it is possible to fill in the semantic head's proposition with something other than the stem. To do so, we can specify a \textsl{pred} to use in place of the stem, when listing a stem as a member of a family: \begin{verbatim} : : : \end{verbatim} \noindent With this specification, the proposition \C{catch} will appear in the logical form for the Tagalog verb \gf{nanghuhuli}: \begin{exe} \ex \label{tg-catch} \( \begin{array}{rcl} \gf{nanghuhuli} & \vdash & \cf{s\fb{E}} \{ / \cf{np\fb{nom,X}}, /\!_{\times} \cf{np\fb{gen,Y}} \} ~ : ~ \\ && @_{E}(\C{catch} \wedge \modp{Actor}X \wedge \modp{Patient}Y) \\ \end{array} \) \end{exe} \begin{figure} \begin{small} \begin{verbatim} .. .. 
\end{verbatim} \end{small} \[ \begin{array}{rcl} \gf{bought} & \vdash & \cf{s\fsb{1}{E:\con{action}}} \bs \cf{np\fsb{2}{nom,X\!:\con{animate\mbox{-}being}}} / \cf{np\fsb{3}{acc,Y\!:\con{sem\mbox{-}obj}}} ~ : \\ && @_{E:\con{action}}(\C{buy} \wedge \modp{tense}\con{past}) \wedge \\ && @_{E:\con{action}}(\modp{Actor}X\!\!:\!\con{animate\mbox{-}being}) \wedge \\ && @_{E:\con{action}}(\modp{Patient}Y\!\!:\!\con{sem\mbox{-}obj}) \\ \end{array} \] \caption{Transitive verb with logical form} \label{tv-lf} \end{figure} \subsection{Dependency relations} To introduce dependency relations---like those we just saw for the Tagalog verb \gf{nanghuhuli}---we use the \textsl{diamond} element. For example, to state that an English transitive verb has a logical form with an \modp{Actor} and a \modp{Patient}, we can specify the category shown in Figure~\ref{tv-lf}. At the bottom of the figure, this category appears instantiated for the verb \gf{bought}. Note that for ease of display, the logical form has been partially flattened, with each dependency relation appearing on a separate line. (In \occg, logical forms are automatically flattened prior to parsing or realization.) \begin{figure} \begin{small} \begin{verbatim} .. \end{verbatim} \end{small} \[ \gf{the} ~ \vdash ~ \cf{np\fsb{2}{3rd,X\!:\con{sem\mbox{-}obj}}}/_{\!\!\diamond}\cf{n\fsb{2}{X\!:\con{sem\mbox{-}obj}}} ~ : ~ @_{X\!:\con{sem\mbox{-}obj}}(\modp{det}\con{the}) \] \caption{Determiner with logical form} \label{det-lf} \end{figure} \subsection{Function words} Finally, Figure~\ref{det-lf} illustrates how we can specify the meaning of function words, i.e.\ words that have no independent meaning. The example gives a specification for determiners, which add a semantic feature $\modp{det}$ to the meaning of the nominal head they modify. The noun itself provides the meaning through co-indexation with $X$, the root nominal of the logical form for the determiner. At the bottom of the figure is the category instantiated for the definite article \gf{the}. To support the realization of function words, the semantic relation or feature introduced by the word must be declared using the \textsl{indexRel} attribute on the \textsl{family} element.\footnote{In principle, it would be possible for \occg\ to figure out when a semantic relation or feature should be used for indexing purposes, but the possibility of adding further semantic content via macros makes it non-trivial to do so.} For example, in Figure~\ref{det-lf} we have \texttt{indexRel="det"}, which indicates that the $\modp{det}$ feature should be used to trigger the lookup of the appropriate determiner. (Normally, semantic relations or features are introduced as part of the meaning of content words.) When function words are semantically null---e.g., with case-marking prepositions in the \texttt{tiny} grammar---the keyword \texttt{*NoSem*} should be given as the value of the \textsl{indexRel} attribute. \subsection{Relation sorting} It is possible to specify the order in which to display relations appearing at the same level in the logical forms. By default, relations are sorted alphabetically, with a few exceptions, e.g.\ that $\modp{Restr}$ should appear before $\modp{Body}$. You can customize the order in which relations appear using the optional \textsl{relation-sorting} element in \texttt{lexicon.xml}. See the \texttt{lexicon.xsd} schema for details. 
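To make the elements introduced above concrete, a schematic \textsl{lf} specification for a transitive verb's semantics might look roughly as follows; the element names are the ones introduced in this section, but the exact attribute names (e.g.\ \texttt{nomvar}, \texttt{name} and \texttt{mode}) should be checked against the schema files and the \texttt{tiny} grammar:
\begin{verbatim}
<lf>
  <satop nomvar="E">
    <prop name="[*DEFAULT*]"/>
    <diamond mode="Actor"><nomvar name="X"/></diamond>
    <diamond mode="Patient"><nomvar name="Y"/></diamond>
  </satop>
</lf>
\end{verbatim}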
\subsection{From CCG to \occg: Taking care of lambdas} \label{tko-lambdas} In CCG, logical forms are normally given using terms from the lambda calculus, e.g. \begin{exe} \ex \label{buy-lambdas} \( \gf{buy} ~ \vdash ~ (\cf{s} \bs \cf{np\fb{nom}}) / \cf{np\fb{acc}} ~ : ~ \lambda x_2 x_1 . \mathrm{buy}^\prime x_2 x_1 \) \end{exe} \begin{exe} \ex \label{tg-catch-lambdas} \( \gf{nanghuhuli} ~ \vdash ~ \cf{s} \{ / \cf{np\fb{nom}}, /\!_{\times} \cf{np\fb{gen}} \} ~ : ~ \lambda \{ x_1, x_2 \} . \mathrm{catch}^\prime x_2 x_1 \) \end{exe} \noindent In (\ref{buy-lambdas}), $x_2$ corresponds to the outermost argument, $/ \cf{np\fb{acc}}$, and $x_1$ to the innermost one, $\bs \cf{np\fb{nom}}$. Example (\ref{tg-catch-lambdas}) uses set-lambda notation with the following convention (cf. \cite{bozsahinsteedman03}): the lambda operator binds a set of variables which are paired with the set of arguments in left-to-right order. Thus, in the second example above, $x_1$ corresponds to $/ \cf{np\fb{nom}}$, and $x_2$ to $/\!_{\times} \cf{np\fb{gen}}$. The interpretation of the CCG $\lambda$-terms above is as in (\ref{pas}), where the argument $x_i$ c-commands $x_{i+j}$ for $j=1,2,\cdots n-i$, at the level of predicate-argument structure (c-command is called LF-command in CCG for that reason). This is how (and where) CCG defines binding constraints. \begin{exe} \ex \label{pas} \begin{minipage}{0.7\textwidth} \includegraphics{pas.pdf} \end{minipage} \end{exe} As we have seen earlier in this section, \occg\ uses hybrid logic dependency semantics (HLDS) terms, rather than $\lambda$-terms, in its logical forms. For example, the HLDS terms for (\ref{buy-lambdas}) and (\ref{tg-catch-lambdas}) appeared in Figure~\ref{tv-lf} and in (\ref{tg-catch}). These terms differ from their $\lambda$-counterparts in a couple of ways. First, in semantic construction, argument binding is accomplished through unification, rather than via function application. And second, predicates are typically connected to their arguments via semantic roles, such as $\modp{Agent}$ and $\modp{Patient}$---though nothing prevents relations such as $\modp{Arg1}$ and $\modp{Arg2}$ from being used instead. Using semantic roles can be more convenient for applications, and makes it possible to capture semantic similarities across argument structure alternations. The downside is that it makes it impossible to enforce binding constraints. In principle, relations encoding semantic roles (e.g.\ $\modp{Agent}$ and $\modp{Patient}$) could be combined with ones for argument structure (e.g.\ $\modp{Arg1} \ldots \modp{ArgN}$) in the same HLDS logical form, though this has not yet been tried. \section{Types} \label{types} Types (aka sorts) allow for some abstraction, generalization and specialization in an \occg\ grammar. Unlike HPSG, \occg\ only employs atomic types. These types may be used as restrictions on syntactic or semantic feature variables, or given as values of syntactic or semantic features. Multiple-inheritance is allowed (see \cite{erkanms03} for further information). Types are kept in the types file, usually named \texttt{types.xml}. This file is optional, which means that features can be untyped (actually, all features will be considered to be of type \texttt{top} in this case, which is the only predefined type). 
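For instance, a types file fragment defining the hierarchy of person values discussed below (cf.\ Figure~\ref{pers-vals}) might look roughly like this, though note that the attribute name used here for the list of parent types (\texttt{parents}) is an assumption and should be checked against the schema files:
\begin{verbatim}
<type name="pers-vals"/>
<type name="non-3rd" parents="pers-vals"/>
  <type name="1st" parents="non-3rd"/>
  <type name="2nd" parents="non-3rd"/>
<type name="3rd" parents="pers-vals"/>
\end{verbatim}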
\begin{figure} \begin{verbatim} \end{verbatim} \caption{Hierarchy of syntactic person values} \label{pers-vals} \end{figure} In Section~\ref{words}, Figure~\ref{pers-macros}, we saw how the value of the person feature \texttt{non-3rd} could be used to define a category compatible with both \texttt{1st} and \texttt{2nd} person singular subjects. This definition relies on the following specification of person values in the \texttt{tiny} grammar's \texttt{types.xml} file, listed in Figure~\ref{pers-vals}. As this example shows, types are defined using a \textsl{type} element and are required to have a \textsl{name}. They may also have a space-separated list of one or more \textsl{parent} types. Indenting may be used to show the primary type-subtype hierarchy:\footnote{Since multiple parents are allowed, nesting of elements is not used to define type-subtype relationships.} here, \texttt{1st} and \texttt{2nd} are subtypes of \texttt{non-3rd}, while \texttt{non-3rd} and \texttt{3rd} are subtypes of \texttt{pers-vals}. If no parent types are listed---as with \texttt{pers-vals}---the type is implicitly a subtype of \texttt{top}. \begin{figure} \begin{verbatim} \end{verbatim} \caption{Hierarchy of semantic types/sorts} \label{ont-sorts} \end{figure} The \occg\ type system does not distinguish between syntactic and semantic types; it is up to the grammar designer to ensure their systematic use. For example, if there is to be a syntactic hierarchy and a semantic hierarchy of types, it is a good idea to define a `top object' for each (or at least for one of them), e.g.\ \texttt{sem-obj} as the root of the semantic type hierarchy, as illustrated in Figure~\ref{ont-sorts}. The types in this figure are the ones assumed by the transitive verb category given in Figure~\ref{tv-lf}. With these types, \occg\ can parse and realize \gf{he bought a flower}, but not \gf{*a flower bought he}, since \gf{flower} has type \texttt{thing}, and \texttt{thing} is not compatible with the type \texttt{animate-being}, as is required for the \modp{Actor} of a \C{buy} action. \section{Rules} \label{rules} The rules file, typically named \texttt{rules.xml}, specifies the combinatory rules for a grammar. The rule specifications for the \texttt{tiny} grammar appear below: \begin{verbatim} \end{verbatim} \noindent In addition to the rules shown here, it is also possible to have substitution rules, as well as additional type raising rules. By default, the argument and result categories for a type raising rule are \cf{np} and \cf{s}, respectively. To create a type raising rule using different categories, you can use an \textsl{arg} and/or \textsl{result} element to specify the desired atomic category. For example, a backward type raising rule for prepositional phrases is included as the last rule above. Theoretically speaking, CCG combinatory rules are universal; \emph{any} lexicalized grammar has access to them if it uses in the lexical categories the modalities licensed by the rules (see \cite{Steedman/Baldridge:2003} for further information). The rules file can also incorporate rules that are language specific. To illustrate, let's consider the case of pro drop in Turkish. 
Turkish is a pro-drop language, which means that subjects of finite clauses can be dropped because morphology of the finite verb already indicates the subject: \begin{itemize} \item[(a)] \begin{tabular}{ll} Ben & uyu-du-m/*-n \\ I & sleep-PAST-1SG/*-2SG \\ \multicolumn{2}{l}{`I slept.'} \\ \end{tabular} \item[(b)] \begin{tabular}{l} Uyu-du-m \\ sleep-PAST-1SG \\ `(I) slept.' \\ \end{tabular} \end{itemize} Pro drop can be modelled in different ways. For example, one can write a lexical rule to generate the derived lexical entries of finite verbs \emph{in} the lexicon, so that every finite verb has two lexical entries, one derived from the other---e.g.\ (\ref{ex:prodrop}) below, which asserts $\cf{s\fb{fin}}\bs\cf{np\fb{acc}}$ and $\cf{s\fb{fin}}$ entries in the lexicon from $\cf{s\fb{fin}}\bs\cf{np\fb{nom}}\bs\cf{np\fb{acc}}$ and $\cf{s\fb{fin}}\bs\cf{np\fb{nom}}$, etc. \begin{equation} \label{ex:prodrop} \cf{s\fb{fin}}\bs\cf{np\fb{nom}}\$_1 \Rightarrow \cf{s\fb{fin}}\$_1 \end{equation} This strategy has theoretical and practical implications. Theoretically, it assumes that \emph{all} morphology is confined to the lexicon, including inflectional morphology, which is usually regarded as part of syntax (see e.g.\ \cite{bozsahin02cl} for its implications for the transparency of syntax-semantics correspondence). \occg\ does not assume that there are only words in the lexicon; anything that can bear a category (words, affixes, clitics) can be a lexical item.\footnote{Currently, \occg\ does not have any mechanism to enforce the \emph{Lexical Integrity Principle} of \cite{bresnanmchombo95}, which basically states that words are islands as far as syntax is concerned, e.g.\ it is not possible to extract out of a word. \cite{bozsahin02cl} proposes that different attachment characteristics of words and bound morphemes can be factored into CCG's lexical entries and combinatory rules (the latter simply projects them onto surface grammar), in effect rendering LIP as a phonological principle, but this is an open problem for now.} On the practical side, it assumes that all inflected forms of the verb are listed in the lexicon. For morphologically rich languages such as Turkish, this amounts to around 2$^{8}$ entries per verb because Turkish has 8 inflections in the verb paradigm, all of which are optional. \begin{figure} \begin{verbatim} \end{verbatim} \caption{Unary rule for pro drop} \label{pro-drop-rule} \end{figure} An alternative is to add a unary rule for pro drop to \texttt{rules.xml}. This rule will apply ``on the fly'', that is, it can apply to lexical or combinatorially-derived inflected verb forms. The rule, which implements (\ref{ex:prodrop}), may be specified as shown in Figure~\ref{pro-drop-rule} (we omit the \$ variable for simplicity). Unary rules are defined using a \textsl{typechanging} element, since such rules must change the type of the argument category---otherwise, nothing would prevent the rule from applying again and again to its own output. \section{Trying it out} Once you've configured and built \occg\, per the \texttt{README} file, you're ready to try out the grammar testing tools. You can experiment with the grammars described in \texttt{SAMPLE\_GRAMMARS} or make one of your own. In the latter case, it will be easier if you create your grammar in its own subdirectory of the \texttt{grammars} directory. There are (at present) three command line tools for trying grammars out: \texttt{tccg}, \texttt{ccg-test} and \texttt{ccg-realize}. 
\subsection{\texttt{tccg}}

The \texttt{tccg} tool (for ``text CCG'') is for interactively testing a grammar. Its (primary) usage is
\begin{verbatim}
tccg (<grammar file>)
\end{verbatim}
\noindent The default grammar file name is \texttt{grammar.xml}. You can try it out by going to the \texttt{grammars/tiny} directory and running \texttt{tccg}, like so:\footnote{Examples like this one may have an occasional extra line break to improve readability.}
\begin{small}
\begin{verbatim}
D:\Mike\dev\openccg\grammars\tiny>tccg
Loading grammar from URL: file:/D:/Mike/dev/openccg/grammars
  /tiny/grammar.xml
Grammar 'tiny' loaded.

Enter strings to parse.
Type ':r' to realize selected reading of previous parse.
Type ':h' for help on display options and ':q' to quit.
You can use the tab key for command completion,
Ctrl-P (prev) and Ctrl-N (next) to access the command history,
and emacs-style control keys to edit the line.

tccg>
\end{verbatim}
\end{small}
Typing in \texttt{:h} shows all the available commands. For example, \texttt{:derivs} turns on the display of derivations when you parse an expression:
\begin{small}
\begin{verbatim}
tccg> :derivs
tccg> the teacher buys
3 parses found.

Parse 1: s/np
------------------------------
(lex)  the :- np/^n
(lex)  teacher :- n
(>)    the teacher :- np
(>T)   the teacher :- s/@i(s\@inp)
(lex)  buys :- s\np/np
(>B)   the teacher buys :- s/np

tccg>
\end{verbatim}
\end{small}
\noindent Here we see a (simplified) vertical display of the derivation seen earlier in Figure~\ref{subj-v-agr}. (If you have \LaTeX\ installed, it's also possible to see derivations like those in Figure~\ref{subj-v-agr} using the \texttt{:vison} command, but note that its current behavior is a bit flaky.) Only the first parse is shown; the other two parses, for the ditransitive and \cf{np} \cf{pp\fb{for}} categories of the verb, can be seen by turning on all derivations with the \texttt{:all} command. To see the features on the categories, you can use the \texttt{:feats} command, optionally with a subset of features to show. Logical forms can be shown with the \texttt{:sem} command:
\begin{small}
\begin{verbatim}
tccg> :noderivs
tccg> :sem
tccg> she bought the policeman a flower
1 parse found.

Parse: s :
  @b1:action(buy ^ past ^
             (p1:animate-being ^ pro3f ^ sg) ^
             (p2:person ^ policeman ^ the ^ sg) ^
             (f1:thing ^ flower ^ a ^ sg))

tccg>
\end{verbatim}
\end{small}
To see the realizations for this logical form (i.e., the one from the previous parse), use the \texttt{:r} command:
\begin{small}
\begin{verbatim}
tccg> :nosem
tccg> :r
[1.000] she bought the policeman a flower :- s
[0.167] she bought a flower for the policeman :- s

tccg>
\end{verbatim}
\end{small}
\noindent Realizations are ordered by their n-gram similarity to the previously entered expression. You can have a look in the \texttt{morph.xml} file for more words to form expressions with. Note that the settings of the various options available in \texttt{tccg} persist between sessions; use \texttt{:reset} to undo all these settings and return to the default ones.

\subsection{\texttt{ccg-test}}

The \texttt{ccg-test} tool is for regression testing, and also provides options for timing the realizer. Its (primary) usage is
\begin{verbatim}
ccg-test (-noparsing|-norealization) (-g <grammar file>) (<regression file>)
\end{verbatim}
By default, \texttt{ccg-test} will use the grammar in the current directory and the default regression file, \texttt{testbed.xml}.
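As a rough illustration of what a regression file contains, each entry pairs a test string with its expected number of parses; the element and attribute names below are hypothetical (check the \texttt{testbed.xml} files in the sample grammars for the exact format):
\begin{verbatim}
<regression>
  <item numOfParses="1" string="she bought the policeman a flower"/>
  <item numOfParses="0" string="a flower bought he"/>
</regression>
\end{verbatim}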
Note that you can set realizer options, such as its time limit, in \texttt{tccg}, e.g.\ by issuing the command \texttt{:tl 1000} (for time limit 1000 ms.), and this value will persist and be used by \texttt{ccg-test}.

\subsection{\texttt{ccg-realize}}

The \texttt{ccg-realize} tool provides a sample interface to the realizer (see the underlying \texttt{opennlp/ccg/Realize.java} file), and can be an aid in debugging realization. It loads a grammar, runs the realizer on an input XML file, and logs its processing to an output text file (or to system out). Its usage is
\begin{verbatim}
ccg-realize (-g <grammar file>) <input file> (<output file>)
\end{verbatim}
You can create input files for \texttt{ccg-realize} using the \texttt{:2xml} option in \texttt{tccg}.

\section{Building grammars}

\occg\ comes with various utilities to help you build the files used by the runtime system---and to validate their contents---rather than writing them entirely by hand. The utilities take advantage of the \texttt{ccg-build} front end to the Apache Ant (\url{http://ant.apache.org}) build tool. In principle, \texttt{ccg-build} allows you to organize your files in any way you like to produce the runtime grammar files.

\subsection{Validating the grammar files}

You can use \texttt{ccg-build} to validate the grammar files against their XML schemas. To do so, you need to have a \texttt{build.xml} file in your grammar directory, which contains build tasks for the Apache Ant tool to carry out. The \texttt{tiny} grammar directory contains a build file which just validates the runtime files, as shown below:
\begin{small}
\begin{verbatim}
D:\Mike\dev\openccg\grammars\tiny>ccg-build
Buildfile: build.xml

init:
     [echo] ----------- OpenCCG ------------

grammar:
     [echo] Validating grammar.xml, lexicon.xml, morph.xml,
            rules.xml and types.xml

BUILD SUCCESSFUL
Total time: 4 seconds
\end{verbatim}
\end{small}
\noindent If there are any errors, validation with \texttt{ccg-build} gives relatively informative error messages. Loading a grammar into \texttt{tccg} will perform some further checks, but note that loading a grammar with errors usually means \texttt{tccg} croaks---outputting only (less informative) stack traces---so it's good practice to validate any changes you make to your grammars prior to running \texttt{tccg}.

\subsection{Using a \texttt{dict.xml} file}

Rather than creating a \texttt{morph.xml} file directly, you can employ a \texttt{dict.xml} file, which groups word forms by their stems and parts of speech, and lists the closed families for a given stem. A \texttt{dict.xml} file usually works together with a file called \texttt{lexicon-base.xml}, which does not contain \textsl{member} entries for families. From the \texttt{dict.xml} file---which also contains macro definitions---the \texttt{morph.xml} file can be generated automatically, with proper hooks to a derived \texttt{lexicon.xml} file, using \texttt{ccg-build}. (See the \texttt{dict.xsd} schema in the \texttt{grammars} directory for a complete description.) In short, the simplest way to use a \texttt{dict.xml} file with \texttt{ccg-build} is to prepare a family of categories in the file named \texttt{lexicon-base.xml} without \textsl{member} entries, and to group stems and their word forms in a file called \texttt{dict.xml}, along with macro definitions. A sample entry from the \texttt{cem-english} \texttt{dict.xml} file appears below:
\begin{verbatim}
\end{verbatim}
\noindent Note that the stem \gf{eat} is declared intransitive and transitive without duplication in the morph or lexicon files.
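To give a feel for the idea, a \texttt{dict.xml} entry for \gf{eat} might group its word forms and list its two families along the following lines; the element and attribute names here are hypothetical, so consult \texttt{dict.xsd} and the \texttt{cem-english} grammar for the actual schema:
\begin{verbatim}
<entry stem="eat" pos="V">
  <member-of family="iv"/>
  <member-of family="tv"/>
  <word form="eat"/>
  <word form="eats" macros="@sg @3rd"/>
  <word form="ate" macros="@past"/>
</entry>
\end{verbatim}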
If you run \texttt{ccg-build} as follows,
\begin{verbatim}
cem-english> ccg-build grammar
Buildfile: build.xml

init:
     [echo] ----------- OpenCCG ------------

grammar:
     [echo] Adding family members from dict.xml to lexicon-base.xml,
            yielding lexicon.xml
     [echo] Extracting morph items from dict.xml to morph.xml
     [echo] Validating grammar.xml, lexicon.xml, morph.xml,
            rules.xml and types.xml

BUILD SUCCESSFUL
Total time: 5 seconds
\end{verbatim}
\noindent there will be two lexical assignments for every word form of \gf{eat}, one for its intransitive use, and one for transitive.

\subsection{Reducing redundancy with XSLT}

\href{http://www.w3.org/Style/XSL/}{XSLT} is a language for transforming XML documents. Two XSLT transformations, \texttt{add-family-members.xsl} and \texttt{extract-morph.xsl}, are used by \texttt{ccg-build} to handle \texttt{dict.xml} files. You can also use XSLT transformations to reduce redundancy in the lexico-grammar specifications. For example, the \texttt{worldcup} grammar illustrates a couple of ways of using XSLT to improve the specification of lexical families. With this grammar, the \texttt{lexicon-base.xml} file is generated from an XSLT transformation, called \texttt{lexicon-base.xsl}. This file begins with the definition of variables for the various atomic categories used in later complex category and family definitions. The variable names follow the format
\begin{verbatim}
<label>(.<id>)?(.from-<inheritsFrom id>)?(.<index var>?)(.<feat>)*
\end{verbatim}
\noindent i.e., the category label, followed optionally by the \texttt{id}, the \texttt{inheritsFrom} id, the \texttt{index} variable, and any further feature descriptions. This convention allows one to see what atomic categories are already in use, and to determine the contents of an atomic category at a glance. For example, \texttt{np.3.Y.acc} is the name of the category with label \texttt{np}, id \texttt{3}, index \texttt{Y}, and the case value \texttt{acc}; \texttt{np.2.X.default} is similar, but has default variables for all features other than the \texttt{index}. Once variables have been declared, they can be referenced further on using \texttt{xsl:copy-of} statements. For example, the variable \texttt{np.2.X.default} is referenced seven times in \texttt{lexicon-base.xsl}. In this way, if a change to \texttt{np.2.X.default} is desired, it can be made in one place in the file, rather than seven. Another XSLT mechanism employed in \texttt{lexicon-base.xsl} is a named template called \texttt{extend}, which serves to append one element as the last child of another. This mechanism is used to associate logical forms with syntactic categories, as well as to create new categories from existing ones. For example, in
\begin{small}
\begin{verbatim}
\end{verbatim}
\end{small}
\noindent the category for a subject type-raised \emph{wh}-determiner, like \gf{which}, is created by extending the category of a subject type-raised \emph{wh}--noun phrase (e.g.\ \gf{who}) with an extra nominal argument, \texttt{fslash-n.2.X} (declared earlier as a forward slash plus a category with label \texttt{n}, id \texttt{2} and index \texttt{X}).\footnote{The \texttt{xalan:nodeset} function makes it possible to use a variable as a parameter to a named template; its use won't be necessary in future versions of XSLT.} In this way, any changes made to the category for \emph{wh}--noun phrases will carry over to the category for \emph{wh}-determiners.
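To sketch how the variable-and-copy mechanism works, a fragment along the following lines declares a category once under its conventional name and then reuses it by reference; the feature structure inside the variable is abbreviated and purely illustrative, while \texttt{xsl:variable} and \texttt{xsl:copy-of} themselves are standard XSLT:
\begin{small}
\begin{verbatim}
<!-- declare the category once, under its conventional name -->
<xsl:variable name="np.2.X.default">
  <atomcat type="np">
    <fs id="2">
      <feat attr="index"><lf><nomvar name="X"/></lf></feat>
    </fs>
  </atomcat>
</xsl:variable>

<!-- ... and reuse it wherever the category is needed -->
<xsl:copy-of select="$np.2.X.default"/>
\end{verbatim}
\end{small}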
As XSLT is a powerful and extensible XML transformation language, there are many further possibilities for using it in grammar development---limited only by your imagination (and hacking ability). %% NB: Could eventually add lexical rule example. %% NB: Should add discussion of chunking rules. %% NB: Should add discussion of \textsl{licensing-features} %% ===================================================================== %% BIBLIOGRAPHY %% ===================================================================== \addcontentsline{toc}{section}{References} \bibliographystyle{alpha} \bibliography{openccg} \end{document} ================================================ FILE: docs/guide/openccg.bib ================================================ @string{nllt="Natural Language and Linguistic Theory"} @phdthesis{Baldridge:2002, author = {Baldridge, Jason}, title = {Lexically Specified Derivational Control in {C}ombinatory {C}ategorial {G}rammar}, school = {School of Informatics, University of Edinburgh}, year = {2002}, note={Available from \url{http://homepages.inf.ed.ac.uk/jbaldrid/dissertation/}} } % address = {Edinburgh, Scotland}, @inproceedings{Baldridge/Kruijff:2002, author={Baldridge, Jason and Kruijff, Geert-Jan M.}, title={Coupling {CCG} and Hybrid Logic Dependency Semantics}, booktitle={Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL 2002)}, year = {2002}, address = {Philadelphia, Pennsylvania} } @inproceedings{Baldridge/Kruijff:2003, author={Baldridge, Jason and Kruijff, Geert-Jan M.}, title={Multi-Modal Combinatory Categorial Grammar}, booktitle={Proceedings of the 10th Conference of the European Chapter of the Association for Computational Linguistics (EACL 2003)}, year = {2003}, address = {Budapest, Hungary}} } @phdthesis{Kruijff:2001, author={Kruijff, Geert-Jan M.}, title={A Categorial-Modal Logical Architecture of Informativity: Dependency Grammar Logic \& Information Structure}, school={Faculty of Mathematics and Physics, Charles University}, address={Prague, Czech Republic}, year={2001}} @book{Steedman:SynProc, author = {Mark Steedman}, title = {The Syntactic Process}, publisher = {The MIT Press}, address="Cambridge Mass.", year = {2000}, } @inproceedings{White/Baldridge:2003, author = {White, Michael and Baldridge, Jason}, title = {Adapting Chart Realization to {CCG}}, booktitle = {Proceedings of the Ninth European Workshop on Natural Language Generation}, year = {2003}, address = {Budapest, Hungary}} } @Article{White-RLAC:2004, author = {Michael White}, year = 2004, title = "{Efficient Realization of Coordinate Structures in Combinatory Categorial Grammar}", journal = {Research on Language and Computation}, note = {To appear} } @InProceedings{White-INLG:2004, author = {Michael White}, year = 2004, title = "{Reining in CCG Chart Realization}", booktitle = {Proceedings of the Third International Conference on Natural Language Generation, INLG-04} } @InProceedings{White-ACLSoft:2005, author = {Michael White}, year = 2005, title = "Designing an Extensible {API} for Integrating Language Modeling and Realization", booktitle = {Proc.\ ACL-05 Workshop on Software} } @mastersthesis{erkanms03, author = {G{\"u}ne{\c{s}} Erkan}, title = {A Type System for {CCG}}, school = {Middle East Technical University, Ankara}, note={Available from \url{http://www.LcsL.metu.edu.tr/ftp/theses/erkan-ms-03.pdf.gz}}, year = {2003}, } @unpublished{bozsahinsteedman03, author="Cem Bozsahin and Mark Steedman", title="Lexicalized Asymmetry and Syntactic Projection", 
year="2005", note="{Manuscript}, University of {E}dinburgh" } @unpublished{Steedman/Baldridge:2003, author = {Mark Steedman and Jason Baldridge}, title = {Combinatory {C}ategorial {G}rammar}, note = {Tutorial paper, available from \url{http://homepages.inf.ed.ac.uk/jbaldrid/ccg.pdf}}, year = {2003}, } @article{bozsahin02cl, author="Cem Bozsahin", title= "The Combinatory Morphemic Lexicon", journal= "Computational Linguistics", volume=28, number=2, pages="145--176", year="2002" } @article{bresnanmchombo95, author="Joan Bresnan and Sam A. Mchombo", title= "The {L}exical {I}ntegrity {P}rinciple: Evidence from {B}antu", journal= nllt, volume="13", pages="181--254", year="1995" } ================================================ FILE: docs/guide/openccg.sty ================================================ %% %% This style file contains a minimal set of commands for %% OpenCCG categories and logical forms, plus derivations. %% It borrows from earlier style files by Gann, Jason and Geert-Jan, %% and from Beryl and Mark for the derivations. %% \newcommand{\bs}{\backslash} % backslash, to save typing \newcommand{\gf}[1]{\textsf{\textsl{#1}}} % gloss font, for words \newcommand{\cf}[1]{\ensuremath{\mathsf{#1}}} % category font \newcommand{\fb}[1]{\ensuremath{_{\mathit{#1}}}} % features, subscripted \newcommand{\fsb}[2]{\ensuremath{_{\langle#1\rangle\mathit{#2}}}} % fs index and features, subscripted \newcommand{\C}[1]{\textbf{#1}} % concept font \newcommand{\con}[1]{\ensuremath{\mathrm{#1}}} % constant font, for sem feature values or sem sorts \newcommand{\modp}[1]{\ensuremath{\langle}\textsc{#1}\ensuremath{\rangle}} % sem relation/feature % CCG derivations % arguments: #1 = no. of words, #2 = body % Carsten suggests removing @{}, in order to eliminate a gap on the % left end of lines in the derivation \newcommand{\deriv}[2] { \renewcommand{\arraystretch}{.5} $\begin{array}[t]{*{#1}{c}} #2 \end{array}$ } % centered multicolumn (NB: changed \mc to \cmc to avoid conflict with kluwer.cls) \newcommand{\cmc}[2]{\multicolumn{#1}{c}{#2}} % Rules, argument #1 gives the number of columns to cover. 
\newcommand{\uline}[1] {\cmc{#1}{\hrulefill} } \newcommand{\fapply}[1] { \cmc{#1}{\hrulefill_{>}} } \newcommand{\bapply}[1] { \cmc{#1}{\hrulefill_{<}} } \newcommand{\fcomp}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}}}} \newcommand{\fxcomp}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}_{\times}}}} \newcommand{\fxcompN}[2] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}^{#2}_{\times}}}} \newcommand{\fcomptwo}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}^2}}} \newcommand{\fxcomptwo}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}_{\times}^2}}} \newcommand{\fcompthree}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}^3}}} \newcommand{\fxcompthree}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}_{\times}^3}}} \newcommand{\bcomp}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}}}} \newcommand{\bxcomp}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}_{\times}}}} \newcommand{\bxcompN}[2] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}^{#2}_{\times}}}} \newcommand{\bcomptwo}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}^2}}} \newcommand{\bxcomptwo}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}_{\times}^2}}} \newcommand{\bcompthree}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}^3}}} \newcommand{\bxcompthree}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}_{\times}^3}}} \newcommand{\fsubst}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{S}}}} \newcommand{\bsubst}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{S}}}} \newcommand{\fxsubst}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{S}_{\times}}}} \newcommand{\bxsubst}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{S}_{\times}}}} \newcommand{\ftype}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{T}}}} \newcommand{\btype}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{T}}}} \newcommand{\conj}[1] { \cmc{#1}{\hrulefill_{{<}\Phi{>}}}} \newcommand{\boundary}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{\%}}}} \newcommand{\asterisk}[1] { \cmc{#1}{\hrulefill_{\mathbf{*}}}} \newcommand{\comb}[2] % an arbitrary combinator { \cmc{#1}{\hrulefill_{#2}}} \newcommand{\badcomb}[2] % an inappropriate use of a combinator { \cmc{#1}{\hrulefill_{\mbox{ *** }}\hrulefill_{#2}}} \newcommand{\dcomp}[2] % an arbitrary dcomposition { \cmc{#1}{\dotfill_{#2}}} \newcommand{\unfreeze}[1] {\cmc{#1}{\hrulefill_{\mathbf{@}}} } \newcommand{\ul}{\uline{1}} \newcounter{CCG@counter} \newcommand{\CCG@amp}{&} \newcommand{\ulines}[1]{ \uline{1} \setcounter{CCG@counter}{1} \whiledo{\value{CCG@counter} < #1} { \CCG@amp \uline{1} \stepcounter{CCG@counter} } } ================================================ FILE: docs/index.html ================================================ The OpenCCG Homepage

OpenCCG: The OpenNLP CCG Library


 
 

OpenCCG, the OpenNLP CCG Library, is an open source natural language processing library written in Java, which provides parsing and realization services based on Mark Steedman's Combinatory Categorial Grammar (CCG) formalism.

The library makes use of multi-modal extensions to CCG developed by Jason Baldridge as part of the Grok system (the precursor to OpenCCG). These extensions are described in Jason's dissertation and in a joint EACL-03 paper with Geert-Jan Kruijff.

Subsequent development efforts, led by Michael White, have focused on making the realizer practical to use in dialogue systems, and more recently, on realization with broad coverage grammars. See the papers on Mike's web page for details. Since version 0.9.4, OpenCCG has included broad coverage English parsing and realization support that together make it possible to experiment with open domain grammatical paraphrasing. Version 0.9.5 adds features for dependency ordering and dependency length minimization in realization, as in White and Rajkumar (2012), along with support for using 5-gram gigaword language models with KenLM, and creating disjunctive logical forms based on the differences between aligned semantic graphs, as in Martin and White (2011). It also includes ccg2jsgf, an extension developed for Knexus Research Corporation for compiling an OpenCCG grammar into a context-free grammar in the Java Speech Grammar Format used by the Sphinx speech recognizer, now released open source. (new!)

Also, Jason Baldridge and students at UT Austin have developed DotCCG, a new format for specifying OpenCCG grammars, and VisCCG, an editor and visualizer for grammars written in DotCCG format. These developments are described in Baldridge, Chatterjee, Palmer and Wing (2007). See the UT Austin computational linguistics lab's OpenCCG wiki, which has a number of tutorials and example grammars for DotCCG and VisCCG.

OpenCCG has been and is being used for a number of dialog systems: see the list of projects using OpenCCG. Please get in touch with Jason if you would like to add yours.

For the latest news about OpenCCG, check out the SourceForge project page.

Further development of OpenCCG has moved to GitHub, where you can get the very latest code on branch master; releases and binaries will continue to be posted on SourceForge.


Email: mwhite at (no spam please) ling dot osu dot edu
2015 March 16


 
================================================ FILE: docs/maxent.cpp.patch ================================================ --- /home/dnm/maxent/maxent-20061005/src/maxent.cpp 2009-05-08 23:57:52.000000000 -0400 +++ /home/dnm/maxentfixes/maxentpatched.cpp 2009-05-08 23:57:47.000000000 -0400 @@ -154,6 +154,9 @@ const char* q = it->second; while (p < q && *p != ':') ++p; + // fixes the case where ':' is in the feature string (D.N. Mehay) + if((p+1) < q && *++p != ':') + --p; if (p == q) return false; context.push_back(make_pair(string(it->first, p - it->first), ================================================ FILE: docs/realizer/build.xml ================================================ ================================================ FILE: docs/realizer/cgloss4e.sty ================================================ % -*- LaTeX -*- % Following borrows from Covington's style files inspired by Midnight by M. % de Groot, adapted to be used with gb4e.sty: examples beginning with \ex can % contain glosses directly. Default is % Linguistic Inquiry style with all lines in \rm; to change a line (eg. to % \it for a particular journal, change the appropriate line: e.g., % \let\eachwordone=\rm in a copy of this file. Note that it will NOT work % to put \it before the line as the words are parsed separately. % Use \singlegloss to force single-spaced glosses even in double-space % environments. Works also in footnotes (^M as delimiter replaced by % \\)---hpk % %%% %%% Sentences with word-by-word glosses %%% % See covingtn.tex for full documentation. Some examples: % % Displayed sentence with gloss and translation: % % \gll Dit is een Nederlands voorbeeld.\\ % This is a Dutch example.\\ % \glt `This is an example in Dutch.' % % Same, using bracketing where words do not correspond one-to-one: % % \gll Dit is een voorbeeldje in het Nederlands.\\ % This is a {little example} in {} Dutch.\\ % \glt `This is a little example in Dutch.' % % If you want to align 3 lines rather than two, use \glll instead of \gll. % % Layout is critical between \gll (or \glll) and \glt (or \gln). % % Thanks to Marcel R. van der Goot for permission to reproduce code. \let\@gsingle=1 \def\singlegloss{\let\@gsingle=1} \def\nosinglegloss{\let\@gsingle=0} \@ifundefined{new@fontshape}% {\def\@selfnt{\ifx\@currsize\normalsize\@normalsize\else\@currsize\fi}} {\def\@selfnt{\selectfont}} \def\gll% % Introduces 2-line text-and-gloss. {\begin{flushleft} \ifx\@gsingle1% conditionally force single spacing (hpk/MC) \vskip\baselineskip\def\baselinestretch{1}% \@selfnt\vskip-\baselineskip\fi% \bgroup \twosent } \def\glll% % Introduces 3-line text-and-gloss. {\begin{flushleft} \ifx\@gsingle1% conditionally force single spacing (hpk/MC) \vskip\baselineskip\def\baselinestretch{1}% \@selfnt\vskip-\baselineskip\fi% \bgroup \threesent } \def\glt{\vskip.17\baselineskip} % Introduces a translation \let\trans\glt \def\glend{} % obsolete % Ends the gloss environment. % The following TeX code is adapted, with permission, from: % gloss.tex: Macros for vertically aligning words in consecutive sentences. % Version: 1.0 release: 26 November 1990 % Copyright (c) 1991 Marcel R. van der Goot (marcel@cs.caltech.edu). % Original Midnight/gloss.tex and Midnight/gloss.doc are available from % csvax.cs.caltech.edu [131.215.131.131] in pub/tex % and many other anonymous ftp archives. 
\newbox\lineone% boxes with words from first line \newbox\linetwo% \newbox\linethree% \newbox\wordone% a word from the first line (hbox) \newbox\wordtwo% \newbox\wordthree% \newbox\gline% the constructed double line (hbox) \newskip\glossglue% extra glue between glossed pairs or triples \glossglue = 0pt plus 2pt minus 1pt % allow stretch/shrink between words %\glossglue = 5pt plus 2pt minus 1pt % allow stretch/shrink between words \newif\ifnotdone \@ifundefined{eachwordone}{\let\eachwordone=\rm}{\relax} \@ifundefined{eachwordtwo}{\let\eachwordtwo=\rm}{\relax} \@ifundefined{eachwordthree}{\let\eachwordthree=\rm}{\relax} \def\lastword#1#2#3% #1 = \each, #2 = line box, #3 = word box {\setbox#2=\vbox{\unvbox#2% \global\setbox#3=\lastbox% }% \ifvoid#3\global\setbox#3=\hbox{#1\strut{} }\fi % extra space following \strut in case #1 needs a space } \def\testdone {\ifdim\ht\lineone=0pt \ifdim\ht\linetwo=0pt \notdonefalse % tricky space after pt \else\notdonetrue \fi \else\notdonetrue \fi } \gdef\getwords(#1,#2)#3 #4\\% #1=linebox, #2=\each, #3=1st word, #4=remainder {\setbox#1=\vbox{\hbox{#2\strut#3 }% adds space \unvbox#1% }% \def\more{#4}% \ifx\more\empty\let\more=\donewords \else\let\more=\getwords \fi \more(#1,#2)#4\\% } \gdef\donewords(#1,#2)\\{}% \gdef\twosent#1\\ #2\\{% #1 = first line, #2 = second line \getwords(\lineone,\eachwordone)#1 \\% \getwords(\linetwo,\eachwordtwo)#2 \\% \loop\lastword{\eachwordone}{\lineone}{\wordone}% \lastword{\eachwordtwo}{\linetwo}{\wordtwo}% \global\setbox\gline=\hbox{\unhbox\gline \hskip\glossglue \vtop{\box\wordone % vtop was vbox \nointerlineskip \box\wordtwo }% }% \testdone \ifnotdone \repeat \egroup % matches \bgroup in \gloss \gl@stop} \gdef\threesent#1\\ #2\\ #3\\{% #1 = first line, #2 = second line, #3 = third \getwords(\lineone,\eachwordone)#1 \\% \getwords(\linetwo,\eachwordtwo)#2 \\% \getwords(\linethree,\eachwordthree)#3 \\% \loop\lastword{\eachwordone}{\lineone}{\wordone}% \lastword{\eachwordtwo}{\linetwo}{\wordtwo}% \lastword{\eachwordthree}{\linethree}{\wordthree}% \global\setbox\gline=\hbox{\unhbox\gline \hskip\glossglue \vtop{\box\wordone % vtop was vbox \nointerlineskip \box\wordtwo \nointerlineskip \box\wordthree }% }% \testdone \ifnotdone \repeat \egroup % matches \bgroup in \gloss \gl@stop} \def\gl@stop{{\hskip -\glossglue}\unhbox\gline\end{flushleft}} \endinput ================================================ FILE: docs/realizer/gb4e.sty ================================================ % -*- LaTeX -*- \def\gbVersion{4e} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Version 4export (= v. 4 minus the compatibility code) % Based on hpk's gb.sty, revised for GM syllabus by ct, % and incorporating macros adapted from J.Frampton, M. de Groot en M. % Covington. Full documentation soon to come in gb4doc.tex. Bug-reports % and suggestions for improvements, other used features, please! % % Notes: % % Various styles for X-bar levels; can be changed, but note that {picture} % environements (e.g. trees) will then come out wrong and have to be % fixed % % This file allows _ and ^ to be used in ordinary text, hence must be % loaded AFTER any file that uses them in their TeX meaning. Hence % cgloss(n).sty is loaded early in this file. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%% % Format of examples: % %%%%%%%%%%%%%%%%%%%%%%%% % \begin{exe} or \exbegin % (arab.) % \begin{xlist} or \xlist % (1st embedding, alph.) % \begin{xlisti} or \xlisti % (2st embedding, rom.) 
% \end{xlisti} or \endxlisti % % \end{xlist} or \endxlist % % \end{exe} or \exend % % Other sublist-styles: xlistA (Alph.), xlistI (Rom.), xlistn (arab) % % \ex (produces Number) % \ex (numbered example) % \ex[jdgmt]{sentence} (numbered example with judgement) % % \exi{ident} (produces identifier) % \exi{ident} (example numbered with identifier) % \exi{ident}[jdgmt]{sentence} (dito with judgement) % (\exr, \exp and \sn are defined in terms of \exi) % % \exr{label} (produces cross-referenced Num.) % \exr{label} (cross-referenced example) % \exr{label}[jdgmt]{sentence} (cross-referenced example with judgement) % % \exp{label} (same as % \exp{label} \exr but % \exp{label}[jdgmt]{sentence} with prime) % % \sn (unnumbered example) % \sn[jdgmt]{sentence} (unnumbered example with judgement) % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \@ifundefined{new@fontshape}{\def\reset@font{}\let\mathrm\rm}{} \let\prmbrs=0 \def\primebars{\let\prmbrs=1} \def\obar#1{\ifmmode#1^{0}\else#1$^{0}$\fi} %% FIX \def\mbar#1{\ifmmode#1^{\mathrm{max}}\else#1$^{\mathrm{max}}$\fi} \def\ibar#1{\ifx\prmbrs0% \ifmmode\overline{\mathrm{#1}}\else$\overline{\mbox{#1}}$\fi% \else\ifmmode#1^{'}\else#1$^{'}$\fi\fi} \def\iibar#1{\ifx\prmbrs0% \ifmmode\overline{\overline{\mathrm{#1}}}% \else$\overline{\overline{\mbox{#1}}}$\fi% \else #1P\fi} \def\th{\ifmmode\theta\else$\theta$\fi} \def\al{\ifmmode\alpha\else$\alpha$\fi} \def\be{\ifmmode\beta\else$\beta$\fi} \def\ga{\ifmmode\gamma\else$\gamma$\fi} \def\de{\ifmmode\delta\else$\delta$\fi} \def\spec#1{[Spec,#1]} %Def. of "Specifier of #1" \def\ct#1{{\em #1\/}} %Citation of linguistic material with alternative style: %\def\ct#1{`#1'} \def\tx{\bf} %Introduction of technical terms with alternative style: %\def\tx{\em} \input{cgloss\gbVersion.sty} %%% NEWSTUFF: %\newcommand{\indexgroupmark}[1]{\item{\bf #1}} % ?? -CT % this allows _ to be used in horizontal mode (from J.Frampton): % \catcode`_=\active % \def_#1{\ifmmode\mit{\sb{#1}}\else${}\sb{#1}$\fi} % \catcode`^=\active % \def^#1{\ifmmode\mit{\sp{#1}}\else${}\sp{#1}$\fi} % \def\lb#1{\@ifnextchar [{\@glarph{#1}}{\@bl{#1}}} %\def\@glarph#1[#2]{\ifmmode{[}\sb{{\mathrm{#1}}\sb{#2}}\else% % ${[}\sb{{\mathrm{#1}}\sb{#2}}$\fi} % \def\@bl#1{\ifmmode{[}\sb{\mathrm{#1}}\;\else${[}\sb{\mathrm{#1}}\;$\fi} % \def\rb#1{\@ifnextchar [{\@grarph{#1}}{\@br{#1}}} %\def\@grarph#1[#2]{\ifmmode{]}\sb{{\mathrm{#1}}\sb{#2}}\else% % ${]}\sb{{\mathrm{#1}}\sb{#2}}$\fi} % \def\@br#1{\ifmmode{]}\sb{\mathrm{#1}}\;\else${]}\sb{\mathrm{#1}}\;$\fi} %%% END_NEWSTUFF. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Macros for examples, roughly following Linguistic Inquiry style. % % From here on best not to tamper, else all the examples and cross- % % references will come out scrambled! (see also note below) - CT % % Completely rewritten for more robustness and flexibility. 
(hpk) % %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \def\qlist{\begin{list}{\Alph{xnum}.}{\usecounter{xnum}% \setlength{\rightmargin}{\leftmargin}}} \def\endqlist{\end{list}} \newif\if@noftnote\@noftnotetrue \newif\if@xrec\@xrecfalse \@definecounter{fnx} %%%% adapted from latex.tex to get examples in footnotes right \long\def\@footnotetext#1{% \@noftnotefalse\setcounter{fnx}{0}% \insert\footins{\reset@font\footnotesize \interlinepenalty\interfootnotelinepenalty \splittopskip\footnotesep \splitmaxdepth \dp\strutbox \floatingpenalty \@MM \hsize\columnwidth \@parboxrestore \edef\@currentlabel{\csname p@footnote\endcsname\@thefnmark}\@makefntext {\rule{\z@}{\footnotesep}\ignorespaces #1\strut}}\@noftnotetrue} \newcount\@xnumdepth \@xnumdepth = 0 \@definecounter{xnumi} \@definecounter{xnumii} \@definecounter{xnumiii} \@definecounter{xnumiv} \@definecounter{exx} \setcounter{exx}{0} \def\thexnumi{\@xsi{xnumi}} \def\thexnumii{\@xsii{xnumii}} \def\thexnumiii{\@xsiii{xnumiii}} \def\thexnumiv{\@xsiv{xnumiv}} \def\p@xnumii{\thexnumi} \def\p@xnumiii{\thexnumi\thexnumii-} \def\p@xnumiv{\thexnumi\thexnumii-\thexnumiii-} \def\xs@default#1{\csname @@xs#1\endcsname} \def\@@xsi{\let\@xsi\arabic} \def\@@xsii{\let\@xsii\alph} \def\@@xsiii{\let\@xsiii\roman} \def\@@xsiv{\let\@xsi\arabic} \@definecounter{rxnumi} \@definecounter{rxnumii} \@definecounter{rxnumiii} \@definecounter{rxnumiv} \def\save@counters{% \setcounter{rxnumi}{\value{xnumi}}% \setcounter{rxnumii}{\value{xnumii}}% \setcounter{rxnumiii}{\value{xnumiii}}% \setcounter{rxnumiv}{\value{xnumiv}}}% \def\reset@counters{% \setcounter{xnumi}{\value{rxnumi}}% \setcounter{xnumii}{\value{rxnumii}}% \setcounter{xnumiii}{\value{rxnumiii}}% \setcounter{xnumiv}{\value{rxnumiv}}}% \def\exewidth#1{\def\@exwidth{#1}} \exewidth{(234)} \def\exe{\@ifnextchar [{\@exe}{\@exe[\@exwidth]}} \def\@exe[#1]{\ifnum \@xnumdepth >0% \if@xrec\@exrecwarn\fi% \if@noftnote\@exrecwarn\fi% \@xnumdepth0\@listdepth0\@xrectrue% \save@counters% \fi% \advance\@xnumdepth \@ne \@@xsi% \begin{list}{(\thexnumi)}% {\usecounter{xnumi}\@subex{#1}{1em}% \if@noftnote% \setcounter{xnumi}{\value{exx}}% \else% \setcounter{xnumi}{\value{fnx}}% \fi}} \def\endexe{\if@noftnote\setcounter{exx}{\value{xnumi}}% \else\setcounter{fnx}{\value{xnumi}}% \reset@counters\@xrecfalse\fi\end{list}} \def\@exrecwarn{\typeout{*** Recursion on "exe"---your example numbering will probably be screwed up!}} \def\xlist{\@ifnextchar [{\@xlist{}}{\@xlist{}[iv.]}} \def\xlista{\@ifnextchar [{\@xlist{\alph}}{\@xlist{\alph}[m.]}} \def\xlisti{\@ifnextchar [{\@xlist{\roman}}{\@xlist{\roman}[iv.]}} \def\xlistn{\@ifnextchar [{\@xlist{\arabic}}{\@xlist{\arabic}[9.]}} \def\xlistA{\@ifnextchar [{\@xlist{\Alph}}{\@xlist{\Alph}[M.]}} \def\xlistI{\@ifnextchar [{\@xlist{\Roman}}{\@xlist{\Roman}[IV.]}} \def\endxlist{\end{list}} \def\endxlista{\end{list}} \def\endxlistn{\end{list}} \def\endxlistA{\end{list}} \def\endxlistI{\end{list}} \def\endxlisti{\end{list}} %%% a generic sublist-styler \def\@xlist#1[#2]{\ifnum \@xnumdepth >3 \@toodeep\else% \advance\@xnumdepth \@ne% \edef\@xnumctr{xnum\romannumeral\the\@xnumdepth}% \def\@bla{#1} \ifx\@bla\empty\xs@default{\romannumeral\the\@xnumdepth}\else% \expandafter\let\csname @xs\romannumeral\the\@xnumdepth\endcsname#1\fi \begin{list}{\csname the\@xnumctr\endcsname.}% {\usecounter{\@xnumctr}\@subex{#2}{1.5ex}}\fi} \def\@subex#1#2{\settowidth{\labelwidth}{#1}\itemindent\z@\labelsep#2% \ifnum\the\@xnumdepth=1\topsep 7\p@ plus2\p@ minus3\p@\else% \topsep 2\p@ plus2\p@\fi\parsep 
2\p@ plus\p@ minus\p@% \itemsep \parsep\leftmargin\labelwidth\advance\leftmargin#2\relax} %%% the example-items \def\ex{\@ifnextchar [{\@ex}{\item}} \def\@ex[#1]#2{\item\@exj[#1]{#2}} \def\@exj[#1]#2{\@exjbg{#1} #2 \end{list}} \def\exi#1{\item[#1]\@ifnextchar [{\@exj}{}} \def\judgewidth#1{\def\@jwidth{#1}} \judgewidth{??} \def\@exjbg#1{\begin{list}{#1}{\@subex{\@jwidth}{.5ex}}\item} \def\exr#1{\exi{{(\ref{#1})}}} \def\exp#1{\exi{{(\ref{#1}$'$)}}} \def\sn{\exi{}} \def\bu{\item[$\bullet$]} %%%%%%% \lcomment for breaks in (example-)lists (leaves all counters %%%%%%% as they are) (hpk) \newlength{\lcommentsep} \lcommentsep = 1ex \long\def\lcomment#1% {\vspace{\lcommentsep} \item[]\hspace*{-\leftmargin}% \@tempskipa=\linewidth% \addtolength{\@tempskipa}{\rightmargin}% \addtolength{\@tempskipa}{\leftmargin}% \parbox{\@tempskipa}{#1}% \vspace{\lcommentsep}% } %%%%%% control the alignment of exampleno. and (picture-)example %%%%%% (by Lex Holt ). \def\attop#1{\leavevmode\vtop{\strut\vskip-\baselineskip\vbox{#1}}} \def\atcenter#1{$\vcenter{#1}$} %%%%%% %-------------------Move Arrows (from J.Frampton): \def\leaderfill{\leaders\hrule\hfil} \def\pointerup{\hbox to 0pt{\hss \vbox{\offinterlineskip\vskip-1pt\hbox{\elevenex\char'170}\null}\hss}} \def\pointerdown{\hbox to 0pt{\hss \vtop{\offinterlineskip\null\hbox{\elevenex\char'171}\vskip-1pt}\hss}} \let\pu=\pointerup \let\pd=\pointerdown \let\lf=\leaderfill \def\spacer{\hskip4.5pt} \def\fillright#1{\hfil#1\leaderfill} \def\fillleft#1{\leaderfill#1\hfil} % Changed spelling to \centr, else conflicts with LaTeX \center{} -CT \def\centr#1{\leaderfill#1\leaderfill} \def\link#1{\multispan#1\leaderfill} \def\arrowalign#1{\vtop{\baselineskip=0pt \lineskiplimit=0pt \lineskip=2pt \halign{&##\cr#1}}} %\font\elevenex=cmex10 scaled\magstephalf % just for the arrow! %PS: this may not work on some installations, not sure why. 
CT %%PPS: (e.g., PCTeX, but it works find works fine with EmTeX) %----------------END Move Arrows \def\pijl{$\rightarrow$\ } % Special accents for Vata & Gbadi; Navajo coming soon, I hope...: %\def\bb#1{$\mathrm{\overline{#1}}$} Following looks better: \def\bb#1{\ifmmode\overline{\mathrm{#1}}\else$\bar{\mathrm{#1}}$\fi} \def\boven#1#2{\raisebox{-0.2pt}{$\stackrel{#1}{\mathrm{#2}}$}} \def\bovenop#1#2{\raisebox{-0.06ex}[0ex][0ex]{$\stackrel{#1}{\mathrm{#2}}$}} \def\vl{\rule{0.05em}{0.30em}} \def\|#1{\ifmmode\vert#1\else\bovenop{\vl}{#1}\fi} ================================================ FILE: docs/realizer/manual.tex ================================================ %% %% nb: use pdflatex to create pdf file with hyperlinks %% %% ===================================================================== %% DOCUMENT DATA %% ===================================================================== \documentclass[11pt]{article} \title{OpenCCG Realizer Manual} \author{Michael White} %% ===================================================================== %% PACKAGES %% ===================================================================== \usepackage{openccg} % for hlds/ccg \usepackage{graphicx} % for figs \usepackage{gb4e} % for examples %\usepackage{cgmacros,hylo,ccg} % for hlds/ccg \usepackage[ colorlinks=true, linkcolor=blue, citecolor=blue, urlcolor=blue, pdfstartview=FitH, pdftitle={OpenCCG Realizer Manual}, pdfauthor={Michael White} ]{hyperref} %\usepackage{mathptmx} %% listing settings %% nb: not crazy about font, esp that bold not working with keywords \usepackage{listings,color} \lstset{language=Java,basicstyle=\ttfamily\footnotesize,keywordstyle=\underline,commentstyle=\itshape\color{blue}} %basicstyle=\ttfamily\small %% ===================================================================== %% NEW COMMANDS %% ===================================================================== %\newcommand{\occg}{\textsf{OpenCCG}} \newcommand{\occg}{OpenCCG} \newcommand{\tccg}{\textsf{tccg}} \newcommand{\ccgrz}{\textsf{ccg-realize}} \newcommand{\ccgtest}{\textsf{ccg-test}} \newcommand{\code}[1]{\texttt{#1}} %\small \newcommand{\eref}[2][]{(\ref{ex:#2}#1)} % ref to examples \newcommand{\secref}[1]{Section~\ref{sec:#1}} % ref to sections \newcommand{\figref}[1]{Figure~\ref{fig:#1}} % ref to figures \newlength{\mytablen} % for indenting in terms \newcommand{\mytab}[1]{ \settowidth{\mytablen}{\ensuremath{#1}} \mbox{\hspace{\mytablen}} } \newcommand{\xor}{~\underline{\vee}~} \newcommand{\shared}[1]{\fbox{\ensuremath{#1}}} \newcommand{\alt}[1]{\mathsf{alt}_{#1}} \newcommand{\opt}[1]{\mathsf{opt}_{#1}} %% ===================================================================== %% DOCUMENT BODY %% ===================================================================== \begin{document} \thispagestyle{empty} \maketitle \tableofcontents %\listoftables \listoffigures \newpage %% to do: %% making an n-gram model %% DLFs \section{About this manual} This manual is a programmer's guide to using the \occg\ surface realizer in Java applications. You can download and install \occg\ from its website, \url{http://openccg.sourceforge.net}. Once you've unpacked the archive, have a look at the \texttt{README} file for installation instructions. For a brief introduction to writing grammars for \occg, see the ``rough guide'' in \texttt{docs/grammars-rough-guide.pdf}. 
\section{About the OpenCCG realizer} \label{overview} The OpenCCG realizer \cite{White/Baldridge:2003,White-RLAC:2004,White-INLG:2004,White-ACLSoft:2005} is an open source surface realizer for Steedman's \cite{Steedman-LI:2000,Steedman:SynProc} Combinatory Categorial Grammar (CCG), including the multi-modal extensions to CCG devised by Baldridge and Kruijff \cite{Baldridge:PhD,Baldridge/Kruijff:2003}. Like other chart realizers \cite{Kay:1996,Shemtov:PhD,Carroll-and-co:1999,Bob-Moore:2002}, the OpenCCG realizer takes as input a logical form specifying the propositional meaning of a sentence, and returns one or more surface strings that express this meaning according to the lexicon and grammar. A distinguishing feature of OpenCCG is that it implements a hybrid symbolic-statistical chart realization algorithm that combines (1) a theoretically grounded approach to syntax and semantic composition, with (2) the use of integrated language models for making choices among the options left open by the grammar, thereby reducing the need for hand-crafted rules. To allow language models to be combined in flexible ways---as well as to enable research on how to best combine language modeling and realization---OpenCCG's design includes an extensible API (application programming interface) that allows user-defined functions to be used for scoring partial realizations and for pruning low-scoring ones during the search. The design also includes classes for supporting a range of language models and typical ways of combining them. \begin{figure*}%[t]%[t]%[!h] \begin{center} \mbox{} \includegraphics[width=\textwidth]{realizer-class.pdf} \caption{High-level architecture of the OpenCCG realizer} \label{realizer-class} \end{center} \end{figure*} The UML class diagram in Figure~\ref{realizer-class} shows the high-level architecture of the OpenCCG realizer. A realizer instance is constructed with a reference to a CCG grammar (which supports both parsing and realization). The grammar's lexicon has methods for looking up lexical items via their surface forms (for parsing), or via the principal predicates or relations in their semantics (for realization). A grammar also has a set of hierarchically organized atomic types, which can serve as the values of features in the syntactic categories, or as ontological sorts for the discourse referents in the logical forms (LFs). Lexical lookup yields lexical signs. A sign pairs a list of words with a category, which itself pairs a syntactic category with a logical form. Lexical signs are combined into derived signs using the rules in the grammar's rule group. Derived signs maintain a derivation history, and their word lists share structure with the word lists of their input signs. For generality, the realizer makes use of a configurable sign scorer and pruning strategy. A sign scorer implements a function that returns a number between 0 and 1 for an input sign. For example, a standard trigram language model can be used to implement a sign scorer, by returning the probability of a sign's words as its score. A pruning strategy implements a method for determining which edges to prune during the realizer's search. The input to the method is a ranked list of edges for signs that have equivalent categories (but different words); grouping edges in this way ensures that pruning cannot ``break'' the realizer, i.e.\ prevent it from finding some grammatical derivation when one exists. 
By default, an N-best pruning strategy is employed, which keeps the N highest scoring input edges, pruning the rest (where N is determined by the current preference settings). \begin{figure*}%[p]%[t]%[!h] \begin{center} % \mbox{} % \includegraphics{code/realize.pdf} \begin{lstlisting} // load grammar, instantiate realizer URL grammarURL = ...; Grammar grammar = new Grammar(grammarURL); Realizer realizer = new Realizer(grammar); // configure realizer with trigram backoff model // and 10-best pruning strategy realizer.signScorer = new StandardNgramModel(3, "lm.3bo"); realizer.pruningStrategy = new NBestPruningStrategy(10); // ... then, for each request: // get LF from input XML Document inputDoc = ...; LF lf = realizer.getLfFromDoc(inputDoc); // realize LF and get output words in XML Edge bestEdge = realizer.realize(lf); Document outputDoc = bestEdge.sign.getWordsInXml(); // return output ... outputDoc ...; \end{lstlisting} \caption{Example realizer usage} \label{realizer-usage} \end{center} \end{figure*} \section{Using the realizer} Sample Java code for using the realizer appears in Figure~\ref{realizer-usage}. The input is an XML document that contains an \code{lf} element either as the root or as a child of the root. To create a sample XML document with an acceptable format, you can use the \tccg\ tool's \code{:2xml } command. Note that the input XML document can be created in any way that is allowed by the JDOM API. For example, if the logical form is created by a Java XSLT-based sentence planner in the same process, the XSLT output can be captured in a JDOM document, and then simply passed by reference to the realizer. The output of the realizer is typically an XML document, as shown in the figure. In such documents, each word in the output sequence appears in its own element; additionally, any pitch accents and boundary tones appear in separate elements, and any expanded multi-words are indicated. Output documents of this kind can be easily processed into other formats using XSLT. If a simple string output suffices, the \code{Sign.getOrthography()} method can be used instead. The realization algorithm is implemented by the \code{realize(LF)} method. As in the chart realizers cited earlier, the algorithm makes use of a chart and an agenda to perform a bottom-up dynamic programming search for signs whose LFs completely cover the elementary predications in the input logical form. The algorithm's details and a worked example appear in \cite{White-RLAC:2004,White-INLG:2004}. To see a full realization trace, you can use \ccgrz\ to realize an LF stored in an XML file (e.g.\ one created using \tccg). As shown in Figure~\ref{realizer-usage}, the \code{realize(LF)} method returns the edge for the best realization of the input LF, as determined by the sign scorer. After a realization request, the N-best complete edges---or more generally, all the edges for complete realizations that survived pruning---are also available from the chart. To access these edges, you can invoke \code{realizer.getChart().bestEdges()}. The search for complete realizations proceeds in one of two modes, anytime and two-stage (packing/unpacking). In the anytime mode, a best-first search is performed with a configurable time limit (which may be a limit on how long to look for a better realization, after the first complete one is found). With this mode, the scores assigned by the sign scorer determine the order of the edges on the agenda, and thus have an impact on realization speed. 
In the two-stage mode, a packed forest of all possible realizations is created in the first stage; then in the second stage, the packed representation is unpacked in bottom-up fashion, with scores assigned to the edge for each sign as it is unpacked, much as in \cite{Langkilde:2000}. In both modes, the pruning strategy is invoked to determine whether to keep or prune newly constructed edges. For single-best output, the anytime mode can provide signficant time savings by cutting off the search early; see \cite{White-INLG:2004} for discussion. For N-best output---especially when a complete search (up to the edges that survive the pruning strategy) is desirable---the two-stage mode can be more efficient. \section{Scoring signs} The classes for implementing sign scorers appear in Figure~\ref{scorer-class}. In the diagram, classes for n-gram scoring appear towards the bottom, while classes for combining scorers appear on the left, and the class for avoiding repetition appears on the right. \begin{figure*}%[p]%[t]%[!h] \begin{center} \mbox{} \includegraphics[width=\textwidth]{scorer-class.pdf} \caption{Classes for scoring signs} \label{scorer-class} \end{center} \end{figure*} \subsection{Standard n-gram models} \label{standard-ngrams} The \code{Standard\-Ngram\-Model} class can load standard n-gram backoff models for scoring, as shown earlier in Figure~\ref{realizer-usage}. Such models can be constructed with the SRILM toolkit \cite{SRILM-ICSLP:2002}, as described in Section~\ref{using-srilm}; in principle, other toolkits could be used instead, as long as their output could be converted into the same file formats. Since the SRILM toolkit has more restrictive licensing conditions than those of OpenCCG's LGPL license, OpenCCG includes its own classes for scoring with n-gram models, in order to avoid any necessary runtime dependencies on the SRILM toolkit. The n-gram tables are efficiently stored in a trie data structure (as in the SRILM toolkit), thereby avoiding any arbitrary limit on the n-gram order. To save memory and speed up equality tests, each string is interned (replaced with a canonical instance) at load time, which accomplishes the same purpose as replacing the strings with integers, but without the need to maintain a separate mapping from integers back to strings. For better generalization, certain words may be dynamically replaced with the names of their semantic classes when looking up n-gram probabilities. Words are assigned to semantic classes in the lexicon, and the semantic classes to use in this way may be configured at the grammar level. Note that \cite{Oh/Rudnicky:2002} and \cite{Adwait:2002} make similar use of semantic classes in n-gram scoring, by deferring the instantiation of classes (such as \textit{departure city}) until the end of the generation process; our approach accomplishes the same goal in a slightly more flexible way, in that it also allows the specific word to be examined by other scoring models, if desired. As discussed in \cite{White-INLG:2004}, with dialogue systems like COMIC n-gram models can do an excellent job of placing underconstrained adjectival and adverbial modifiers---as well as boundary tones---without resorting to the more complex methods investigated for adjective ordering in \cite{Shaw/Hatzi:1999,Malouf:2000}. 
For instance, in examples like those in \eref{adv-placement}, they correctly select the preferred positions for \textit{here} and \textit{also} (as well as for the boundary tones), with respect to the verbal head and sister dependents: \begin{exe} %\small \ex \label{ex:adv-placement} \begin{xlist} \ex Here$_{L+H*}$ LH\% we have a design in the classic$_{H*}$ style LL\% . \ex This$_{L+H*}$ design LH\% here$_{L+H*}$ LH\% is also$_{H*}$ classic LL\% . \end{xlist} \end{exe} We have also found that it can be useful to use reverse (or ``right-to-left'') models, as they can help to place adverbs like \textit{though}, as in \eref{though}: \begin{exe} %\small \ex \label{ex:though} The tiles are also$_{H*}$ from the Jazz$_{H*}$ series though LL\% . \end{exe} \noindent In principle, the forward and reverse probabilities should be the same---as they are both derived via the chain rule from the same joint probability of the words in the sequence---but we have found that with sparse data the estimates can differ substantially. In particular, since \textit{though} typically appears at the end of a variety of clauses, its right context is much more predictable than its left context, and thus reverse models yield more accurate estimates of its likelihood of appearing clause-finally. \subsection{N-gram scorers} The \code{Standard\-Ngram\-Model} class is implemented as a subclass of the base class \code{Ngram\-Scorer}. All \code{Ngram\-Scorer} instances may have any number of \code{Ngram\-Filter} instances, whose \code{filter\-Out} methods are invoked prior to n-gram scoring; if any of these methods return true, a score of zero is immediately returned. The \code{AAn\-Filter} provides one concrete implementation of the \code{Ngram\-Filter} interface, and returns true if it finds a bigram consisting of \textit{a} followed by a vowel-inital word, or \textit{an} followed by a consonant-initial word, subject to a configurable set of exceptions that can be culled from bigram counts. We have found that such n-gram filters can be more efficient, and more reliable, than relying on n-gram scores alone; in particular, with \textit{a/an}, since the unigram probability for \textit{a} tends to be much higher than that of \textit{an}, with unseen words beginning with a vowel, there may not be a clear preference for the bigram beginning with \textit{an}. The base class \code{Ngram\-Scorer} implements the bulk of the \code{score} method, using an abstract \code{log\-Prob\-From\-Ngram} method for subclass-specific calculation of the log probabilities (with backoff) for individual n-grams. The \code{score} method also invokes the \code{prepare\-To\-Score\-Words} method, in order to allow for subclass-specific pre-processing of the words in the given sign. With \code{Standard\-Ngram\-Model}, this method is used to extract the word forms or semantic classes into a list of strings to score. It also appends any pitch accents to the word forms or semantic classes, effectively treating them as integral parts of the words. Since the realizer builds up partial realizations bottom-up rather than left-to-right, it only adds start of sentence (and end of sentence) tags with complete realizations. As a consequence, the words with less than a full $n-1$ words of history are scored with appropriate sub-models. For example, the first word of a phrase is scored with a unigram sub-model, without imposing backoff penalties. 
Another consequence of bottom-up realization is that both the left- and right-contexts may change when forming new signs from a given input sign. Consequently, it is often not possible (even in principle) to use the score of an input sign directly in computing the score of a new result sign. If one could make assumptions about how the score of an input sign has been computed---e.g., by a bigram model---one could determine the score of the result sign from the scores of the input signs together with an adjustment for the word(s) whose context has changed. However, our general approach to sign scoring precludes making such assumptions. Nevertheless, it is still possible to improve the efficiency of n-gram scoring by caching the log probability of a sign's words, and then looking up that log probability when the sign is used as the first input sign in creating a new combined sign---thus retaining the same left context---and only recomputing the log probabilities for the words of any input signs past the first one. (With reverse models, the sign must be the last sign in the combination.) In principle, the derivation history could be consulted further to narrow down the words whose n-gram probabilities must be recomputed to the minimum possible, though \code{Ngram\-Scorer} only implements a single-step lookup at present.\footnote{Informal experiments indicate that caching log probabilities in this way can yield an overall reduction in best-first realization times of 2-3\% on average.} Finally, note that a Java \code{Weak\-Hash\-Map} is used to implement the cache, in order to avoid an undesirable buildup of entries across realization requests. \subsection{Interpolation} \label{interpolation} Scoring models may be linearly interpolated in two ways. Sign scorers of any variety may be combined using the \code{Sign\-Scorer\-Interpolation} class. For example, Figure~\ref{forward-reverse-interpolation} shows how forward and reverse n-gram models may be interpolated. \begin{figure*}%[p]%[t]%[!h] \begin{center} \begin{lstlisting} // configure realizer with 4-gram forward and reverse backoff // models, interpolated with equal weight NgramScorer forwardModel = new StandardNgramModel(4, "lm.4bo"); NgramScorer reverseModel = new StandardNgramModel(4, "lm-r.4bo"); reverseModel.setReverse(true); realizer.signScorer = new SignScorerInterpolation( new SignScorer[] { forwardModel, reverseModel } ); \end{lstlisting} \caption{Example interpolated n-gram model} \label{forward-reverse-interpolation} \end{center} \end{figure*} With n-gram models of the same direction, it is also possible to linearly interpolate models at the word level, using the \code{Linear\-Ngram\-Scorer\-Combo} class. Word-level interpolation makes it easier to use cache models created with maximum likelihood estimation, as word-level interpolation with a base model avoids problems with zero probabilities in the cache model. As discussed in \cite{Carsten-Alignment:2005}, cache models can be used to promote alignment with a conversational partner, by constructing a cache model from the bigrams in the partner's previous turn, and interpolating it with a base model.\footnote{At present, such cache models must be constructed with a call to the SRILM toolkit; it would not be difficult to add OpenCCG support for constructing them though, since these models do not require smoothing.} Figure~\ref{base-cache-interpolation} shows one way to create such an interpolated model. 
\begin{figure*}%[p]%[t]%[!h]
\begin{center}
\begin{lstlisting}
// configure realizer with 4-gram backoff base model,
// interpolated at the word level with a bigram maximum-likelihood
// cache model, with more weight given to the base model
NgramScorer baseModel = new StandardNgramModel(4, "lm.4bo");
NgramScorer cacheModel = new StandardNgramModel(2, "lm-cache.mle");
realizer.signScorer = new LinearNgramScorerCombo(
    new SignScorer[] { baseModel, cacheModel },
    new double[] { 0.6, 0.4 }
);
\end{lstlisting}
\caption{Example word-level interpolation of a cache model}
\label{base-cache-interpolation}
\end{center}
\end{figure*}

\subsection{N-gram precision models}
\label{ngram-precision}

The \code{NgramPrecisionModel} subclass of \code{Ngram\-Scorer} computes a modified version of the Bleu score used in MT evaluation \cite{Bleu:2001}. Its constructor takes as input an array of target strings---from which it extracts the n-gram sequences to use in computing the n-gram precision score---and the desired order. Unlike with the Bleu score, rank order centroid weights (rather than the geometric mean) are used to combine scores of different orders, which avoids problems with scoring partial realizations which have no n-gram matches of the target order. For simplicity, the score also does not include the Bleu score's bells and whistles to make cheating on length difficult. We have found n-gram precision models to be very useful for regression testing the grammar, as an n-gram precision model created just from the target string nearly always leads the realizer to choose that exact string as its preferred realization. These models can also be useful for evaluating the success of different scoring models in a cross-validation setup, though with high quality output, manual inspection is usually necessary to determine the importance of any differences between the preferred realization and the target string. Finally, note that n-gram precision models can be used as a quick-and-dirty substitute for standard n-gram models, if one does not have time to install and use the SRILM toolkit.

\subsection{Factored language models}

A factored language model \cite{Bilmes-Kirchoff:2003} is a new kind of language model that treats words as bundles of factors. To support scoring with such models, OpenCCG represents words as objects with a surface form, pitch accent, stem, part of speech, supertag, and semantic class. Words may also have any number of further attributes, such as associated gesture classes, in order to handle in a general way elements like pitch accents that are ``coarticulated'' with words. To represent words efficiently, and to speed up equality tests, all attribute values are interned, and the \code{Word} objects themselves are interned via a factory method. Note that in Java, it is straightforward to intern objects other than strings by employing a \code{Weak\-Hash\-Map} to map from an object key to a weak reference to itself as the canonical instance. (Using a weak reference avoids accumulating interned objects that would otherwise be garbage collected.) With the SRILM toolkit, factored language models can be constructed that support \textit{generalized parallel backoff}: that is, backoff order is not restricted to just dropping the most temporally distant word first, but rather may be specified as a path through the set of contextual parent variables; additionally, parallel backoff paths may be specified, with the possibility of combining these paths dynamically in various ways.
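As a brief aside, the \code{Weak\-Hash\-Map} interning idiom mentioned above can be sketched in a few lines of plain Java, as in Figure~\ref{interning-sketch}. The sketch illustrates the technique only; it is not OpenCCG's actual factory code for \code{Word} objects.

\begin{figure*}%[p]%[t]%[!h]
\begin{center}
\begin{lstlisting}
// illustration of interning via a WeakHashMap (not OpenCCG's actual factory code):
// map from an object key to a weak reference to itself as the canonical instance
import java.lang.ref.WeakReference;
import java.util.Map;
import java.util.WeakHashMap;

public class Interner {
    private final Map cache = new WeakHashMap();

    // returns the canonical instance that equals the given object
    public synchronized Object intern(Object obj) {
        WeakReference ref = (WeakReference) cache.get(obj);
        Object canonical = (ref != null) ? ref.get() : null;
        if (canonical == null) {
            canonical = obj;
            // the weak reference lets unused canonical instances be collected
            cache.put(obj, new WeakReference(obj));
        }
        return canonical;
    }
}
\end{lstlisting}
\caption{Sketch of interning arbitrary objects with a \code{Weak\-Hash\-Map} (illustration only)}
\label{interning-sketch}
\end{center}
\end{figure*}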
In OpenCCG, the \code{Factored\-Ngram\-Model} class supports scoring with factored language models that employ generalized backoff, though parallel backoff is not yet supported, as it remains somewhat unclear whether the added complexity of parallel backoff is worth the implementation effort. Typically, several related factored language models are specified in a single file and loaded by a \code{Factored\-Ngram\-Model\-Family}, which can multiplicatively score models for different child variables, and include different sub-models for the same child variable. To illustrate, let us consider a simplified version of the factored language model family used in the COMIC realizer. This model computes the probability of the current word given the preceding ones according to the formula shown in \eref{comic-flm}, where a word consists of the factors word (W), pitch accent (A), gesture class (GC), and gesture instance (GI), plus the other standard factors which the model ignores: \begin{exe} \ex \label{ex:comic-flm} \begin{small} \( \begin{array}{l} P(\langle W,A,GC,GI \rangle \, | \, \langle W,A,GC,GI \rangle_{-1} \, \ldots) \approx \\ \; \; \; P(W \, | \, W_{-1} W_{-2} A_{-1} A_{-2}) \; \times \\ \; \; \; P(GC \, | \, W) \; \times \\ \; \; \; P(GI \, | \, GC) \\ \end{array} \) \end{small} \end{exe} \noindent In \eref{comic-flm}, the probability of the current word is approximated by the probability of the current word form given the preceding two word forms and preceding two pitch accents, multiplied by the probability of the current gesture class given the current word form, and by the probability of the current gesture instance given the current gesture class. Note that in the COMIC grammar, the choice of pitch accent is entirely rule governed, so the current pitch accent is not scored separately in the model. However, the preceding pitch accents are taken into account in predicting the current word form, as perplexity experiments have suggested that they do provide additional information beyond that provided by the previous word forms. The specification file for this model appears in Figure~\ref{flm-spec}. The format of the file is a restricted form of the files used by the SRILM toolkit to build factored language models. The file specifies four models, where the first, third and fourth models correspond to those in \eref{comic-flm}. With the first model, since the previous words are typically more informative than the previous pitch accents, the backoff order specifies that the most distant accent, \code{A(-2)}, should be dropped first, followed by the previous accent, \code{A(-1)}, then the most distant word, \code{W(-2)}, and finally the previous word, \code{W(-1)}. The second model is considered a sub-model of the first---since it likewise predicts the current word---to be used when there is only one word of context available (i.e.\ with bigrams). Note that when scoring a bigram, the second model will take the previous pitch accent into account, whereas the first model would not. For documentation of the file format as it is used in the SRILM toolkit, see \cite{FLM-JHSW:2002}. 
\begin{figure*}%[p]%[t]%[!h]
\begin{footnotesize}
\begin{verbatim}
## Simplified COMIC realizer FLM spec file
## Trigram Word model based on previous words and accents, dropping accents first,
## with bigram sub-model;
## Unigram Gesture Class model based on current word; and
## Unigram Gesture Instance model based on current gesture class
4
## 3gram with A
W : 4 W(-1) W(-2) A(-1) A(-2) w_w1w2a1a2.count w_w1w2a1a2.lm 5
W1,W2,A1,A2 A2 ndiscount gtmin 1
W1,W2,A1 A1 ndiscount gtmin 1
W1,W2 W2 ndiscount gtmin 1
W1 W1 ndiscount gtmin 1
0 0 ndiscount gtmin 1
## bigram with A
W : 2 W(-1) A(-1) w_w1a1.count w_w1a1.lm 3
W1,A1 A1 ndiscount gtmin 1
W1 W1 ndiscount gtmin 1
0 0 ndiscount gtmin 1
## Gesture class depends on current word
GC : 1 W(0) gc_w0.count gc_w0.lm 2
W0 W0 ndiscount gtmin 1
0 0 ndiscount gtmin 1
## Gesture instance depends only on class
GI : 1 GC(0) gi_gc0.count gi_gc0.lm 2
GC0 GC0 ndiscount gtmin 1
0 0
\end{verbatim}
\end{footnotesize}
\caption{Example factored language model family specification}
\label{flm-spec}
\end{figure*}

Like \code{Standard\-Ngram\-Model}, the \code{Factored\-Ngram\-Model} class stores its n-gram tables in a trie data structure, except that it stores an interned factor key (i.e.\ a factor name and value pair, or just a string, in the case of the word form) at each node, rather than a simple string. During scoring, the \code{log\-Prob\-From\-Ngram} method determines the log probability (with backoff) of a given n-gram by extracting the appropriate sequence of factor keys, and using them to compute the log probability as with standard n-gram models. The \code{Factored\-Ngram\-Model\-Family} class computes log probabilities by delegating to its component factored n-gram models (choosing sub-models when appropriate) and summing the results.

\subsection{Avoiding repetition}

While cache models appear to be a promising avenue to promote lexical and syntactic alignment with a conversational partner, a different mechanism appears to be called for to avoid ``self-alignment''---that is, to avoid the repetitive use of words and phrases. As a means to experiment with avoiding repetition, OpenCCG includes the \code{Repetition\-Scorer} class. This class makes use of a configurable penalty plus a set of methods for dynamically managing the context. It returns a score of \( 10^{- c_r \times p} \), where $c_r$ is the count of repeated items, and $p$ is the penalty. Note that this formula returns 1 if there are no repeated items, and otherwise returns a score that is linear in log space in the number of repeated items. A repetition scorer can be combined multiplicatively with an n-gram model, in order to discount realizations that repeat items from the recent context. Figure~\ref{rep-scorer} shows such a combination, together with the operations for updating the context. By default, open class stems are considered the relevant items over which to count repetitions, though this behavior can be specialized by subclassing \code{Repetition\-Scorer} and overriding the \code{updateItems} method. Note that in counting repetitions, full counts are given to items in the previous words or recent context, while fractional counts are given to older items; the exact details may likewise be changed in a subclass, by overriding the \code{repeatedItems} method.
\begin{figure*}%[p]%[t]%[!h]
\begin{center}
\begin{lstlisting}
// set up n-gram scorer and repetition scorer
String lmfile = "ngrams/combined.flm";
NgramScorer ngramScorer = new FactoredNgramModelFamily(lmfile, true);
ngramScorer.addFilter(new AAnFilter());
RepetitionScorer repetitionScorer = new RepetitionScorer();
// combine n-gram scorer with repetition scorer
realizer.signScorer = new SignScorerProduct(
    new SignScorer[] { ngramScorer, repetitionScorer }
);
// ... then, after each realization request,
Edge bestEdge = realizer.realize(lf);
// ... update repetition context for next realization:
repetitionScorer.ageContext();
repetitionScorer.updateContext(bestEdge.getSign());
\end{lstlisting}
\caption{Example combination of an n-gram scorer and a repetition scorer}
\label{rep-scorer}
\end{center}
\end{figure*}

\subsection{Building language models with the SRILM toolkit}
\label{using-srilm}

You can use \occg's regression testing tool, \ccgtest, to help build and test language models built with the SRILM toolkit. By default, running \ccgtest\ will use the grammar in the current directory to parse and realize the default regression file, \code{testbed.xml}, using an n-gram precision model constructed for each test item. Using the appropriate command-line options, it is also possible to export the text of the test items in order to construct an n-gram model with the SRILM toolkit, and then use the resulting model in testing the realizer. To display the syntax of \ccgtest's command-line options, you can invoke it with the \code{-h} option, as shown in \eref{ccg-test-help}. To export the text of the test items to a text file, you use the \code{-text} option, as in \eref{export-text}. The next step is to use SRILM's \code{ngram-count} tool to build an n-gram language model. In \eref{make-lm},\footnote{This command, and the ensuing ones, should be entered on one line.} \code{ngram-count} is used to build a 4-gram backoff model, \code{n.4bo}, from the text file \code{tb.txt}, using Ristad's ``natural'' discounting method \cite{Ristad:1995}. For small test sets, we have found that Ristad's method works better than the default one (Good-Turing). Note that in \eref{make-lm}, the \code{-unk} option is used to reserve some probability for unknown words; the \code{-gt1min 1} through \code{-gt4min 1} options specify that all 1-counts should be kept; and the \code{-ndiscount1} through \code{-ndiscount4} options specify the use of natural discounting for unigrams through 4-grams. Finally, to test the resulting language model, you use \ccgtest's \code{-ngramorder} and \code{-lm} options, as shown in \eref{test-lm}.

\begin{exe} %\small
\ex %\label{ex:make-test-lm}
\begin{xlist}
\ex \label{ex:ccg-test-help}
\code{ccg-test -h}
\ex \label{ex:export-text}
\code{ccg-test -text tb.txt}
\ex \label{ex:make-lm}
\code{ngram-count -order 4 -unk -text tb.txt -lm n.4bo \\
  -gt1min 1 -gt2min 1 -gt3min 1 -gt4min 1 \\
  -ndiscount1 -ndiscount2 -ndiscount3 -ndiscount4}
\ex \label{ex:test-lm}
\code{ccg-test -noparsing -ngramorder 4 -lm n.4bo}
\end{xlist}
\end{exe}

To perform a simple 2-fold cross-validation, \ccgtest\ includes options for exporting or testing just the even or odd test items. The command in \eref{export-even} shows how you can export just the text of the even-numbered test items.
Note that the \code{-textsc} option specifies that the text be exported using semantic class replacement, i.e.\ with certain words replaced with their semantic classes; the classes to use for this purpose are specified using the \code{replacement-sem-classes} attribute of the \code{tokenizer} element in the \code{grammar.xml} file. The next step is to build a language model just as before; the abbreviated command appears in \eref{make-even-lmsc}. Finally, you can test the language model on just the odd-numbered items, as in \eref{test-odd}, where the \code{-lmsc} option specifies that semantic class replacement should be employed when scoring realizations with the model. Naturally, you can switch the \code{-even} and \code{-odd} flags, and adjust the text and language model names, to test realization on the even-numbered items, using a language model trained from the odd-numbered ones.

\begin{exe} %\small
\ex %\label{ex:even-odd}
\begin{xlist}
\ex \label{ex:export-even}
\code{ccg-test -even -textsc tb-sc.even.txt}
\ex \label{ex:make-even-lmsc}
\code{ngram-count -order 4 -unk -text tb-sc.even.txt \\
  -lm n-sc.even.4bo ...}
\ex \label{ex:test-odd}
\code{ccg-test -noparsing -odd -ngramorder 4 -lmsc n-sc.even.4bo}
\end{xlist}
\end{exe}

An example of building a factored language model appears next. In \eref{export-fsc}, the text of the test items is exported, where each word appears with all its factors, and word forms are replaced with semantic classes when appropriate. In \eref{make-flmsc}, SRILM's \code{fngram-count} is used to create a factored language model from the spec file named \code{spec.flm}. (Note that the various individual language model files are listed in the spec file.) Finally, \eref{test-flm} shows how the factored language model can be tested in \ccgtest.

\begin{exe} %\small
\ex
\begin{xlist}
\ex \label{ex:export-fsc}
\code{ccg-test -textfsc tb-fsc.txt}
\ex \label{ex:make-flmsc}
\code{fngram-count -factor-file spec.flm -text tb-fsc.txt -lm -unk}
\ex \label{ex:test-flm}
\code{ccg-test -noparsing -flmsc spec.flm}
\end{xlist}
\end{exe}

\section{Pruning Strategies}
\label{pruning}

The classes for defining edge pruning strategies appear in Figure~\ref{pruner-class}. As mentioned in Section~\ref{overview}, an N-best pruning strategy is employed by default, where N is determined by the current preference settings. It is also possible to define custom strategies. To support the definition of a certain kind of custom strategy, the abstract class \code{Diversity\-Pruning\-Strategy} provides an N-best pruning strategy that promotes diversity in the edges that are kept, according to the equivalence relation established by the abstract \code{not\-Compellingly\-Different} method. In particular, in order to determine which edges to keep, a diversity pruning strategy clusters the edges into a ranked list of equivalence classes, which are sequentially sampled until the limit N is reached. If the \code{single\-Best\-Per\-Group} flag is set, then a maximum of one edge per equivalence class is retained.

\begin{figure*}%[p]%[t]%[!h]
\begin{center}
\mbox{} %scale=1.25
\includegraphics[width=\textwidth]{pruner-class.pdf}
\caption{Classes for defining pruning strategies}
\label{pruner-class}
\end{center}
\end{figure*}

As an example, the COMIC realizer's diversity pruning strategy appears in Figure~\ref{gest-diversity-strategy}.
The idea behind this strategy is to avoid having the N-best lists become full of signs whose words differ only in the exact gesture instance associated with one or more of the words. With this strategy, if two signs differ in just this way, the edge for the lower-scoring sign will be considered ``not compellingly different'' and pruned from the N-best list, making way for other edges whose signs exhibit more interesting differences.

\begin{figure*}%[p]%[t]%[!h]
\begin{center}
\begin{lstlisting}
// configure realizer with gesture diversity pruner
realizer.pruningStrategy = new DiversityPruningStrategy() {
    /**
     * Returns true iff the given signs are not compellingly different;
     * in particular, returns true iff the words differ only in their
     * gesture instances.
     */
    public boolean notCompellinglyDifferent(Sign sign1, Sign sign2) {
        List words1 = sign1.getWords();
        List words2 = sign2.getWords();
        if (words1.size() != words2.size()) return false;
        for (int i = 0; i < words1.size(); i++) {
            Word w1 = (Word) words1.get(i);
            Word w2 = (Word) words2.get(i);
            if (w1 == w2) continue;
            if (w1.getForm() != w2.getForm()) return false;
            if (w1.getPitchAccent() != w2.getPitchAccent()) return false;
            if (w1.getVal("GC") != w2.getVal("GC")) return false;
            // nb: assuming that they differ in the val of GI at this point
        }
        return true;
    }
};
\end{lstlisting}
\caption{Example diversity pruning strategy}
\label{gest-diversity-strategy}
\end{center}
\end{figure*}

OpenCCG also provides a concrete subclass of \code{Diversity\-Pruning\-Strategy} named \code{Ngram\-Diversity\-Pruning\-Strategy}, which generalizes the approach to pruning described in \cite{Langkilde:2000}. With this class, two signs are considered not compellingly different if they share the same $n\!-\!1$ initial and final words, where $n$ is the n-gram order. When one is interested in single-best output, an n-gram diversity pruning strategy can increase efficiency while guaranteeing no loss in quality---as long as the reduction in the search space outweighs the extra time necessary to check for the same initial and final words---since any words in between an input sign's $n\!-\!1$ initial and final ones cannot affect the n-gram score of a new sign formed from the input sign. However, when N-best outputs are desired, or when repetition scoring is employed, it is less clear whether it makes sense to use an n-gram diversity pruning strategy; for this reason, a simple N-best strategy remains the default option.

\section{Disjunctive logical forms}
\label{sec:disj-lf}

In applications, to specify the desired space of possible paraphrases, one may either provide an input logical form that underspecifies certain realization choices, or include explicit disjunctions in the input LF (or both). In our experience, we have found disjunctive LFs---inspired by those found in \cite{Shemtov:PhD}---to be an important capability, especially as one seeks to make grammars reusable across applications.
\begin{figure}%[t]%[!h]
\begin{small}
\begin{center}
\includegraphics[width=0.52\textwidth]{ex1a}
\end{center}
\begin{itemize}
\item[(a)] Semantic dependency graph for \textit{The design (is$\mid$'s) based on the Funny Day collection by Villeroy and Boch.}
\end{itemize}
\vspace{3mm}
\begin{center}
\includegraphics[width=0.52\textwidth]{ex1b}
\end{center}
\begin{itemize}
\item[(b)] Semantic dependency graph for \textit{The design (is$\mid$'s) based on Villeroy and Boch's Funny Day series.}
\end{itemize}
\vspace{3mm}
\begin{center}
\includegraphics[width=0.52\textwidth]{ex1c}
\end{center}
\begin{itemize}
\item[(c)] Disjunctive semantic dependency graph covering (a)-(b), i.e.\ \textit{The design (is$\mid$'s) based on (the Funny Day (collection$\mid$series) by Villeroy and Boch $\mid$ Villeroy and Boch's Funny Day (collection$\mid$series)).}
\end{itemize}
\end{small}
\caption{Example semantic dependency graphs.}
\label{fig:ex1}
\end{figure}

\begin{figure}%[t]%[!h]
\begin{small}
\ensuremath{
@_{e}(\C{be} \wedge \modp{tense}\con{pres} \wedge \modp{mood}\con{dcl} \wedge \\
\mytab{@_{e}(} \modp{Arg}(d \wedge \C{design} \wedge \modp{det}\con{the} \wedge \modp{num}\con{sg}) \wedge \\
\mytab{@_{e}(} \modp{Prop}(p \wedge \C{based-on} \wedge \\
\mytab{@_{e}(\modp{Prop}(} \modp{Artifact}d \wedge \\
\mytab{@_{e}(\modp{Prop}(} \modp{Source}(c \wedge \C{collection} \wedge \modp{det}\con{the} \wedge \modp{num}\con{sg} \wedge \\
\mytab{@_{e}(\modp{Prop}(\modp{Source}(} \modp{HasProp}(f \wedge \C{Funny\_Day}) \wedge \\
\mytab{@_{e}(\modp{Prop}(\modp{Source}(} \modp{Creator}(v \wedge \C{V\&B}))))
}
\begin{center}
(a)
\vspace{2mm}
\mbox{}\vdots
\vspace{2mm}
\end{center}
\ensuremath{
@_{e}(\C{be} \wedge \modp{tense}\con{pres} \wedge \modp{mood}\con{dcl} \wedge \\
\mytab{@_{e}(} \modp{Arg}(d \wedge \C{design} \wedge \modp{det}\con{the} \wedge \modp{num}\con{sg}) \wedge \\
\mytab{@_{e}(} \modp{Prop}(p \wedge \C{based-on} \wedge \\
\mytab{@_{e}(\modp{Prop}(} \modp{Artifact}d \wedge \\
\mytab{@_{e}(\modp{Prop}(} \modp{Source}(c \wedge \modp{num}\con{sg} \wedge (\modp{det}\con{the})? \wedge \\
\mytab{@_{e}(\modp{Prop}(\modp{Source}(} (\C{collection} \xor \C{series}) \wedge \\
\mytab{@_{e}(\modp{Prop}(\modp{Source}(} \modp{HasProp}(f \wedge \C{Funny\_Day}) \wedge \\
\mytab{@_{e}(\modp{Prop}(\modp{Source}(} (\modp{Creator}\shared{v} \xor \modp{GenOwner}\shared{v})))) \\
\wedge @_{v}(\C{Villeroy\_and\_Boch})
}
\begin{center}
(c)
\end{center}
\end{small}
\caption{HLDS for examples in \figref{ex1}.}
\label{fig:ex1-hlds}
\end{figure}

As an illustration of disjunctive logical forms, consider the semantic dependency graphs in \figref{ex1}, which are taken from the COMIC\footnote{\texttt{http://www.hcrc.ed.ac.uk/comic/}} multimodal dialogue system.\footnote{To simplify the exposition, the features specifying information structure and deictic gestures have been omitted, as have the semantic sorts of the discourse referents.} Given the lexical categories in the COMIC grammar, the graphs in \figref{ex1}(a) and (b) fully specify their respective realizations, with the exception of the choice of the full or contracted form of the copula.\footnote{Note that to be consistent with the distributed grammar, the predicate $\C{based\_on}$ should actually be $\C{based-on}$; this discrepancy has been corrected in the subsequent figures.} To generalize over these alternatives, the disjunctive graph in (c) may be employed.
This graph allows a free choice between the domain synonyms \textit{collection} and \textit{series}, as indicated by the vertical bar between their respective predications. The graph also allows a free choice between the \modp{Creator} and \modp{GenOwner} relations---lexicalized via \textit{by} and the possessive, respectively---connecting the head $c$ (\textit{collection} or \textit{series}) with the dependent $v$ (for \textit{Villeroy and Boch}); this choice is indicated by an arc between the two dependency relations.\footnote{Note that the arc and vertical bar are just presentation devices; there is no difference in the underlying implementation.} Finally, the determiner feature (\modp{det}\con{the}) on $c$ is indicated as optional, via the question mark.\footnote{Another option would be to include the determiner feature in the alternative with the \modp{Creator} relation, but that would make the graph harder to draw and would not illustrate optionality.}

It is worth pausing at this point to observe that in designing the COMIC grammar, the differences between (a) and (b) could perhaps have been collapsed. However, such a move would make it more difficult to reuse the grammar in other applications---and indeed, the core of the grammar is shared with the FLIGHTS system \cite{FLIGHTS-FLAIRS:2004}---as it would presuppose that these paraphrases should always be available in the same contexts. An example where the disjunctively specified paraphrases have more clearly limited contexts of applicability appears in \eref{ex2}:

\begin{exe}
\ex \label{ex:ex2}
(This design $\mid$ This one $\mid$ This) (is$\mid$'s) (classic $\mid$ in the classic style) $\mid$ Here we have a (classic design $\mid$ design in the classic style).
\end{exe}

\noindent
This example shows some of the phrasings that may be used in COMIC to describe the style of a design that has not been discussed previously. The example includes a top-level disjunction between the use of a deictic NP \textit{this design $\mid$ this one $\mid$ this} (with an accompanying pointing gesture) followed by the copula, and the use of the phrase \textit{here we have} to introduce the design. While these alternatives can function as paraphrases in this context, it is difficult to see how one might specify them in a single underspecified (and application-neutral) logical form.

Graphs such as those in \figref{ex1} are represented internally using Hybrid Logic Dependency Semantics (HLDS), as in \figref{ex1-hlds}. In HLDS, as can be seen in \figref{ex1-hlds}(a), each semantic head is associated with a nominal that identifies its discourse referent, and heads are connected to their dependents via dependency relations, which are modeled as modal relations; modal relations are also used to represent semantic features (in which case the relation is to an anonymous node). In (c), two new operators are introduced to represent periphrastic alternatives and optional parts of the meaning, namely $\xor$ and $(\cdot)?$, for exclusive-or and optionality, respectively. To indicate that a nominal represents a reference to a node that is considered a shared part of multiple alternatives, the nominal is annotated with a box, as exemplified by \shared{v}. This notion of shared references is needed during the logical form flattening stage of the realization algorithm in order to determine which elementary predications are part of each alternative.
\begin{figure}%[t]%[!h]
\begin{footnotesize}
\begin{verbatim}
\end{verbatim}
\end{footnotesize}
\caption{XML for example (a) in \figref{ex1}.}
\label{fig:ex1-xml-a}
\end{figure}

\begin{figure}%[t]%[!h]
\begin{footnotesize}
\begin{verbatim}
\end{verbatim}
\end{footnotesize}
\caption{XML for example (c) in \figref{ex1}.}
\label{fig:ex1-xml-c}
\end{figure}

To specify inputs to the realizer, an XML representation of HLDS terms may be employed; alternatively, the more intuitive XML graph representation illustrated in \figref{ex1-xml-a} and \figref{ex1-xml-c} may be used, with an automatic translation converting such representations to HLDS. As can be seen in \figref{ex1-xml-a}, the nodes and dependency relations in the graph are represented by \texttt{node} and \texttt{rel} elements. Note that \texttt{node} elements that represent subordinated, reentrant references to a node use an \texttt{idref} attribute, as exemplified by the \texttt{Artifact} relation to the \texttt{node} element with \texttt{idref="d"}. \figref{ex1-xml-c} shows how periphrastic alternatives and optional parts of the meaning are specified using the \texttt{one-of} and \texttt{opt} elements, respectively. Where the alternatives involve attributes of a node, an \texttt{atts} element is used to provide the lexical predications or semantic features in question. Finally, note that \texttt{node} elements that represent references to a node that is considered a shared part of multiple alternatives are marked with the \texttt{shared="true"} attribute, as is the case here with the references to the dependent node $v$ (for \textit{Villeroy and Boch}).

%% =====================================================================
%% BIBLIOGRAPHY
%% =====================================================================
\addcontentsline{toc}{section}{References}
\bibliographystyle{alpha}
\bibliography{refs}

\end{document}

================================================ FILE: docs/realizer/openccg.sty ================================================

%%
%% This style file contains a minimal set of commands for
%% OpenCCG categories and logical forms, plus derivations.
%% It borrows from earlier style files by Gann, Jason and Geert-Jan,
%% and from Beryl and Mark for the derivations.
%%

\newcommand{\bs}{\backslash}   % backslash, to save typing

\newcommand{\gf}[1]{\textsf{\textsl{#1}}}           % gloss font, for words
\newcommand{\cf}[1]{\ensuremath{\mathsf{#1}}}       % category font
\newcommand{\fb}[1]{\ensuremath{_{\mathit{#1}}}}    % features, subscripted
\newcommand{\fsb}[2]{\ensuremath{_{\langle#1\rangle\mathit{#2}}}}  % fs index and features, subscripted
\newcommand{\C}[1]{\textbf{#1}}                     % concept font
\newcommand{\con}[1]{\ensuremath{\mathrm{#1}}}      % constant font, for sem feature values or sem sorts
\newcommand{\modp}[1]{\ensuremath{\langle}\textsc{#1}\ensuremath{\rangle}}  % sem relation/feature

% CCG derivations
% arguments: #1 = no. of words, #2 = body
% Carsten suggests removing @{}, in order to eliminate a gap on the
% left end of lines in the derivation
\newcommand{\deriv}[2]
{
  \renewcommand{\arraystretch}{.5}
  $\begin{array}[t]{*{#1}{c}}
  #2
  \end{array}$
}

% centered multicolumn (NB: changed \mc to \cmc to avoid conflict with kluwer.cls)
\newcommand{\cmc}[2]{\multicolumn{#1}{c}{#2}}

% Rules, argument #1 gives the number of columns to cover.
\newcommand{\uline}[1]      {\cmc{#1}{\hrulefill} }
\newcommand{\fapply}[1]     { \cmc{#1}{\hrulefill_{>}} }
\newcommand{\bapply}[1]     { \cmc{#1}{\hrulefill_{<}} }
\newcommand{\fcomp}[1]      { \cmc{#1}{\hrulefill_{{>}\mathbf{B}}}}
\newcommand{\fxcomp}[1]     { \cmc{#1}{\hrulefill_{{>}\mathbf{B}_{\times}}}}
\newcommand{\fxcompN}[2]    { \cmc{#1}{\hrulefill_{{>}\mathbf{B}^{#2}_{\times}}}}
\newcommand{\fcomptwo}[1]   { \cmc{#1}{\hrulefill_{{>}\mathbf{B}^2}}}
\newcommand{\fxcomptwo}[1]  { \cmc{#1}{\hrulefill_{{>}\mathbf{B}_{\times}^2}}}
\newcommand{\fcompthree}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}^3}}}
\newcommand{\fxcompthree}[1] { \cmc{#1}{\hrulefill_{{>}\mathbf{B}_{\times}^3}}}
\newcommand{\bcomp}[1]      { \cmc{#1}{\hrulefill_{{<}\mathbf{B}}}}
\newcommand{\bxcomp}[1]     { \cmc{#1}{\hrulefill_{{<}\mathbf{B}_{\times}}}}
\newcommand{\bxcompN}[2]    { \cmc{#1}{\hrulefill_{{<}\mathbf{B}^{#2}_{\times}}}}
\newcommand{\bcomptwo}[1]   { \cmc{#1}{\hrulefill_{{<}\mathbf{B}^2}}}
\newcommand{\bxcomptwo}[1]  { \cmc{#1}{\hrulefill_{{<}\mathbf{B}_{\times}^2}}}
\newcommand{\bcompthree}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}^3}}}
\newcommand{\bxcompthree}[1] { \cmc{#1}{\hrulefill_{{<}\mathbf{B}_{\times}^3}}}
\newcommand{\fsubst}[1]     { \cmc{#1}{\hrulefill_{{>}\mathbf{S}}}}
\newcommand{\bsubst}[1]     { \cmc{#1}{\hrulefill_{{<}\mathbf{S}}}}
\newcommand{\fxsubst}[1]    { \cmc{#1}{\hrulefill_{{>}\mathbf{S}_{\times}}}}
\newcommand{\bxsubst}[1]    { \cmc{#1}{\hrulefill_{{<}\mathbf{S}_{\times}}}}
\newcommand{\ftype}[1]      { \cmc{#1}{\hrulefill_{{>}\mathbf{T}}}}
\newcommand{\btype}[1]      { \cmc{#1}{\hrulefill_{{<}\mathbf{T}}}}
\newcommand{\conj}[1]       { \cmc{#1}{\hrulefill_{{<}\Phi{>}}}}
\newcommand{\boundary}[1]   { \cmc{#1}{\hrulefill_{{<}\mathbf{\%}}}}
\newcommand{\asterisk}[1]   { \cmc{#1}{\hrulefill_{\mathbf{*}}}}
\newcommand{\comb}[2]       % an arbitrary combinator
  { \cmc{#1}{\hrulefill_{#2}}}
\newcommand{\badcomb}[2]    % an inappropriate use of a combinator
  { \cmc{#1}{\hrulefill_{\mbox{ *** }}\hrulefill_{#2}}}
\newcommand{\dcomp}[2]      % an arbitrary dcomposition
  { \cmc{#1}{\dotfill_{#2}}}
\newcommand{\unfreeze}[1]   {\cmc{#1}{\hrulefill_{\mathbf{@}}} }

\newcommand{\ul}{\uline{1}}

\newcounter{CCG@counter}
\newcommand{\CCG@amp}{&}
\newcommand{\ulines}[1]{
  \uline{1}
  \setcounter{CCG@counter}{1}
  \whiledo{\value{CCG@counter} < #1}
  {
    \CCG@amp \uline{1}
    \stepcounter{CCG@counter}
  }
}

================================================ FILE: docs/realizer/refs.bib ================================================

@PhdThesis{Baldridge:PhD,
  author = {Jason Baldridge},
  title = {Lexically Specified Derivational Control in Combinatory Categorial Grammar},
  school = {School of Informatics, University of Edinburgh},
  year = 2002
}

@InProceedings{Baldridge/Kruijff:2002,
  author = {Baldridge, Jason and Kruijff, Geert-Jan},
  title = "{Coupling {CCG} and Hybrid Logic Dependency Semantics}",
  booktitle = {Proc.\ ACL-02},
  year = 2002
}
% booktitle = {Proc.\ of 40th Annual Meeting of the Association for Computational Linguistics},
% pages = {319--326},
% address = {Philadelphia, Pennsylvania},

@InProceedings{Baldridge/Kruijff:2003,
  author = {Baldridge, Jason and Kruijff, Geert-Jan},
  title = "{Multi-Modal Combinatory Categorial Grammar}",
  booktitle = {Proc.\ ACL-03},
  year = 2003
}
% booktitle = {Proc.\ of 10th Annual Meeting of the European Association for Computational Linguistics},
% address = {Budapest, Hungary}

@phdthesis{Kruijff:PhD,
  author={Kruijff, Geert-Jan M.},
  title={A Categorial Modal Architecture of Informativity: Dependency Grammar Logic \& Information Structure},
  school={Charles University},
year=2001 } @InProceedings{Carroll-and-co:1999, author = {John Carroll and Ann Copestake and Dan Flickinger and Victor Pozna\'nski}, title = {An efficient chart generator for (semi-) lexicalist grammars}, booktitle = {Proc.\ EWNLG-99}, year = 1999 } % booktitle = {Proc.\ of the 7th European Workshop on Natural Language Generation}, % Toulouse, France. % pages = {86-95} @InProceedings{Kay:1996, author = {Martin Kay}, title = {Chart generation}, booktitle = {Proc.\ ACL-96}, year = 1996 } % booktitle = {Proc.\ of the 34th Annual Meeting of the Association for Computational Linguistics}, % pages = {200-204} % Santa Cruz, California. @InProceedings{Shieber:1988, author = {Stuart Shieber}, year = 1988, title = {A uniform architecture for parsing and generation}, booktitle = {Proc.\ COLING-88} } % booktitle = {Proc.\ of the 14th International Conference on Computational Linguistics}, % pages = {614-619} % Budapest, Hungary. @InProceedings{Bob-Moore:2002, author = {Robert C. Moore}, title = {A Complete, Efficient Sentence-Realization Algorithm for Unification Grammar}, year = 2002, booktitle = {Proc.\ INLG-02} } % booktitle = {Proc.\ of the 2nd International Natural Language Generation Conference} %pages? %New York @InProceedings{Knight/Hatzi:1995, author = {Kevin Knight and Vasileios Hatzivassiloglou}, year = 1995, title = {Two-level, many-paths generation}, booktitle = {Proc.\ ACL-95} } %I. Langkilde and K. Knight. 1998a. Generation that %exploits corpus-based statistical knowledge. In Proc. %COLING-ACL. @InProceedings{Langkilde/Knight:1998, author = {Irene Langkilde and Kevin Knight}, title = {The practical value of n-grams in generation}, booktitle = {Proc.\ INLG-98}, year = 1998 } % booktitle = {Proc.\ of the Ninth International Workshop on Natural Language Generation}, @InProceedings{Langkilde:2000, author = {Irene Langkilde}, year = 2000, title = {Forest-based statistical sentence generation}, booktitle = {Proc.\ NAACL-00} } @InProceedings{Langkilde-Geary:2002, author = {Irene Langkilde-Geary}, title = {An Empirical Verification of Coverage and Correctness for a General-Purpose Sentence Generator}, year = 2002, booktitle = {Proc.\ INLG-02} } % booktitle = {Proc.\ of the Second International Natural Language Generation Conference} %pages? %New York @InProceedings{Srini/Owen:2000, author = {Srinivas Bangalore and Owen Rambow}, title = {Exploiting a Probabilistic Hierarchical Model for Generation}, year = 2000, booktitle = {Proc.\ COLING-00} } % Saarbrucken, Germany, August 2000. @InProceedings{Varges/Mellish:2001, author = {Sebastian Varges and Chris Mellish}, year = 2001, title = {Instance-based Natural Language Generation}, booktitle = {Proc.\ NAACL-01} } % booktitle = {Proc.\ of the 2nd Meeting of the North American Chapter of the Association for Computational Linguistics}, % pages = {1--8} % (NAACL-2001) % Carnegie Mellon University, Pittsburgh, PA, USA. 
@Article{Steedman-LI:2000, author = {Mark Steedman}, year = 2000, title = "Information Structure and the Syntax-Phonology Interface", journal = "Linguistic Inquiry", volume = 31, number = 4, pages = {649--689} } @Book{Steedman:SynProc, author = {Mark Steedman}, title = {The Syntactic Process}, publisher = {MIT Press}, year = 2000, } % address="Cambridge Mass.", @InProceedings{White/Baldridge:2003, author = {Michael White and Jason Baldridge}, year = 2003, title = "{Adapting Chart Realization to {CCG}}", booktitle = {Proc.\ EWNLG-03} } % booktitle = {Proc.\ of the 9th European Workshop on Natural Language Generation} @Article{White-RLAC:2004, author = {Michael White}, year = 2004, title = "{Efficient Realization of Coordinate Structures in Combinatory Categorial Grammar}", journal = {Research on Language and Computation}, note = {To appear} } @InProceedings{White-INLG:2004, author = {Michael White}, year = 2004, title = "{Reining in CCG Chart Realization}", booktitle = {Proc.\ INLG-04} } @InProceedings{White-ACLSoft:2005, author = {Michael White}, year = 2005, title = "Designing an Extensible {API} for Integrating Language Modeling and Realization", booktitle = {Proc.\ ACL-05 Workshop on Software} } @InCollection{Kruijff:2003, author = {Geert-Jan M. Kruijff}, title = {Binding Across Boundaries}, booktitle = {Resource-Sensitivity in Binding and Anaphora}, editor = {Geert-Jan M. Kruijff and Richard T. Oehrle}, publisher = {Kluwer Academic Publishers}, year = 2003 } @PhdThesis{Shemtov:PhD, year=1997, author={Hadar Shemtov}, title={Ambiguity Management in Natural Language Generation}, school={Stanford University} } @TechReport{Bleu:2001, author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-Jing Zhu}, year = 2001, title = "{Bleu: a Method for Automatic Evaluation of Machine Translation}", institution = {IBM}, number = {RC22176} } @InProceedings{FLIGHTS-FLAIRS:2004, author = {Johanna Moore and Mary Ellen Foster and Oliver Lemon and Michael White}, title = "Generating Tailored, Comparative Descriptions in Spoken Dialogue", booktitle = {Proc.\ FLAIRS-04}, year = {2004} } @inproceedings{COMIC-eChallenges:2003, title = {Towards Ambient Intelligence: Multimodal Computers that Understand our Intentions}, author = {Els den Os and Lou Boves}, booktitle = {Proc.\ eChallenges-03}, year = {2003} } @InProceedings{SRILM-ICSLP:2002, author = {Andreas Stolcke}, title = "{SRILM} --- {A}n extensible language modeling toolkit", booktitle = {Proc.\ ICSLP-02}, year = 2002} @TechReport{Ristad:1995, author = {Eric S.\ Ristad}, year = 1995, title = "{A Natural Law of Succession}", institution = {Princeton Univ.}, number = {CS-TR-495-95} } @InProceedings{Bilmes-Kirchoff:2003, author = {Jeff Bilmes and Katrin Kirchhoff}, title = "Factored language models and general parallelized backoff", booktitle = {Proc.\ HLT-03}, year = 2003} @Misc{FLM-JHSW:2002, author = {Katrin Kirchhoff and Jeff Bilmes and Sourin Das and Nicolae Duta and Melissa Egan and Gang Ji and Feng He and John Henderson and Daben Liu and Mohamed Noamany and Pat Schone and Richard Schwartz and Dimitra Vergyri}, title = "{Novel Approaches to Arabic Speech Recognition: Report from the 2002 Johns-Hopkins Summer Workshop}", year = 2002} @Article{Daelemans:1999, author = {Walter Daelemans}, title = "Introduction to the special issue on memory-based language processing", journal = {Journal of Experimental \& Theoretical Artificial Intelligence}, volume = 11, number = 3, year = 1999, pages = {287--296} } @InProceedings{Shaw/Hatzi:1999, author = {James 
Shaw and Vasileios Hatzivassiloglou}, title = "Ordering Among Premodifiers", booktitle = {Proc.\ ACL-99}, year = 1999} % Proc. of the 37th Association for Computational Linguistics, pages 135-143, College Park, Maryland, June 1999 @InProceedings{Malouf:2000, author = {Robert Malouf}, title = "The order of prenominal adjectives in natural language generation", booktitle = {Proc.\ ACL-00}, year = 2000} % Proceedings of the 38th Annual Meeting of the Association for Computational Linguistics. Pages 85-92. @Article{Adwait:2002, author = {Adwait Ratnaparkhi}, title = "Trainable approaches to surface natural language generation and their application to conversational dialog systems", journal = {Computer, Speech \& Language}, volume = 16, number = {3/4}, year = 2002, pages = {435--455} } @Article{Oh/Rudnicky:2002, author = {Alice H. Oh and Alexander I. Rudnicky}, title = "Stochastic natural language generation for spoken dialog systems", journal = {Computer, Speech \& Language}, volume = 16, number = {3/4}, year = 2002, pages = {387--407} } @InProceedings{Carsten-Alignment:2005, author = {Carsten Brockmann and Amy Isard and Jon Oberlander and Michael White}, title = "Variable alignment in affective dialogue", booktitle = {Proc.\ UM-05 Workshop on Affective Dialogue Systems}, year = 2005} @Unpublished{COMIC-D7.4:2004, author = {Michael White}, title = "Experiments with Multimodal Output in Human-Machine Interaction", note = {IST Project COMIC Public Deliverable 7.4}, year = 2004} %\texttt{http://www.hcrc.ed.ac.uk/comic/documents/deliverables/D7.4-final.pdf} @InProceedings{Foster-White-NLPXML:2004, author = {Mary Ellen Foster and Michael White}, title = "{Techniques for Text Planning with XSLT}", booktitle = {Proc.\ 4th NLPXML Workshop}, year = 2004} ================================================ FILE: docs/style.css ================================================ body{ font-family: Arial, Helvetica, sans-serif; size: 12pt; background: white; color: black; } h1{ font-family: Verdana, Arial, Helvetica, sans-serif; font-size: 18pt; color: #006699; font-weight: italic; text-align: center; } h2{ font-family: Verdana, Arial, Helvetica, sans-serif; font-size: 16pt; font-weight: bold; color: #006699; } h3{ font-family: Verdana, Arial, Helvetica, sans-serif; font-size: 10pt; color: #006699; text-align: right; } a{ font-family: arial,times; } p{ font-family: Verdana, Arial, Helvetica, sans-serif; color: #000000; font-size: 12pt; } table{ border: 0; margin: 0; } td{ font-family: Bookman,Lucida, Helvetica,arial, times; font-size: 12pt; } td.header1{ font-family: Bookman,Lucida, Helvetica,arial, times; font-size: 12pt; background: #52A0EF; } td.banner{ font-family: Bookman,Lucida, Helvetica,arial, times; font-size: 12pt; background: white; } td.header2{ font-family: Bookman,Lucida, Helvetica,arial, times; font-size: 12pt; background: #52A0EF; } ================================================ FILE: docs/taggers-README ================================================ The OpenCCG POS- and supertaggers can be used off-the-shelf, as well as with the OpenCCG parser. This README describes how to train and use the taggers off-the-shelf; for their use with the OpenCCG parser, see ccgbank-README. The training process has been implemented in an ant build file, so it's a bit easier now. Before you get started, you'll need to configure your environment variables as described in the main README (ie, $OPENCCG_HOME/README). After that, you'll need to install SRILM and Zhang Le's maxent toolkit, as described below. 
Note that this should be relatively straightforward on Linux, but potentially difficult on other platforms.

Once you've taken care of these prerequisites, cd into $OPENCCG_HOME/ccgbank, and then you can train the POS- and supertaggers as follows:

  $ cd $OPENCCG_HOME/ccgbank
  $ ccg-build -f build-original.xml &> logs/log.original &

Note that ccg-build is simply a front-end to ant that sets some environment variables, so you can pass through ant flags such as -f, which indicates that the build file to use is build-original.xml (rather than the default, build.xml).

The training will take a while; you can check the progress by peeking at the log file (logs/log.original), as well as the log files for individual steps in the process, which you can see by looking in the build-original.xml file.

Once training is done, you can try it out as follows:

  $ ccg-build -f build-original.xml test &> logs/log.original.test &

This task will test the POS- and supertaggers on the dev section, and should only take a matter of minutes. Note that you will likely have to increase the Java memory limit, if you haven't already done so; this can be done by choosing a higher value for JAVA_MEM at the end of the script $OPENCCG_HOME/bin/ccg-env, which is invoked by all the other OpenCCG scripts (including ccg-build).

The supertagger output file will be in the following format (with each sentence bracketed by ...):

  [tab][tab]...[tab]...[tab][tab]...[tab]...
  ...
  [tab][tab]...[tab]...[tab][tab]...[tab]...

Adjust the '-beta' option in the build file's test-st-model target to tag at greater tagging ambiguity levels.

Have fun, and do let us know if anything in these instructions is flawed.

-----------------
REQUIRED SOFTWARE
-----------------

To train the models, you'll need to have the SRILM command-line tools (or some in-house stand-in) and Zhang Le's maxent toolkit working. Install SRILM as per the directions, and do the following to install the patched version of Zhang Le's toolkit:

  $ cd
  $ wget http://homepages.inf.ed.ac.uk/lzhang10/software/maxent/maxent-20061005.tar.bz2

Unpack and patch the maxent.cpp file (it doesn't cover the case where ':' can be part of the feature symbol itself, and not just a delimiter that separates string repr's of features from their real-valued activations):

  $ bunzip2 maxent-20061005.tar.bz2
  $ tar xf maxent-20061005.tar
  $ cd maxent-20061005/src
  $ patch maxent.cpp $OPENCCG_HOME/docs/maxent.cpp.patch

Now compile the maxent code:

  $ cd
  $ cd maxent-20061005
  $ make clean all unittest

Test to make sure it (more or less) works (I always only pass 7 of the 8 tests, but the training seems to work):

  $ cd test
  $ ./runall.py

Finally, add the 'maxent' binary (under 'maxent-20061005/src/opt') to your PATH environment variable.

Good. Now we're ready to train some taggers!

----------------
NOTES
------------------------

All the taggers in OpenCCG perform forward-backward tagging. To simplify the implementation (and to take advantage of arbitrarily long n-gram histories of tags) we take a hybrid approach, simply multiplying the non-sequence-aware maxent tagging model with a SRILM-trained (i.e., ARPA-formatted) model of tag sequences. This was the alternative to the orthodox MEMM (Maximum Entropy Markov Model) approach suggested in (McCallum, et al., 2000, section 2.6). Otherwise, the approach closely follows that of Curran, Clark and Vadas (2006), including the use of beta-best POS tags as features.

The POS- and supertaggers can make use of a prior model instead of a tag dictionary.
The idea is to train a prior model to give probabilistic features to a downstream maxent model and let it sort it out, rather than using tagging dictionaries (which use crude frequency cut-offs to determine which tags a word may be assigned). Unfortunately, empirical testing of this idea has been inconclusive, so it's been put on the back burner (by default, a prior model is used with the POS tagger but not the supertagger); it remains for future work to better test the idea, and in particular, whether the prior model feature approach can take better advantage of self-trained data by just re-training the prior model on such data. ================================================ FILE: grammars/add-chunks.xsl ================================================ ================================================ FILE: grammars/add-family-members.xsl ================================================ Warning, no family with name ' ' found for entry with stem ' '. Warning, no entry with name ' ' found in family with name ' ' for entry with stem ' '. ================================================ FILE: grammars/append.xsl ================================================ ================================================ FILE: grammars/categories.xsd ================================================ Elements and types for categories and feature structures. By and large, there are no constraints on the names used in defining categories, with just a few exceptions: - '[*DEFAULT*]' proposition or feature value: this reserved value is replaced with the predicate associated with a lexical item (defaulting to the stem) when instantiating the lexical categories associated with a word - 'index' feature: this feature receives special treatment in the realizer; see Mike and Jason's paper on the realizer for details, at http://www.iccs.informatics.ed.ac.uk/~mwhite/White-Baldridge-ENLG-2003-to-appear.pdf - lists: lists are defined with the 'First' and 'Last' relations, which is (optionally) converted to just the 'List' relation by simplify-lists.xsl; elements in the list are represented using nodes with the 'elem' predicate, together with 'Item' and 'Next' relations to point to the actual list item and the next element, respectively - tuples: pairs (for argument clusters and gapping) are represented using nodes with the 'tup' (for tuple) predicate, together with 'Item1' and 'Item2' relations to point to the paired items (in principle, further 'ItemN' relations could be used for tuples of length greater than 2); paired items receive special treatment in the realizer - 'BoundVar' relation: this relation indicates that the subordinate nominal is a bound var; as a result, feature-based instantiation is disabled in the realizer - 'mark' feature: this semantic attribute indicates that the realizer should label the phrase in the XML output headed by the index associated with this feature A category, either atomic or complex. An atomic category. Atomic categories must have a type, and may contain a feature structure and LF. The possible LF predications are given by the "hldsPreds" type in hlds.xsd. The type of the category, e.g. "np". A complex category, consisting of a target category, an argument stack, and an optional LF. Since a complex cat is not curried, the target category will always be atomic. An argument stack is one or more basic args, dollar args or set args. A simple, non-recursive feature structure, consisting of a set of features, i.e. attribute-value pairs. 
If there is only a single, string-valued feature, it can be specified using the "attr" and "val" attributes on this element. An integer id for the feature structure. The id is used for coindexation, "inheritsFrom" feature propagation, and macro access. This attribute is used to specify feature propagation with exceptions (i.e., default unification). At run time, feature equations are added in order to propagate feature values. In particular, a feature variable is added to this feature structure and a corresponding one is added to the referenced feature structure, for all appropriate attributes except those with explicit values already on this feature structure. The appropriate attributes are determined by the type of the atomic category, and consist of all the attributes seen with this category type in the lexicon file. NB: Attributes appearing in macros in the morph file are not included, since their association with category types is not available statically. The name of the single, string-valued feature (when appropriate). The value of the feature. A feature, i.e. an attribute-value pair. Features can be either syntactic or semantic (LF-valued). Syntactic features can be either ground or variable. If ground, they must be string-valued, and specified via the "val" attribute; if variable, the value is a variable named by a "featvar" element. Semantic features can also be either ground or variable; their possible values are given by the "hldsFeatVals" type in hlds.xsd. The name of the feature. The value of the feature, when string-valued. A variable over syntactic feature values. The name of the feature variable. A basic arg pairs a slash and a category. A dollar arg pairs an optional slash and a dollar variable. The slash defaults to the most general slash. A dollar variable in the definition of a complex category, i.e. a variable over any sublist of args in an arg stack. The name of the dollar variable, for coindexation purposes. A set arg, i.e. an unordered set of basic args. A slash in the definition of a complex category. A slash has a direction, a mode and an ability; a variable can also be given for the mode, using the "varmodality" attribute. For discussion, see Jason's dissertation, esp. Ch. 8 on the implementation; the dissertation is downloadable from http://www.iccs.inf.ed.ac.uk/~jmb/dissertation The direction of the slash. The direction can be forward (/), backward (\) or both (|). Defaults to both. The mode of the slash. The possible values are: all (.), application only (*), associative (^), permutative (x), permutative right (x>), permutative left (<x), associative permutative right (>), and associative permutative left (<). Defaults to all. See Jason's dissertation for details. A variable over modalities. The ability of the slash, either inert or active. Defaults to either. This is used to implement antecedent government; see Jason's dissertation for details. 
================================================ FILE: grammars/comic/build.xml ================================================ ================================================ FILE: grammars/comic/dict.xml ================================================ ================================================ FILE: grammars/comic/dict.xsl ================================================ ================================================ FILE: grammars/comic/grammar.xml ================================================ ================================================ FILE: grammars/comic/lexicon-base.xsl ================================================ Creator Source ================================================ FILE: grammars/comic/lexicon.xml ================================================ ================================================ FILE: grammars/comic/morph.xml ================================================ ================================================ FILE: grammars/comic/rules-base.xml ================================================ ================================================ FILE: grammars/comic/rules.xml ================================================ ================================================ FILE: grammars/comic/testbed.xml ================================================ Describing the style. Manufacturer and series. Decorative tiles. Appositive 'with' now uses an elab-rel. Colour scheme. Added pitch accent onto canned text bits. Combining things. The shared argument (the colours) no longer has to be separated out, since the VPing is now an appositive. NB: The relative clause 'that features ...' is now an appositive, and has theme status here; this would be appropriate in the context of the question 'do you have any more designs with blue in the colour scheme?'. Transitions. Further options (in thumbnails). NB: this clause from the sample dialogue gets its own sentence. Questions. Changed pitch accent and phrase to theme, for clarification context. Changed pitch accent and phrase to theme, for clarification context. User tailoring. 
================================================ FILE: grammars/comic/types-extras.xml ================================================ ================================================ FILE: grammars/comic/types.xml ================================================ ================================================ FILE: grammars/convert-lists.xsl ================================================ ================================================ FILE: grammars/convert-to-graph.xsl ================================================ ================================================ FILE: grammars/convert-to-hlds.xsl ================================================ Error: node with id = should be a reference (with idref) ================================================ FILE: grammars/core-en/add-chunks.xsl ================================================ ================================================ FILE: grammars/core-en/add-intonation-info.xsl ================================================ ' ================================================ FILE: grammars/core-en/adj.xsl ================================================ ================================================ FILE: grammars/core-en/adv.xsl ================================================ ================================================ FILE: grammars/core-en/auxv.xsl ================================================ ================================================ FILE: grammars/core-en/cats.xsl ================================================ ================================================ FILE: grammars/core-en/conj.xsl ================================================ ================================================ FILE: grammars/core-en/derive-features.xsl ================================================ rh h s - ================================================ FILE: grammars/core-en/det.xsl ================================================ ================================================ FILE: grammars/core-en/dict.xsl ================================================ ================================================ FILE: grammars/core-en/drop-features.xsl ================================================ ================================================ FILE: grammars/core-en/lexicon.xsl ================================================ ================================================ FILE: grammars/core-en/misc.xsl ================================================ ================================================ FILE: grammars/core-en/np.xsl ================================================ ================================================ FILE: grammars/core-en/pp.xsl ================================================ Poss ================================================ FILE: grammars/core-en/punct.xsl ================================================ ================================================ FILE: grammars/core-en/raise-nodes.xsl ================================================ Arg1 ================================================ FILE: grammars/core-en/templates.xsl ================================================ ================================================ FILE: grammars/core-en/types.xml ================================================ ================================================ FILE: grammars/core-en/unary-rules.xsl ================================================ ================================================ FILE: grammars/core-en/v.xsl ================================================ 
================================================ FILE: grammars/dict.xsd ================================================ Schema for dict file. A dict file is a more refined version of a morph file that groups word forms by their stems (and parts of speech) and lists the mappings to lexical categories. See the grammar build targets in build.xml for how a dict file can be straightforwardly transformed into the morph.xml and lexicon.xml files required by the run-time system. See morph.xsd for more information on the mapping between morph items and lexical categories, and on the macros (a minimal example entry is sketched below). The root element, containing a list of dictionary entries and macros. A dictionary entry includes any number of 'member-of' and 'stem-for' mapping elements, followed by any number of 'word' elements for different word forms. The stem and part of speech (pos) attributes are required. If there is just a single word form, it can be specified using the 'word' attribute, with the stem as the default. The predicate may also be given with the 'pred' attribute, if it differs from the stem. Any macros that apply to all of the entry's word forms may also be listed. See morph.xsd for info on the (semantic) class and coart(iculation) attributes. Specifies a mapping from the entry to a family of lexical categories. The 'family' attribute should provide the name of a category family. The 'pred' attribute may be used to specify a different predicate to use with this family. Specifies a mapping from the entry to a particular entry in a family of lexical categories. The 'family' and 'entry' attributes should provide the names of a category family and entry therein. A specific word form, given by the 'form' attribute. Any macros that are particular to the word form may be listed. Any lexical category families or entries whose mapping should be blocked may also be listed. ================================================ FILE: grammars/extract-morph.xsl ================================================ ================================================ FILE: grammars/flights/build.xml ================================================ ================================================ FILE: grammars/flights/dict.xml ================================================ ================================================ FILE: grammars/flights/dict.xsl ================================================ ================================================ FILE: grammars/flights/flairs.xml ================================================ Student example. To make "on Ryanair" into a predicative adjective, so that it can serve as the propositional complement of the copula, a dummy 'has-rel' head is introduced. Frequent Flyer example. Business Class example. Frequent Flyer variant, with preferred airline second (still a bit awkward). Some extra test cases.
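Returning to the dict.xsd schema described above: the following is a rough, hypothetical entry of the kind that schema licenses. The 'entry', 'member-of' and 'word' element names and the stem/pos/pred/form attributes come from the prose of the schema description, but the root element name, the family name 'IntransV', the macro names, and the assumption that macros are listed in a 'macros' attribute are illustrative guesses rather than a definitive example from a shipped grammar.

  <dictionary>
    <!-- hypothetical verb entry with two inflected forms; names are illustrative only -->
    <entry stem="sleep" pos="V">
      <member-of family="IntransV"/>
      <word form="sleeps" macros="@sg @pres"/>
      <word form="slept" macros="@past"/>
    </entry>
  </dictionary>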
================================================ FILE: grammars/flights/grammar.xml ================================================ ================================================ FILE: grammars/flights/lexicon-base.xsl ================================================ Airline airline ================================================ FILE: grammars/flights/lexicon.xml ================================================ ================================================ FILE: grammars/flights/morph.xml ================================================ ================================================ FILE: grammars/flights/nina107.xml ================================================ skipping to nina107_033 TBD: nina107_040.wav From where? nina107_041.wav And from where? ================================================ FILE: grammars/flights/rules-base.xml ================================================ ================================================ FILE: grammars/flights/rules.xml ================================================ ================================================ FILE: grammars/flights/testbed.xml ================================================ ================================================ FILE: grammars/flights/types-extras.xml ================================================ ================================================ FILE: grammars/flights/types.xml ================================================ ================================================ FILE: grammars/flights/vera.xml ================================================ ================================================ FILE: grammars/grammar.xsd ================================================ Schema for grammar file. A grammar file lists the lexicon, morphology and rules files, and optionally lists the types file and sequences of XSLT transformations to use in loading/saving LFs from/to XML. It may also specify a custom list of features to use with supertag names, a custom tokenizer, and/or list semantic classes to replace words with for language models. The root element. An element with a file attribute specifying a filename. An element with a file attribute specifying a filename for the lexicon file, plus an optional openlex attribute indicating whether lexical category assignments are complete (for openlex false) or incomplete (for openlex true). An element with a file attribute specifying a filename for the rules file. An optional combosfile attribute gives the filename for the list of observed supercat-rule combos. The format of the combos file is one combo per line, where each combo is either a supertag and a rule name, or a pair of supertags and a rule name, separated by single spaces. The optional dynamic-combos flag indicates whether the observed combos should be determined dynamically (using the ones in the combos file as a starting point, if present). By default, dynamic-combos is true if a combos file is given, and false if not. An element which gives the features to use in constructing supertag names. The feature names are given as a space-delimited list in the feats attribute. An element which gives the fully qualified class name for a custom tokenizer, and/or lists semantic classes to replace words with for language models. Specifies a sequence of XSLT transformations. ================================================ FILE: grammars/hlds.xsd ================================================ Elements and types for hybrid logic dependency semantics (HLDS) constructs. 
The subset of hybrid logic terms used in HLDS is defined here. (The Java classes currently support a slightly larger subset.) To support flattening of these terms into elementary predications, the contents of the satisfaction operators and diamond relations (modal ops) are restricted, as described in detail below. Note that where a list of terms is allowed, these are implicitly conjoined; a conjunction operator is inserted upon loading, when necessary. LFs with disjunctive or optionality operators may also be specified. Nominals and variables may be given a type (or sort) listed in the types file, by appending the type to the name after a colon. Propositions whose names appear in the types file are also treated as typed. Typically one or more satisfaction operators, encoding the semantics associated with a category. Exclusive disjunctions (xor) are also allowed. The possible values of an LF feature, i.e., a nominal (atom or variable), proposition, or variable (over HLDS terms). A satisfaction operator, with a nominal and an arg. The nominal can either be a nominal atom, named by the "nom" attribute, or a nominal variable, named by the "nomvar" attribute. The nominal may be optionally marked as "shared" (see nominal def below). The arg optionally begins with a proposition, followed by any number of diamond relations or disjunctive/optionality operators, but must be non-empty. A named proposition. A diamond relation (modal operator), with a mode and an arg. The "mode" attribute names the relation. The arg is either just a proposition or a variable, or a nominal followed optionally by a proposition and any number of nested diamond relations or disjunctive/optionality operators. In the former case, the relation encodes a semantic feature; in the latter case, it encodes a relation to another semantic head. The arg may also be a disjunctive operator containing satops. A conjunction (conj), exclusive disjunction (xor) or optionality (opt) operator. Note that conjunctions are usually left implicit. A nominal term, either an atom or a variable. The "shared" attribute indicates that the nominal term provides a reference to a node that is a shared part of multiple alternatives, rather than being a subordinated or coordinated reference. A nominal atom. A nominal variable. A variable over HLDS terms. A term with a name. A named term with an optional "shared" attribute. ================================================ FILE: grammars/lexicon.xsd ================================================ Schema for lexicon file. The lexicon file contains the definitions of the lexical categories, whose entries are grouped into families (inspired by XTAG tree families). Families may be open or closed. With open families, all words listed in the morph file with the same part of speech are mapped to all entries in the family. With closed families, the members of the family must be explicitly listed. The lexicon may specify that certain features should be treated specially, either for licensing and instantiating semantically null or "marked" categories in the realizer, or for implementing "distributive" behavior. See below for further details. The root element, which includes the list of category families and the declarations of the licensing and distributive attributes. Lists the names of the attributes with "distributive" behavior, i.e. ones where the feature value is supposed to show up on every atomic category. 
Attributes that are distributive in this sense can be used to prevent phrasal combinations across theme/rheme boundaries, following Steedman's '00 LI paper. Distributive attributes receive two kinds of special processing. First, after instantiating a lexical category for a word, if a distributive attribute has a unique value, it is automatically spread to all of the atomic categories, thereby alleviating the need to ensure this happens in the definition of the lexical categories and macros. Second, as a stopgap measure, following rule combinations, the value of any distributive attribute appearing on the result category is propagated to all argument categories, overwriting any values that may already be there. This processing allows categories such as s[eme=phr]$\s[eme=th]$ to be used for boundary tones, where the value of the distributive 'eme' feature is changed on the target 's' category (in the lexicon), but the value of this attribute on the $-matched categories remains unchanged until the special processing kicks in. This stopgap measure would not be necessary if the 'inheritsFrom' and $ capabilities could be combined, which would require a more dynamic treatment of 'inheritsFrom' (i.e. exceptive feature propagation, or default unification). A space-separated list of attribute names. Lists the features to use to license and/or instantiate semantically null or "marked" categories in the realizer, in priority order. Examples of semantically empty categories may include case-marking prepositions or particles, infinitival 'to', or complementizers such as 'that'. Examples of categories which may be considered "marked" include inverting categories for auxiliaries which are used in questions, but not ordinary declaratives. Note that by default, the 'lex' feature is included as one used to license and instantiate semantically empty categories, for all values, in all locations. Thus, if (say) a semantically null, case-marking preposition category has a 'lex' attribute which is instantiated with the stem (via the '[*DEFAULT*]' mechanism), then this category will be activated in the realizer iff any of the categories instantiated for the input logical form have the same value for the 'lex' feature, thereby indicating its relevance. If an alternative specification for the 'lex' feature is listed, the default specification is not included. The name of the licensing feature, i.e. the attribute. The value of the feature. If not specified, all attribute values are treated the same for licensing and instantiation purposes. A list of other values of this feature whose presence may also license categories with this feature value. A flag indicating whether semantically null categories with the licensing feature need to be licensed. Defaults to true. A flag indicating whether initial categories with the licensing feature are marked and need to be licensed. Defaults to false. If set to true, the defaults for the license-empty-cats and instantiate flags change to false, and the default for the location changes to target-only. A flag indicating whether semantically empty categories with the licensing feature should be instantiated. Defaults to true. The location of the licensing feature on the category to be licensed. If target-only, then the value must appear only on the target category. If args-only, then the value must appear only on the argument categories. Defaults to both. Optionally specifies the order in which to sort relations in the logical forms.
The default sort order is as follows, where "*" indicates all other relations (sorted alphabetically): "BoundVar", "PairedWith", "Restr", "Body", "Scope", "*", "GenRel", "Coord", "Append". The default order may be overridden using the 'order' attribute, which should contain a space-separated list of relation names, with "*" indicating all other relations. A space-separated list of relation names. A category family, which groups lexical category entries, and optionally includes a list of the family members, if the family is closed. Family members are listed by their stems. A 'member' element may also have a predicate ('pred') explicitly specified, if it differs from the stem. The predicate is used to fill in any propositions in the semantics with the reserved name '[*DEFAULT*]'. The name of the family must be unique. The part of speech ('pos') is used in the mapping between morphological items and families, as described earlier. A family may also declare a relation used for indexing purposes (the 'indexRel') in the realizer, i.e. a relation used to look up words based on relational or featural elementary predications in the input logical form; see Mike and Jason's paper on the realizer for discussion, at http://www.iccs.informatics.ed.ac.uk/~mwhite/White-Baldridge-ENLG-2003-to-appear.pdf. Families with semantically null entries are indicated using the reserved value '*NoSem*'. Relations may also be declared at the level of entries. NB: It would be nicer if the system could figure out which entries were semantically null; what makes this non-trivial is that macros can introduce additional semantic predications. At present, this is only discovered when lexical categories are instantiated for specific words. As an alternative to the indexRel, a family may declare a relation (semantic feature) to use to signal a coarticulation, via the 'coartRel' attribute. For example, the coart rel "kon" may be used to signal a contrast feature that triggers the addition of a pitch accent. Coarticulations are handled as lexical modifiers, and as such should be given modifier categories of the form X|X. An entry for a family, which contains the definition of a lexical category, and must have a name which is unique within the family. If there is only one stem that is applicable to this entry, it may be specified on this element. A relation to use for indexing purposes in realization may be given for this entry (if different from other entries in the family). It is possible to temporarily disable an entry by setting the 'active' attribute to false (defaults to true).
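As a minimal illustration of the family/entry structure just described (and of the satop/diamond notation from hlds.xsd), here is a hedged sketch of a closed intransitive-verb family. The element names (family, entry, complexcat, atomcat, slash, fs, feat, lf, satop, diamond, prop, nomvar) follow the conventions seen in the shipped grammars, but the family name, feature ids, the 'Arg1' mode and the linking of feature-structure indices to LF nominals are simplified assumptions, not a definitive entry:

  <family name="IntransV" pos="V" closed="true">
    <entry name="Primary">
      <!-- s\np, with the subject required to be nominative; ids and features are illustrative -->
      <complexcat>
        <atomcat type="s">
          <fs id="1"/>
        </atomcat>
        <slash dir="\"/>
        <atomcat type="np">
          <fs id="2">
            <feat attr="case" val="nom"/>
          </fs>
        </atomcat>
      </complexcat>
      <!-- the proposition [*DEFAULT*] is filled in with the member's pred (or stem) -->
      <lf>
        <satop nomvar="E">
          <prop name="[*DEFAULT*]"/>
          <diamond mode="Arg1">
            <nomvar name="X"/>
          </diamond>
        </satop>
      </lf>
    </entry>
    <member stem="sleep"/>
  </family>

With a closed family like this, only morph items whose stem is listed as a member (and whose part of speech matches) are mapped to the entry; an open family would instead pick up every morph item with the matching part of speech, as described under morph.xsd below.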
================================================ FILE: grammars/mini-basque/build.xml ================================================ ================================================ FILE: grammars/mini-basque/dict.xml ================================================ ================================================ FILE: grammars/mini-basque/grammar.xml ================================================ ================================================ FILE: grammars/mini-basque/lexicon-base.xml ================================================ ================================================ FILE: grammars/mini-basque/lexicon.xml ================================================ ================================================ FILE: grammars/mini-basque/morph.xml ================================================ ================================================ FILE: grammars/mini-basque/parameters.xml ================================================ ================================================ FILE: grammars/mini-basque/preset-families.xml ================================================ ================================================ FILE: grammars/mini-basque/rules.xml ================================================ ================================================ FILE: grammars/mini-basque/testbed.out ================================================ Loading grammar from URL: file:/home/bozsahin/openccg/grammars/mini-basque/grammar.xml Parse Realize String ----- ------- ------ ok - joan ok - ikusi ok - zuen ok - zen ok - klase ra ok - eskola n ok - ama aitak ikusi zuen ok - ama joan ok - *ama k joan ok - *ama k joan du (2) - ama k seme a ikusi du (2) - seme a ama k ikusi du (2) - nik liburu a emakume ari eman nahi dut FAILED - *nik liburu a aita k eman nahi dut FAILED - *nik aita k liburu a eman nahi dut ok - ama k seme a emakume ari eman dio ok - seme a ama k emakume ari eman dio ok - emakume ari seme a ama k eman dio ok - nik joan nahi dut (2) - nik kafe a egin nahi dut ok - *nik kafe a egin nahi du ok - nik kafe a egin nahi FAILED - *nik aita k ikusi nahi dut ok - nik ikusi nahi dud an gizon a joan da ok - gizon a ikusi nahi dud an seme a joan da ok - emakume ari liburu a eman dio n gizon a joan du FAILED - emakume ari liburu a eman dio n gizon a nik ikusi dut ok - *emakume ari liburu a eman dio n gizon ak ni ikusi dut ok - emakume ari liburu a eman dio n gizon ak ni ikusi (2) - gizon ak emakume ari eman dio n liburu a aita k ikusi du (3) - gizon ak liburu a eman dio n emakume a ok - gizon ak liburu a eman dio n emakume a joan du (3) - aita k seme a eskola n utzi zuen eta klase ra joan zen (3) - seme a eskola n utzi zuen eta klase ra joan zen (3) - seme a eskola n utzi eta klase ra joan zen ok - *seme a eskola n utzi eta klase ra joan zuen ok - ama joan zen eta aita k ikusi zuen ok - aita k ikusi zuen eta klase ra joan zen ok - *ama k joan eta aita ikusi zuen ok - aitak ama ikusi zuen eta joan zen ok - ama aitak ikusi zuen eta joan zen ok - joan zen eta ama ikusi zuen ok - joan zen eta aitak ikusi zuen FAILED - sagarrak emakume ak egosten ditu eta gizonak jaten ditu ok - sagarrak emakume ak egosten ditu eta usteltzen dira ok - sagarrak usteltzen dira eta emakume ak egosten ditu ================================================ FILE: grammars/mini-basque/testbed.xml ================================================ ================================================ FILE: grammars/mini-basque/types.xml ================================================ 
================================================ FILE: grammars/mini-dyirbal/build.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/dict.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/grammar.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/lexicon-base.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/lexicon.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/morph.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/parameters.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/preset-families.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/rules.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/testbed.out ================================================ Loading grammar from URL: file:/home/bozsahin/openccg/grammars/mini-dyirbal/grammar.xml Parse Realize String ----- ------- ------ ok - bayi yara walngarra nabaygu ok - bayi yara walngarra bural naygu bagun yibi gu ok - bayi yara walngarra bangun yibi ngu burali ok - yabu numa ngu gigan banagaygu ok - naja bayi yara gigan gubi ngu mawali ok - *naja bayi yara gigan bayi gubi mawali ok - bayi yara miyanda nu yanu ok - balan yibi bangul yara ngu miyanda nu ru buran ok - bayi yara jilwal na nu bagun guda gu yanu ok - bayi burrbula bangul gubi ngu baran , bajigu FAILED - *bangul gubi ngu bayi burrbula baran , bajigu ok - bayi yara yanu , bangun yibi ngu buran ok - nyurra nanana buran , banaganyu ================================================ FILE: grammars/mini-dyirbal/testbed.xml ================================================ ================================================ FILE: grammars/mini-dyirbal/types.xml ================================================ ================================================ FILE: grammars/mini-english/build.xml ================================================ ================================================ FILE: grammars/mini-english/dict.xml ================================================ ================================================ FILE: grammars/mini-english/grammar.xml ================================================ ================================================ FILE: grammars/mini-english/lexicon-base.xml ================================================ ================================================ FILE: grammars/mini-english/lexicon.xml ================================================ ================================================ FILE: grammars/mini-english/morph.xml ================================================ ================================================ FILE: grammars/mini-english/parameters.xml ================================================ ================================================ FILE: grammars/mini-english/preset-families.xml ================================================ ================================================ FILE: 
grammars/mini-english/rules.xml ================================================ ================================================ FILE: grammars/mini-english/testbed.out ================================================ Loading grammar from URL: file:/home/bozsahin/openccg/grammars/mini-english/grammar.xml Parse Realize String ----- ------- ------ ok - John promised him to exercise ok - John persuaded him to read the book ok - *John promised him Sue see ok - John gave Mary and read a book ok - the doctor punched the patient and fell-down ================================================ FILE: grammars/mini-english/testbed.xml ================================================ ================================================ FILE: grammars/mini-english/types.xml ================================================ ================================================ FILE: grammars/mini-inuit/build.xml ================================================ ================================================ FILE: grammars/mini-inuit/dict.xml ================================================ ================================================ FILE: grammars/mini-inuit/grammar.xml ================================================ ================================================ FILE: grammars/mini-inuit/lexicon-base.xml ================================================ ================================================ FILE: grammars/mini-inuit/lexicon.xml ================================================ ================================================ FILE: grammars/mini-inuit/morph.xml ================================================ ================================================ FILE: grammars/mini-inuit/parameters.xml ================================================ ================================================ FILE: grammars/mini-inuit/preset-families.xml ================================================ ================================================ FILE: grammars/mini-inuit/rules.xml ================================================ ================================================ FILE: grammars/mini-inuit/testbed.out ================================================ Loading grammar from URL: file:/home/bozsahin/openccg/grammars/mini-inuit/grammar.xml Parse Realize String ----- ------- ------ ok - miiqqat Juuna ikiussallugu niriursuipput ok - miiqqat qitissallutik niriursuipput ok - nanuq Piita p tugu taa ok - miiraq kamat tuq ================================================ FILE: grammars/mini-inuit/testbed.xml ================================================ ================================================ FILE: grammars/mini-inuit/types.xml ================================================ ================================================ FILE: grammars/mini-nezperce/build.xml ================================================ ================================================ FILE: grammars/mini-nezperce/parameters.xml ================================================ ================================================ FILE: grammars/mini-nezperce/preset-families.xml ================================================ ================================================ FILE: grammars/mini-nezperce/types.xml ================================================ ================================================ FILE: grammars/mini-tagalog/build.xml ================================================ ================================================ FILE: grammars/mini-tagalog/dict.xml ================================================ 
================================================ FILE: grammars/mini-tagalog/grammar.xml ================================================ ================================================ FILE: grammars/mini-tagalog/lexicon-base.xml ================================================ ================================================ FILE: grammars/mini-tagalog/lexicon.xml ================================================ ================================================ FILE: grammars/mini-tagalog/morph.xml ================================================ ================================================ FILE: grammars/mini-tagalog/parameters.xml ================================================ ================================================ FILE: grammars/mini-tagalog/preset-families.xml ================================================ ================================================ FILE: grammars/mini-tagalog/rules.xml ================================================ ================================================ FILE: grammars/mini-tagalog/testbed.out ================================================ Loading grammar from URL: file:/home/bozsahin/openccg/grammars/mini-tagalog/grammar.xml Parse Realize String ----- ------- ------ (3) - iniwasan ko ng tumingin kay Lorna ok - nagatubili siya ng humiram ng pera sa bangko ok - nagatubili siya ng hiramin ang pera sa bangko (2) - nagatubili siya ng hiraman ng pera ang bangko ok - binawalan ko si Maria ng awitin ang pera (3) - ang babae ng bumili ng baro (3) - ang baro ng binili ng babae (2) - huhugasan ko at pupunasan mo ang mgapinggan (2) - niluto ang pagkain at hinugasan ang mgapinggan ni Josie (4) - nanghuhuli ang ama at nagtitinda ang ina ng isda (2) - nagbigay ng regalo si Maria at nagpadala ng liham ang mgabat a kay Juan (5) - pumunta sa tindahan at bumili ang kapatid ng bigas ================================================ FILE: grammars/mini-tagalog/testbed.xml ================================================ ================================================ FILE: grammars/mini-tagalog/types.xml ================================================ ================================================ FILE: grammars/mini-turkish/build.xml ================================================ ================================================ FILE: grammars/mini-turkish/dict.xml ================================================ ================================================ FILE: grammars/mini-turkish/grammar.xml ================================================ ================================================ FILE: grammars/mini-turkish/lexicon-base.xml ================================================ ================================================ FILE: grammars/mini-turkish/lexicon.xml ================================================ ================================================ FILE: grammars/mini-turkish/morph.xml ================================================ ================================================ FILE: grammars/mini-turkish/parameters.xml ================================================ ================================================ FILE: grammars/mini-turkish/preset-families.xml ================================================ ================================================ FILE: grammars/mini-turkish/rules.xml ================================================ ================================================ FILE: grammars/mini-turkish/testbed.out ================================================ Loading grammar from URL: 
file:/home/bozsahin/openccg/grammars/mini-turkish/grammar.xml Parse Realize String ----- ------- ------ ok - cocuk kitab i oku ma ya calisti ok - *cocuk kitab i oku ma yi calisti ok - cocuk kitab i oku mak istedi ok - cocuk kitab i oku ma yi istedi ok - *cocuk kitab i oku ma ya istedi ok - *cocuk adam kitab i ver me yi istedi ok - adam cocug u kitab i oku ma ya zorladi ok - *adam cocug u kitab i oku ma yi zorladi ok - cocug u adam kitab i oku ma ya zorladi ok - adam cocug u kitab i kadin a ver me ye zorladi ok - adam cocug u kadin a kitab i ver me ye zorladi ok - cocug u adam kadin a kitab i ver me ye zorladi ok - *adam cocug u kadin kitab i ver me ye zorladi ok - *adam cocug a kadin a kitab i ver me ye zorladi ok - kitab i oku yan adam uyudu ok - cocuk kitab i oku yan adam i gordu ok - adam in gor dugu cocuk uyudu ok - cocuk adam in gor dugu kitab i okudu ok - cocug un kitab i ver digi adam uyudu ok - cocug un adam a ver digi kitap dustu ok - *adam cocug un adam a ver en kadin i gordu ok - *adam adam a kitab i ver digi kadin i gordu ok - adam cocug a carpti ve dustu ok - cocug a adam carpti ve dustu ok - cocug a carpti ve adam dustu ok - kitab i adam cocug a verdi ama kadin almisti ok - cocug a kitab i adam verdi ama kadin almisti ok - dergi ye adam bakti ama cocuk okudu ================================================ FILE: grammars/mini-turkish/testbed.xml ================================================ ================================================ FILE: grammars/mini-turkish/types.xml ================================================ ================================================ FILE: grammars/morph.xsd ================================================ Schema for morph file. A morph file lists all the known word forms (morph items) together with their stems, parts of speech, semantic classes, associated macros, and excluded lexical categories. The part of speech is used to help determine the mapping between morph items and lexical categories. A morph item is automatically associated with all open families with matching parts of speech. With closed families, the morph item's stem must be listed as a family member, and the parts of speech must match. It is also possible to exclude certain lexical category entries or families, by including the entry name, qualified entry name or family name in the morph item's list of excluded lexical categories. NB: A limitation of the current implementation is that the mapping between morph items and open families can only be done with the stem as the predicate; to use different predicates, closed families must be used. Semantic classes may also be given, for n-gram ranking purposes, and for restricting the unification of nominals to compatible types, specified in the types file. When a category is instantiated, the semantic class is assigned to the nominal var(s) for the proposition with the reserved name '[*DEFAULT*]'. The types of all nominal vars are then propagated to all other nominal vars with the same name, throughout the category. The macros are used to add features or semantic predications to a lexical category that depend on the particular morph item (e.g. tense, number, case, etc.). The root element, containing a list of entries (morph items) and macros. Each macro must have a unique name. A morph item, with its word form, stem, part of speech, and optional semantic class, associated macros, and excluded lexical categories. The stem defaults to word form. 
At run time, when the grammar is loaded, a cross-reference check is performed on the names in the lists of associated macros and excluded lexical categories. The coart flag indicates that this entry is a coarticulation, e.g. a pitch accent, gesture, or other word-associated element. With coarticulations, the word form should be one or more attribute-value pairs, with attributes and values separated by hyphens, and multiple pairs separated by colons. When multiple attribute-value pairs are given, only the first one is used for indexing purposes. An identifying value for the stem should also be given, e.g. *accent* for a pitch accent. NB: Lexical attributes supplied by coarticulations must be supplied uniformly; that is, they cannot already appear on some lexical items. Also, when there are multiple, independent coarticulations, they must supply disjoint sets of lexical attributes, and their categories must be capable of applying in any order. A named macro, with a set of features and/or semantic predications to add to a lexical category. The features are given by feature structure declarations, where the id is used to indicate where the features are to be added (i.e., to the feature structure of which atomic category). The semantic predications are given in an 'lf' element; at run time, these HLDS predications are flattened and added to any predications already present in the lexical category. Macro names begin with @ (for historical reasons) and are followed by at least one non-whitespace character. (A minimal entry and macro sketch appears below, after the parameters schema.) ================================================ FILE: grammars/parameters.xsd ================================================ Schema for the parameters.xml file. The parameters file contains definitions of the language type, its basic word order and category for basic intransitive verbs, and the word order skeleton for the basic transitive verbs. iv and tv are pre-CCG categories. They refer to the grammatical roles S, A and P, described by the XML tags s-argument, a-argument and p-argument. They are mapped to CCG categories by parametric-lexicon.xsl. Word order can be specified as an ordered/unordered list of arguments. Curried arguments can be described by enclosing them in an arg tag, and a set of arguments in a setarg tag (the latter is also a valid category in openCCG, cf. categories.xsd and lexicon.xsd, but arg is not). For further information and how to start up lexicon design from a skeletal lexicon, see Bozsahin and Steedman (2003) (draft), and the comments generated in the file preset-families.xml by parametric-lexicon.xsl. As usual, the best way to learn is to look at an example file. Parameter files are provided for Basque, Dyirbal, English, Inuit, Tagalog and Turkish (in their respective directories). p-argument and a-argument must not carry case; they are set from the iv specs. s-argument must carry case; the tv category and its grammatical relations are set up from this case and the language type. The direction of the slash. The direction can be forward (/), backward (\) or both (|). What kind of subject is missing from the infinitive (parameter) The parameters must include infinitival, iv and tv specifications. The latter two must have a predicate specification. iv has s-argument, tv has a-argument and p-argument, wrapped in either arg or setarg tags. They can appear in any order (both iv and tv specs, and argument and predicate specs). The controllee must specify the subject type as semantic or syntactic.
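The morph-file sketch forward-referenced under morph.xsd above: one inflected entry pointing at two macros, one contributing a number feature to the feature structure with id 2 and one adding a tense predication via an lf element. The root element name, stem, macro names, feature ids and attribute names are illustrative assumptions following the prose of the schema description, not copied from a shipped grammar:

  <morph>
    <entry word="sleeps" stem="sleep" pos="V" macros="@sg @pres"/>
    <!-- adds num=sg to the feature structure with id 2 of the instantiated category -->
    <macro name="@sg">
      <fs id="2">
        <feat attr="num" val="sg"/>
      </fs>
    </macro>
    <!-- adds a tense predication; flattened and merged with the category's existing predications -->
    <macro name="@pres">
      <lf>
        <satop nomvar="E">
          <diamond mode="tense">
            <prop name="pres"/>
          </diamond>
        </satop>
      </lf>
    </macro>
  </morph>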
================================================ FILE: grammars/parametric-lexicon.xsl ================================================ NB: Word order and directionality of embedded clause's |(S|NP) type is an 'educated guess' from the syntactic type of TV. Change them accordingly. NB: If infinitive type is syntactic subject (which is the default), there is a built-in type in the types.xml file called "subject-case". Include in this type all the subjects that can be controlled, e.g. nominative subjects only (as in German), dative and nominative subjects (as in Malayalam) etc. In this case, the controllee is not semantically restricted. If infinitive type is semantic subject, you need a larger fragment of Hybrid Logic than HLDS uses to implement identity of two event variables e.g. @_e(Arg1 a) ^ @_e(Arg1 b) --> @_a(b) where a is the event variable for the controller verb, and b is the event variable for the controlled verb, and Arg1 is the modality for primary arguments (1s). This constraint is formulable in HL but HLDS does not cover that fragment (yet). When it does, the lf tag should just stick in that constraint. Currently, OpenCCG can generate an LF with TWO non-identical Arg1 modalities in the same event structure; therefore some illicit examples would go through. NB: Index is same as Arg1's NB: Index is same as Arg1's NB: Word order and directionality of embedded clause's |(S|NP) type is an 'educated guess' from the syntactic type of TV. Change them accordingly. NB: If infinitive type is syntactic subject (which is the default), there is a built-in type in the types.xml file called "subject-case". Include in this type all the subjects that can be controlled, e.g. nominative subjects only (as in German), dative and nominative subjects (as in Malayalam) etc. In this case, the controllee is not semantically restricted. If infinitive type is semantic subject, you need a larger fragment of Hybrid Logic than HLDS uses to implement identity of two event variables e.g. @_e(Arg1 a) ^ @_e(Arg1 b) --> @_a(b) where a is the event variable for the controller verb, and b is the event variable for the controlled verb, and Arg1 is the modality for primary arguments (1s). This constraint is formulable in HL but HLDS does not cover that fragment (yet). When it does, the lf tag should just stick in that constraint. Currently, OpenCCG can generate an LF with TWO non-identical Arg1 modalities in the same event structure; therefore some illicit examples would go through. NB: Index is same as Arg2's NB: Index is same as Arg2's - This file is generated by parametric-lexicon.xsl to set up the accusativity/ergativity parameter for IV and TV primary families and control primary families. NB: pre-CCG categories of parameters.xml are mapped to CCG categories in this file. From now on, it's all CCG. Suggestions for starting up lexicon development: 1) Copy this file to lexicon-base.xml to avoid losing your changes to it (remember, this file is auto-generated at the start) 2) Edit lexicon-base.xml to modify the preset families and to add your own families as needed (merging the entries of the same family is left to you) 3) Use the ccg-build facility of openCCG, which uses lexicon-base to build the lexicon.xml, morph.xml and rules.xml files needed by the system. *** Families derived from language parameters *** Includes primary entries for IV (unerg and unacc), basic TV, TV-control1, TV-control2, IV-control1. subject-case, s-case, p-case and a-case are value types that set up ergative-accusative mapping and surface cases of these arguments (cf.
types.xml file). Actual case values for them (e.g. nom for a-case in accusative languages, erg for a-case in ergative languages) are defined in the types.xml file. You can of course refer to actual values since they are types, but if a construction is related to GR mapping (ERG or ACC), it's better to use `subject-case' (which covers a-case and s-case in ACC; p-case and s-case in ERG). Types.xml sets these up from the parameter specification. *** End of derived families *** Add new families here, and merge the new entries for preset families as needed (e.g., you may add an entry to the TV family for pro-dropping the subject etc.) NB: Argument is the PAS of the embedded S|NP (cf. E2 above) NB: Argument is the PAS of the embedded S|NP (cf. E2 above) NB: Argument is the PAS of the embedded S|NP (cf. E2 above) ================================================ FILE: grammars/parametric-types.xsl ================================================ - This file is generated by parametric-types.xsl from parameter specs (parameters.xml) to set up the types.xml file for development. - If the language in question has e.g. quirky subjects etc., add their values under appropriate types. subject-case, s-case, p-case and a-case are pre-defined types that are used in the automatically generated initial lexicon (preset-families.xml). We recommend that you don't change them. They handle accusative/ergative mapping among other things (cf. types.xml initial hierarchy). Warning: Please substitute case names as children of a-case and p-case since these cannot be predicted parametrically for an unaligned language (cf. unknown and unknown2 values). ================================================ FILE: grammars/raise-nodes.xsl ================================================ Next ================================================ FILE: grammars/routes/build.xml ================================================ ================================================ FILE: grammars/routes/dict.xml ================================================ ================================================ FILE: grammars/routes/dlf_test.xml ================================================ from mid-Cambridge , drive west on Cambridge_Street for about 1 mile . from mid-Cambridge , drive on Cambridge_Street toward Harvard_Yard for a mile . drive west on Cambridge_Street for a mile . drive on Cambridge_Street toward Harvard_Yard for about 1 mile . ================================================ FILE: grammars/routes/grammar.xml ================================================ ================================================ FILE: grammars/routes/lexicon-base.xml ================================================ ================================================ FILE: grammars/routes/lexicon.xml ================================================ ================================================ FILE: grammars/routes/morph.xml ================================================ ================================================ FILE: grammars/routes/rules.xml ================================================ ================================================ FILE: grammars/routes/testbed.xml ================================================ ================================================ FILE: grammars/routes/types.xml ================================================ ================================================ FILE: grammars/rules.xsd ================================================ Schema for combinatory rules file. A rules file specifies the combinatory rules to use.
These rules include the (potentially) universal set of application, composition, type raising and substitution rules; they are only "potentially" universal because for efficiency one might want to use a restricted subset of the rules, and because the type raising rules are defined to work with specific, configurable argument and result types. The rules file may also specify unary type changing rules, which may be thought of as signs for zero morphemes, and thus conceptually belonging to the lexicon rather than to the (potentially) universal rule set. They are defined in the rules file since, algorithmically, they are handled at the same time as the other combinatory rules. The root element. Application, e.g. X/Y Y => X. Composition, e.g. X/Y Y/Z => X/Z. Substitution, e.g. X/Y/Z Y/Z => X/Z. Specifies the direction of the rule. With forward combination, the functor appears on the left; with backward combination, it appears on the right. Specifies whether the rule is harmonic (true) or permutative (false). Type raising, e.g. np => s/(s\np). The default category for the arg is "np"; the default for the result is "s". Specifies whether a coindexed dollar variable is included on the argument and result categories. Unary type changing rule, e.g. np => s/(s/np) for topicalization. The arg and result categories are required, as is the name attribute. ================================================ FILE: grammars/simplify-lists.xsl ================================================ ================================================ FILE: grammars/tiny/build.xml ================================================ ================================================ FILE: grammars/tiny/grammar.xml ================================================ ================================================ FILE: grammars/tiny/lexicon.xml ================================================ ================================================ FILE: grammars/tiny/morph.xml ================================================ ================================================ FILE: grammars/tiny/rules.xml ================================================ ================================================ FILE: grammars/tiny/testbed.xml ================================================ ================================================ FILE: grammars/tiny/types.xml ================================================ ================================================ FILE: grammars/tokens.xsd ================================================ Definitions for tokens. A 'token' as defined here is a string with no white space, for easy parsing, but otherwise more flexible than a NMTOKEN. Feature variables with optional type. ================================================ FILE: grammars/treeify-lists.xsl ================================================ ================================================ FILE: grammars/types.xsd ================================================ Schema for type hierarchy file. A type hierarchy file specifies the hierarchy of simple types which constitute the domain of syntactic feature values. The root element, containing a list of type entries. Each type must have a unique name. A simple type, with its name and immediate parents in the hierarchy. Parents are specified as a space-separated list of type names. Parents attribute is optional and defaults to the built-in type "top". Only the immediate parents should be specified. The whole hierarchy is constructed during the initial grammar loading process. 
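Type-hierarchy files are correspondingly compact. The following is a hedged sketch of the kind of content types.xsd describes, assuming the root element is named 'types'; the case names themselves are illustrative. Only immediate parents are declared, and a type with no parents attribute defaults to the built-in type "top":

  <types>
    <!-- illustrative case hierarchy; only immediate parents are given -->
    <type name="case"/>
    <type name="nom" parents="case"/>
    <type name="acc" parents="case"/>
    <type name="dat" parents="case"/>
  </types>

As noted above, the full hierarchy is assembled from these immediate-parent declarations when the grammar is loaded.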
================================================ FILE: grammars/worldcup/add-chunks.xsl ================================================ ================================================ FILE: grammars/worldcup/build.xml ================================================ ================================================ FILE: grammars/worldcup/dict.xml ================================================ ================================================ FILE: grammars/worldcup/grammar.xml ================================================ ================================================ FILE: grammars/worldcup/lexicon-base.xsl ================================================ ================================================ FILE: grammars/worldcup/lexicon.xml ================================================ ================================================ FILE: grammars/worldcup/morph.xml ================================================ ================================================ FILE: grammars/worldcup/raise-nodes.xsl ================================================ Restr ================================================ FILE: grammars/worldcup/rules.xml ================================================ ================================================ FILE: grammars/worldcup/testbed.xml ================================================ ================================================ FILE: lib/ASL ================================================ /* * ============================================================================ * The Apache Software License, Version 1.1 * ============================================================================ * * Copyright (C) 1999 The Apache Software Foundation. All rights reserved. * * Redistribution and use in source and binary forms, with or without modifica- * tion, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. The end-user documentation included with the redistribution, if any, must * include the following acknowledgment: "This product includes software * developed by the Apache Software Foundation (http://www.apache.org/)." * Alternately, this acknowledgment may appear in the software itself, if * and wherever such third-party acknowledgments normally appear. * * 4. The names "Ant" and "Apache Software Foundation" must not be used to * endorse or promote products derived from this software without prior * written permission. For written permission, please contact * apache@apache.org. * * 5. Products derived from this software may not be called "Apache", nor may * "Apache" appear in their name, without prior written permission of the * Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE * APACHE SOFTWARE FOUNDATION OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLU- * DING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This software consists of voluntary contributions made by many individuals * on behalf of the Apache Software Foundation. For more information on the * Apache Software Foundation, please see . * */ ================================================ FILE: lib/LGPL ================================================ GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. 
Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. 
This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. 
Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. 
When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. 
For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. 
For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. , 1 April 1990 Ty Coon, President of Vice That's all there is to it! ================================================ FILE: lib/LIBNOTES ================================================ This file briefly describes the libraries which are used by the OpenNLP CCG Library. See the appropriate license files also in this directory. ------------------------------------------------------------------------ ant.jar, ant-launcher.jar, ant-junit.jar, ant-junit4.jar, ant-contrib.jar The Apache Ant Build System, version 1.9.0 Homepage: http://ant.apache.org/ ANT Contrib, version 1.0b3 Homepage: http://sourceforge.net/projects/ant-contrib/ License: Apache Software License (ASL) A Java based build tool. ------------------------------------------------------------------------ jdom.jar JDOM, version 1.1 Homepage: http://www.jdom.org License: See jdom.license A complete, Java-based solution for accessing, manipulating, and outputting XML data from Java code. 
------------------------------------------------------------------------ jgrapht-jdk1.6.jar JGraphT, version 0.8.3 Homepage: http://www.jgrapht.org License: LGPL A free Java graph library that provides mathematical graph-theory objects and algorithms. ------------------------------------------------------------------------ jline.jar jLine, version 1.0 Homepage: http://jline.sourceforge.net/ License: See jline.license JLine is a Java library for handling console input. It is similar in functionality to BSD editline and GNU readline, providing a command input history and more. ------------------------------------------------------------------------ junit-4.10.jar JUnit, version 4.10 Homepage: http://junit.sourceforge.net/ License: Common Public License - v 1.0 JUnit is a simple, open source framework to write and run repeatable tests. ------------------------------------------------------------------------ libken.so KenLM, version 4 Homepage: http://kheafield.com/code/kenlm/ License: LGPL primarily (see src/kenlm/LICENSE) KenLM is a language modeling toolkit supporting large LMs via a memory-mapped binary format. JNI interface for linux adapted from the Joshua decoder; other platforms possible in principle. ------------------------------------------------------------------------ trove.jar GNU Trove, version 1.0.2 Homepage: http://trove4j.sf.net License: LGPL High performance collections for Java. ------------------------------------------------------------------------ xalan.jar, xercesImpl.jar, xml-apis.jar, xlstc.jar, serializer.jar The Apache Xalan-J XML transformation processor, version 2.7.1 The Apache Xerces-J XML parser, version 2.9.0 Homepage: http://xml.apache.org License: Apache Software License (ASL) XSLT transformation library and associated XML parser, with schema validation support. ------------------------------------------------------------------------ jopt-simple.jar JOpt Simple, version 3.1 Homepage: http://jopt-simple.sourceforge.net/ License: MIT License Command-line options parsing library. ------------------------------------------------------------------------ javacc.jar JavaCC, version 4.0 Homepage: https://javacc.dev.java.net/ License: SUN License Java parser generator. ================================================ FILE: lib/MIT ================================================ The MIT License Copyright (c) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: lib/SUN ================================================ Copyright (c) 2003 Sun Microsystems, Inc. 
All Rights Reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistribution of source code must retain the above copyright notice, this list of conditions and the following disclaimer. -Redistribution in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of Sun Microsystems, Inc. or the names of contributors may be used to endorse or promote products derived from this software without specific prior written permission. This software is provided "AS IS," without a warranty of any kind. ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT, ARE HEREBY EXCLUDED. SUN MIDROSYSTEMS, INC. ("SUN") AND ITS LICENSORS SHALL NOT BE LIABLE FOR ANY DAMAGES SUFFERED BY LICENSEE AS A RESULT OF USING, MODIFYING OR DISTRIBUTING THIS SOFTWARE OR ITS DERIVATIVES. IN NO EVENT WILL SUN OR ITS LICENSORS BE LIABLE FOR ANY LOST REVENUE, PROFIT OR DATA, OR FOR DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL OR PUNITIVE DAMAGES, HOWEVER CAUSED AND REGARDLESS OF THE THEORY OF LIABILITY, ARISING OUT OF THE USE OF OR INABILITY TO USE THIS SOFTWARE, EVEN IF SUN HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. You acknowledge that this software is not designed, licensed or intended for use in the design, construction, operation or maintenance of any nuclear facility. ================================================ FILE: lib/jdom.license ================================================ /*-- $Id: jdom.license,v 1.1.1.1 2003/02/28 18:02:10 mwhite14850 Exp $ Copyright (C) 2001 Brett McLaughlin & Jason Hunter. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions, and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions, and the disclaimer that follows these conditions in the documentation and/or other materials provided with the distribution. 3. The name "JDOM" must not be used to endorse or promote products derived from this software without prior written permission. For written permission, please contact license@jdom.org. 4. Products derived from this software may not be called "JDOM", nor may "JDOM" appear in their name, without prior written permission from the JDOM Project Management (pm@jdom.org). In addition, we request (but do not require) that you include in the end-user documentation provided with the redistribution and/or in the software itself an acknowledgement equivalent to the following: "This product includes software developed by the JDOM Project (http://www.jdom.org/)." Alternatively, the acknowledgment may be graphical using the logos available at http://www.jdom.org/images/logos. THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. This software consists of voluntary contributions made by many individuals on behalf of the JDOM Project and was originally created by Brett McLaughlin and Jason Hunter . For more information on the JDOM Project, please see . */ ================================================ FILE: lib/jline.license ================================================ Copyright (c) 2002-2006, Marc Prud'hommeaux All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. Neither the name of JLine nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: pom.xml ================================================ 4.0.0 opennlp ccg 0.10.0 pom src/ 1.8 1.8 ================================================ FILE: src/ccg2xml/README ================================================ README for ccg2xml Ben Wing October 8, 2006 ccg2xml is a front-end for more conveniently specifying OpenCCG grammars. The native XML format is not human-friendly and contains much repetition of information, often involving long-distance dependencies, which are difficult to keep track of. The .ccg format that is accepted by ccg2xml is designed to be expressive, concise and human-friendly, with as little required duplication as possible. 
ccg2xml is run on a .ccg file and produces (by default) the following files in the current directory:

- lexicon.xml: Categories, lexical insertion rules
- morph.xml: Morphological forms of words, feature macros
- types.xml: Feature type hierarchies
- rules.xml: Allowable combinatory, type-raising and type-changing rules
- grammar.xml: File giving the names and locations of the previous four files
- testbed.xml: File containing test sentences and expected number of parses

The standard OpenCCG tools, such as tccg, can then be run as if the grammar had been developed directly in XML. ccg2xml will output error messages if it encounters syntax errors, and in such a case will not generate any XML files. (It may also generate warnings, which are non-fatal in that they do not prevent generation.)

ccg2xml has a few options:

-o controls which XML files are generated; this can be useful for using ccg2xml in conjunction with an existing grammar.
-t outputs the file to stdout after all macro substitutions have been applied; this can help in debugging complicated macros that aren't working as expected.
-d controls which directory the XML files are output to (by default, the current directory).
-y, -m, and --super-macro-debug are debug options useful mostly for developers of ccg2xml.

For more information, run `ccg2xml -h' to get the "usage" message.

ccg2xml is written in Python, and is itself a piece of generated code. It makes use of PLY, a Python system for developing unambiguous context-free grammar parsers, similar to lex and yacc. PLY is written by David Beazley (dave@dabeaz.com) and is available at http://www.dabeaz.com/ply/; PLY 1.6 is included as part of the ccg2xml source (the files lex.py and yacc.py). PLY uses a clever trick of introspecting on the source file to determine the rules, which are stored in documentation strings. However, this ends up requiring a certain amount of redundancy in the source file. This gets awkward for large projects, so I created a front-end format that closely emulates yacc's format, while adding support for regular-expression operators on the right-hand side (RHS) of a rule (*, +, and ?); it consists of mixed CFG specifications and Python code. The file `convert-ply.py' converts from my local .ply format into standard .py files, and is used (see `Makefile') to generate the ccg2xml script. (It also attempts to work around a serious bug in PLY; see the note below.) The source file ccg.ply is currently 1,727 lines in length (of mixed CFG specifications and Python code), and the auto-generated ccg2xml file is 2,370 lines.

NOTE: ccg2xml was written using PLY 1.6, which is included as part of ccg2xml (lex.py and yacc.py). PLY 1.6 contains a serious bug involving empty RHS productions, which convert-ply.py attempts to work around and which ccg.ply also works around. I was never able to figure out the exact rule format that triggers this bug, so I simply had to use trial and error in rewriting rules to get a working ccg2xml. It is strongly recommended to upgrade to the most recent version of PLY (currently 2.1), which is likely to fix this problem and make it possible to deterministically modify ccg.ply.

--------------------------------------------------
The .ccg format
--------------------------------------------------

The general feel of the syntax is like C, Java or Perl.
Indentation and whitespace are unimportant. (The only exception is in macro definitions, where the text of a macro must either be on the same line as the definition -- possibly extended with backslash line-continuation markers -- or be enclosed in braces.) This is critical for macros, where having to worry about precisely controlling the indentation or whitespace at the beginning or end of a macro definition (such as would be required in Python or shell scripts) would be a major hassle and impediment.

The file consists of a number of declarations, which specify features (feature {}), word stems and inflected forms (word {}), lexical insertion rules (family {}), test sentences (testbed {}), and other information relevant to the grammar. It can also contain macro definitions (def {}) along with corresponding macro calls, such as the calls to noun() and verb() in the sample grammar above. The macro mechanism is essentially just a way of doing parameterized text substitution, and is discussed in detail in sections 2.4 and 3.2.

In general, declarations can be in any order; any dependencies that must be computed are resolved after the entire file has been parsed. Furthermore, any declaration containing multiple parts (for example, a feature declaration encompassing multiple features, a word declaration with multiple inflections, or a family declaration with multiple insertion rules) can be split up into separate declarations.

In addition, the syntax tries to be very forgiving about commas, semicolons, and other terminators and separators. (In most lists, in fact, commas are optional, duplicated commas are no problem, and extra commas at the end of a list are allowed. The main exception where commas matter is in macro calls. You can still put an extra comma at the end of a macro call, but otherwise you must have exactly one comma (no more, no less) between arguments. The reason is that macro arguments can contain pretty much any text whatsoever (including no text at all), so commas are needed to indicate where one argument stops and the next one starts.) This simplifies, in a number of ways, the creation of lists using macros.

The exceptions to these syntactic allowances are all related to macros. For example, macros must be defined before they can be used. (That is, before they are used in a macro call in the main text of the grammar; this does not apply to macro calls inside of another macro definition.) Also, the syntax of commas in argument lists of macro calls is much stricter than in lists occurring elsewhere in the grammar (see above).

Note that, in general, there is no need to put quotes around literal text, such as the inflected forms of words. There is also no concept of "reserved words" in this format. Words like feature, word, and entry that have a special significance as "declarators" when in the right place in the syntax can otherwise be freely used as word stems and inflections, macro variable names, part-of-speech tags, etc. At the same time, however, it is possible to put quotes (single or double) around text. This allows special characters, spaces, etc. to be used in word inflections, feature values, and the like; otherwise, only letters, numbers, '_', '-', '+', '%' and non-ASCII characters can be used. Quotes can also be used to protect against the accidental interpretation of a word as a declarator, in the rare case that this is needed.
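As a concrete illustration of the points above, here is a minimal Python sketch of parameterized text substitution in which macro-call arguments are split only at top-level commas, with nested parens, brackets and braces kept intact (as described in the Macros section below). It is purely illustrative and is not the ccg2xml implementation; the helper names (split_args, substitute, expand) and the noun() macro body are made up for the example.

# Illustrative sketch only (not ccg2xml's actual code): parameterized text
# substitution with macro-call arguments split at top-level commas,
# respecting nested (), [] and {} as described in the Macros section below.
import re

OPEN, CLOSE = "([{", ")]}"

def split_args(text):
    """Split a macro-call argument string at commas that are not nested
    inside parens, brackets, or braces."""
    args, depth, current = [], 0, []
    for ch in text:
        if ch in OPEN:
            depth += 1
        elif ch in CLOSE:
            depth -= 1
        if ch == "," and depth == 0:
            args.append("".join(current).strip())
            current = []
        else:
            current.append(ch)
    args.append("".join(current).strip())
    return args

def substitute(body, params, args):
    """Replace whole-token occurrences of each formal parameter in the
    macro body with the corresponding argument text."""
    mapping = dict(zip(params, args))
    tokens = re.split(r"(\W)", body)          # keep punctuation as separate tokens
    return "".join(mapping.get(tok, tok) for tok in tokens)

def expand(call, macros):
    """Expand one macro call such as 'noun(book, sg)'; a real expander
    would re-scan the result and expand any nested macro calls too."""
    m = re.match(r"([\w-]+)\((.*)\)\s*$", call.strip())
    name, argtext = m.group(1), m.group(2)
    params, body = macros[name]
    return substitute(body, params, split_args(argtext))

# Hypothetical macro table: def noun(stem, num) { word stem: noun<num>; }
macros = {"noun": (["stem", "num"], "word stem: noun<num>;")}

print(split_args("a, bar(b, c)"))         # ['a', 'bar(b, c)'] -- two args, not three
print(expand("noun(book, sg)", macros))   # word book: noun<sg>;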
The overall syntactic laxity of the language is part of a general philosophy of making the grammar language as tolerant and as expressive as possible. This is in keeping with the purpose of the language. Although it looks similar to a traditional programming language, its semantics is on a much higher level, where expressiveness is much more important and precision relatively less important. Furthermore, the language is intended for use by non-programmers as well as programmers, and non-programmers are likely to have difficulty with rigid syntax rules, especially in the presence of macros.

--------------------------------------------------
Macros
--------------------------------------------------

The mechanism I chose to eliminate repetition is parameterized macros, which work like macro functions in C. Their definition includes a series of formal parameters and a section of literal text intermixed with the names of the macro's parameters. When a call to the macro occurs, the macro's text replaces the call, with the actual arguments to the call substituted for the occurrences of the corresponding formal parameters in the text. Any macro calls occurring in this text are then recursively expanded.

The macro expansion mechanism generally happens at the level of tokens, where a token is a single word, a single non-alphanumeric character, or a single piece of quoted text. However, it is possible to construct a token by pasting two tokens together, using the dot operator; this is similar to the ## operator in C. This is useful for constructing inflected forms of words, as shown in the tiny.ccg and arabic.ccg sample files. It is also possible to compositionally construct the name of a macro and then call this constructed name; this is used in arabic.ccg, in the macro 3rd-weak-verb().

The parser pays attention to matched pairs of parens, brackets, and braces in the text of a macro call argument, and will not get confused by commas inside such matched delimiters. Thus, a macro call foo(a, bar(b, c)) is correctly interpreted as a call to foo() with two arguments, a and bar(b, c), and not as a call to foo() with three arguments, where the first argument is a, the second argument is bar(b, and the third argument is c).

The text of both macro definitions and arguments to macro calls can optionally be surrounded by braces, to clearly delimit the text boundaries. The braces do not form part of the text itself (to specify text surrounded by braces, a second set would have to be added around the text). It is conventional to surround macro definitions with braces, except sometimes for very short definitions that fit on one line. However, braces are not normally used around macro call arguments unless necessary for correct parsing (for example, a macro argument containing a comma that is not surrounded by matching delimiters).

--------------------------------
Sample Files
--------------------------------

Three sample .ccg-format files are currently included in the directory `ccg-format-grammars' off of the top-level OpenCCG directory:

- `tinytiny.ccg' is a simple file that parses a basic chunk of English (including nouns, personal pronouns, articles, and transitive and intransitive verbs) and demonstrates the basic features of the .ccg format.

- `tiny.ccg' is a conversion of the original OpenCCG "tiny" sample grammar to .ccg format.
`tiny.ccg' is no longer as small as its name implies, especially since it contains a large number of comments describing the CCG format in depth, as well as a number of example declarations to demonstrate the various features of the CCG format. - `arabic.ccg' is an Arabic grammar fragment, which demonstrates how to effectively use macros in order to efficiently generate the morphology of a complex, highly-inflected language. ================================================ FILE: src/ccg2xml/Tree.py ================================================ # Highly optimized Tkinter tree control # by Charles E. "Gene" Cash # # This is documented more fully on my homepage at # http://home.cfl.rr.com/genecash/ and if it's not there, look in the Vaults # of Parnassus at http://www.vex.net/parnassus/ which I promise to keep # updated. # # Thanks to Laurent Claustre for sending lots of helpful # bug reports. # # This copyright license is intended to be similar to the FreeBSD license. # # Copyright 1998 Gene Cash All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the # distribution. # # THIS SOFTWARE IS PROVIDED BY GENE CASH ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES # OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # # This means you may do anything you want with this code, except claim you # wrote it. Also, if it breaks you get to keep both pieces. # # 02-DEC-98 Started writing code. # 22-NOV-99 Changed garbage collection to a better algorithm. # 28-AUG-01 Added logic to deal with exceptions in user callbacks. # 02-SEP-01 Fixed hang when closing last node. # 07-SEP-01 Added binding tracking so nodes got garbage-collected. # Also fixed subclass call to initialize Canvas to properly deal # with variable arguments and keyword arguments. # 11-SEP-01 Bugfix for unbinding code. # 13-OCT-01 Added delete & insert methods for nodes (by email request). # LOTS of code cleanup. # Changed leading double underscores to PVT nomenclature. # Added ability to pass Node subclass to Tree constructor. # Removed after_callback since subclassing Node is better idea. # 15-OCT-01 Finally added drag'n'drop support. It consisted of a simple # change to the Node PVT_click method, and addition of logic like # the example in Tkdnd.py. It took 3 days to grok the Tkdnd # example and 2 hours to make the code changes. Plus another 1/2 # day to get a working where() function. # 16-OCT-01 Incorporated fixes to delete() and dnd_commit() bugs by # Laurent Claustre . 
# 17-OCT-01 Added find_full_id() and cursor_node() methods. # 18-OCT-01 Fixes to delete() on root during collapse and with # drag-in-progress flag by Laurent Claustre . # 10-FEB-02 Fix to prev_visible() by Nicolas Pascal . # Fixes which made insert_before()/insert_after() actually work. # Also added expand/collapse indicators like Internet Explorer # as requested by Nicolas. # 11-FEB-02 Another fix to prev_visible(). It works this time. Honest. # 31-MAY-02 Added documentation strings so the new PYthon 2.2 help function # is a little more useful. # 19-AUG-02 Minor fix to eliminate crash in "treedemo-icons.py" caused by # referencing expand/collapse indicators when lines are turned off. # 15-OCT-02 Used new idiom for calling Canvas superclass. # 18-NOV-02 Fixed bug discovered by Amanjit Gill , where # I didn't pass "master" properly to the Canvas superclass. Sigh. # One step forward, one step back. import Tkdnd from Tkinter import * #------------------------------------------------------------------------------ def report_callback_exception(): """report exception on sys.stderr.""" import traceback import sys sys.stderr.write("Exception in Tree control callback\n") traceback.print_exc() #------------------------------------------------------------------------------ class Struct: """Helper object for add_node() method""" def __init__(self): pass #------------------------------------------------------------------------------ class Node: """Tree helper class that's instantiated for each element in the tree. It has several useful attributes: parent_node - immediate parent node id - id assigned at creation expanded_icon - image displayed when folder is expanded to display children collapsed_icon - image displayed when node is not a folder or folder is collapsed. parent_widget - reference to tree widget that contains node. expandable_flag - is true when node is a folder that may be expanded or collapsed. expanded_flag - true to indicate node is currently expanded. h_line - canvas line to left of node image. v_line - canvas line below node image that connects children. indic - expand/collapse canvas image. label - canvas text label symbol - current canvas image Please note that methods prefixed PVT_* are not meant to be used by client programs.""" def __init__(self, parent_node, id, collapsed_icon, x, y, parent_widget=None, expanded_icon=None, label=None, expandable_flag=0): """Create node and initialize it. 
This also displays the node at the given position on the canvas, and binds mouseclicks.""" # immediate parent node self.parent_node=parent_node # internal name used to manipulate things self.id=id # bitmaps to be displayed self.expanded_icon=expanded_icon self.collapsed_icon=collapsed_icon # tree widget we belong to if parent_widget: self.widget=parent_widget else: self.widget=parent_node.widget # for speed sw=self.widget # our list of child nodes self.child_nodes=[] # flag that node can be expanded self.expandable_flag=expandable_flag self.expanded_flag=0 # add line if parent_node and sw.line_flag: self.h_line=sw.create_line(x, y, x-sw.dist_x, y) else: self.h_line=None self.v_line=None # draw approprate image self.symbol=sw.create_image(x, y, image=self.collapsed_icon) # add expand/collapse indicator self.indic=None if expandable_flag and sw.line_flag and sw.plus_icon and sw.minus_icon: self.indic=sw.create_image(x-sw.dist_x, y, image=sw.plus_icon) # add label self.label=sw.create_text(x+sw.text_offset, y, text=label, anchor='w') # single-click to expand/collapse if self.indic: sw.tag_bind(self.indic, '<1>', self.PVT_click) else: sw.tag_bind(self.symbol, '<1>', self.PVT_click) # for drag'n'drop target detection sw.tag_bind(self.symbol, '', self.PVT_enter) sw.tag_bind(self.label, '', self.PVT_enter) # for testing (gotta make sure nodes get properly GC'ed) #def __del__(self): # print self.full_id(), 'deleted' # ----- PUBLIC METHODS ----- def set_collapsed_icon(self, icon): """Set node's collapsed image""" self.collapsed_icon=icon if not self.expanded_flag: self.widget.itemconfig(self.symbol, image=icon) def set_expanded_icon(self, icon): """Set node's expanded image""" self.expanded_icon=icon if self.expanded_flag: self.widget.itemconfig(self.symbol, image=icon) def parent(self): """Return node's parent node""" return self.parent_node def prev_sib(self): """Return node's previous sibling (the child immediately above it)""" i=self.parent_node.child_nodes.index(self)-1 if i >= 0: return self.parent_node.child_nodes[i] else: return None def next_sib(self): """Return node's next sibling (the child immediately below it)""" i=self.parent_node.child_nodes.index(self)+1 if i < len(self.parent_node.child_nodes): return self.parent_node.child_nodes[i] else: return None def next_visible(self): """Return next lower visible node""" n=self if n.child_nodes: # if you can go right, do so return n.child_nodes[0] while n.parent_node: # move to next sibling i=n.parent_node.child_nodes.index(n)+1 if i < len(n.parent_node.child_nodes): return n.parent_node.child_nodes[i] # if no siblings, move to parent's sibling n=n.parent_node # we're at bottom return self def prev_visible(self): """Return next higher visible node""" n=self if n.parent_node: i=n.parent_node.child_nodes.index(n)-1 if i < 0: return n.parent_node else: j=n.parent_node.child_nodes[i] return j.PVT_last() else: return n def children(self): """Return list of node's children""" return self.child_nodes[:] def get_label(self): """Return string containing text of current label""" return self.widget.itemcget(self.label, 'text') def set_label(self, label): """Set current text label""" self.widget.itemconfig(self.label, text=label) def expanded(self): """Returns true if node is currently expanded, false otherwise""" return self.expanded_flag def expandable(self): """Returns true if node can be expanded (i.e. 
if it's a folder)""" return self.expandable_flag def full_id(self): """Return list of IDs of all parents and node ID""" if self.parent_node: return self.parent_node.full_id()+(self.id,) else: return (self.id,) def expand(self): """Expand node if possible""" if not self.expanded_flag: self.PVT_set_state(1) def collapse(self): """Collapse node if possible""" if self.expanded_flag: self.PVT_set_state(0) def delete(self, me_too=1): """Delete node from tree. ("me_too" is a hack not to be used by external code, please!)""" sw=self.widget if not self.parent_node and me_too: # can't delete the root node raise ValueError, "can't delete root node" self.PVT_delete_subtree() # move everything up so that distance to next subnode is correct n=self.next_visible() x1, y1=sw.coords(self.symbol) x2, y2=sw.coords(n.symbol) if me_too: dist=y2-y1 else: dist=y2-y1-sw.dist_y self.PVT_tag_move(-dist) n=self if me_too: if sw.pos == self: # move cursor if it points to current node sw.move_cursor(self.parent_node) self.PVT_unbind_all() sw.delete(self.symbol) sw.delete(self.label) sw.delete(self.h_line) sw.delete(self.v_line) sw.delete(self.indic) self.parent_node.child_nodes.remove(self) # break circular ref now, so parent may be GC'ed later n=self.parent_node self.parent_node=None n.PVT_cleanup_lines() n.PVT_update_scrollregion() def insert_before(self, nodes): """Insert list of nodes as siblings before this node. Call parent node's add_node() function to generate the list of nodes.""" i=self.parent_node.child_nodes.index(self) self.parent_node.PVT_insert(nodes, i, self.prev_visible()) def insert_after(self, nodes): """Insert list of nodes as siblings after this node. Call parent node's add_node() function to generate the list of nodes.""" i=self.parent_node.child_nodes.index(self)+1 self.parent_node.PVT_insert(nodes, i, self.PVT_last()) def insert_children(self, nodes): """Insert list of nodes as children of this node. Call node's add_node() function to generate the list of nodes.""" self.PVT_insert(nodes, 0, self) def toggle_state(self): """Toggle node's state between expanded and collapsed, if possible""" if self.expandable_flag: if self.expanded_flag: self.PVT_set_state(0) else: self.PVT_set_state(1) # ----- functions for drag'n'drop support ----- def PVT_enter(self, event): """detect mouse hover for drag'n'drop""" self.widget.target=self def dnd_end(self, target, event): """Notification that dnd processing has been ended. It DOES NOT imply that we've been dropped somewhere useful, we could have just been dropped into deep space and nothing happened to any data structures, or it could have been just a plain mouse-click w/o any dragging.""" if not self.widget.drag: # if there's been no dragging, it was just a mouse click self.widget.move_cursor(self) self.toggle_state() self.widget.drag=0 # ----- PRIVATE METHODS (prefixed with "PVT_") ----- # these methods are subject to change, so please try not to use them def PVT_last(self): """Return bottom-most node in subtree""" n=self while n.child_nodes: n=n.child_nodes[-1] return n def PVT_find(self, search): """Used by searching functions""" if self.id != search[0]: # this actually only goes tilt if root doesn't match return None if len(search) == 1: return self # get list of children IDs i=map(lambda x: x.id, self.child_nodes) # if there is a child that matches, search it try: return self.child_nodes[i.index(search[1])].PVT_find(search[1:]) except: return None def PVT_insert(self, nodes, pos, below): """Create and insert new children. 
"nodes" is list previously created via calls to add_list(). "pos" is index in the list of children where the new nodes are inserted. "below" is node which new children should appear immediately below.""" if not self.expandable_flag: raise TypeError, 'not an expandable node' # for speed sw=self.widget # expand and insert children children=[] self.expanded_flag=1 sw.itemconfig(self.symbol, image=self.expanded_icon) if sw.minus_icon and sw.line_flag: sw.itemconfig(self.indic, image=sw.minus_icon) if len(nodes): # move stuff to make room below.PVT_tag_move(sw.dist_y*len(nodes)) # get position of first new child xp, dummy=sw.coords(self.symbol) dummy, yp=sw.coords(below.symbol) xp=xp+sw.dist_x yp=yp+sw.dist_y # create vertical line if sw.line_flag and not self.v_line: self.v_line=sw.create_line( xp, yp, xp, yp+sw.dist_y*len(nodes)) sw.tag_lower(self.v_line, self.symbol) n=sw.node_class for i in nodes: # add new subnodes, they'll draw themselves # this is a very expensive call children.append( n(parent_node=self, expandable_flag=i.flag, label=i.name, id=i.id, collapsed_icon=i.collapsed_icon, expanded_icon=i.expanded_icon, x=xp, y=yp)) yp=yp+sw.dist_y self.child_nodes[pos:pos]=children self.PVT_cleanup_lines() self.PVT_update_scrollregion() sw.move_cursor(sw.pos) def PVT_set_state(self, state): """Common code forexpanding/collapsing folders. It's not re-entrant, and there are certain cases in which we can be called again before we're done, so we use a mutex.""" while self.widget.spinlock: pass self.widget.spinlock=1 # expand & draw our subtrees if state: self.child_nodes=[] self.widget.new_nodes=[] if self.widget.get_contents_callback: # this callback needs to make multiple calls to add_node() try: self.widget.get_contents_callback(self) except: report_callback_exception() self.PVT_insert(self.widget.new_nodes, 0, self) # collapse and delete subtrees else: self.expanded_flag=0 self.widget.itemconfig(self.symbol, image=self.collapsed_icon) if self.indic: self.widget.itemconfig(self.indic, image=self.widget.plus_icon) self.delete(0) # release mutex self.widget.spinlock=0 def PVT_cleanup_lines(self): """Resize connecting lines""" if self.widget.line_flag: n=self while n: if n.child_nodes: x1, y1=self.widget.coords(n.symbol) x2, y2=self.widget.coords(n.child_nodes[-1].symbol) self.widget.coords(n.v_line, x1, y1, x1, y2) n=n.parent_node def PVT_update_scrollregion(self): """Update scroll region for new size""" x1, y1, x2, y2=self.widget.bbox('all') self.widget.configure(scrollregion=(x1, y1, x2+5, y2+5)) def PVT_delete_subtree(self): """Recursively delete subtree & clean up cyclic references to make garbage collection happy""" sw=self.widget sw.delete(self.v_line) self.v_line=None for i in self.child_nodes: # delete node's subtree, if any i.PVT_delete_subtree() i.PVT_unbind_all() # delete widgets from canvas sw.delete(i.symbol) sw.delete(i.label) sw.delete(i.h_line) sw.delete(i.v_line) sw.delete(i.indic) # break circular reference i.parent_node=None # move cursor if it's in deleted subtree if sw.pos in self.child_nodes: sw.move_cursor(self) # now subnodes will be properly garbage collected self.child_nodes=[] def PVT_unbind_all(self): """Unbind callbacks so node gets garbage-collected. This wasn't easy to figure out the proper way to do this. 
See also tag_bind() for the Tree widget itself.""" for j in (self.symbol, self.label, self.indic, self.h_line, self.v_line): for k in self.widget.bindings.get(j, ()): self.widget.tag_unbind(j, k[0], k[1]) def PVT_tag_move(self, dist): """Move everything below current icon, to make room for subtree using the Disney magic of item tags. This is the secret of making everything as fast as it is.""" # mark everything below current node as movable bbox1=self.widget.bbox(self.widget.root.symbol, self.label) bbox2=self.widget.bbox('all') self.widget.dtag('move') self.widget.addtag('move', 'overlapping', bbox2[0], bbox1[3], bbox2[2], bbox2[3]) # untag cursor & node so they don't get moved too self.widget.dtag(self.widget.cursor_box, 'move') self.widget.dtag(self.symbol, 'move') self.widget.dtag(self.label, 'move') # now do the move of all the tagged objects self.widget.move('move', 0, dist) def PVT_click(self, event): """Handle mouse clicks by kicking off possible drag'n'drop processing""" if self.widget.drop_callback: if Tkdnd.dnd_start(self, event): x1, y1, x2, y2=self.widget.bbox(self.symbol) self.x_off=(x1-x2)/2 self.y_off=(y1-y2)/2 else: # no callback, don't bother with drag'n'drop self.widget.drag=0 self.dnd_end(None, None) #------------------------------------------------------------------------------ class Tree(Canvas): # do we have enough possible arguments?!?!?! def __init__(self, master, root_id, root_label='', get_contents_callback=None, dist_x=15, dist_y=15, text_offset=10, line_flag=1, expanded_icon=None, collapsed_icon=None, regular_icon=None, plus_icon=None, minus_icon=None, node_class=Node, drop_callback=None, *args, **kw_args): # pass args to superclass (new idiom from Python 2.2) Canvas.__init__(self, master, *args, **kw_args) # this allows to subclass Node and pass our class in self.node_class=node_class # keep track of node bindings self.bindings={} # cheap mutex spinlock self.spinlock=0 # flag to see if there's been any d&d dragging self.drag=0 # default images (BASE64-encoded GIF files) if expanded_icon == None: self.expanded_icon=PhotoImage( data='R0lGODlhEAANAKIAAAAAAMDAwICAgP//////ADAwMAAAAAAA' \ 'ACH5BAEAAAEALAAAAAAQAA0AAAM6GCrM+jCIQamIbw6ybXNSx3GVB' \ 'YRiygnA534Eq5UlO8jUqLYsquuy0+SXap1CxBHr+HoBjoGndDpNAAA7') else: self.expanded_icon=expanded_icon if collapsed_icon == None: self.collapsed_icon=PhotoImage( data='R0lGODlhDwANAKIAAAAAAMDAwICAgP//////ADAwMAAAAAAA' \ 'ACH5BAEAAAEALAAAAAAPAA0AAAMyGCHM+lAMMoeAT9Jtm5NDKI4Wo' \ 'FXcJphhipanq7Kvu8b1dLc5tcuom2foAQQAyKRSmQAAOw==') else: self.collapsed_icon=collapsed_icon if regular_icon == None: self.regular_icon=PhotoImage( data='R0lGODlhCwAOAJEAAAAAAICAgP///8DAwCH5BAEAAAMALAAA' \ 'AAALAA4AAAIphA+jA+JuVgtUtMQePJlWCgSN9oSTV5lkKQpo2q5W+' \ 'wbzuJrIHgw1WgAAOw==') else: self.regular_icon=regular_icon if plus_icon == None: self.plus_icon=PhotoImage( data='R0lGODdhCQAJAPEAAAAAAH9/f////wAAACwAAAAACQAJAAAC' \ 'FIyPoiu2sJyCyoF7W3hxz850CFIA\nADs=') else: self.plus_icon=plus_icon if minus_icon == None: self.minus_icon=PhotoImage( data='R0lGODdhCQAJAPEAAAAAAH9/f////wAAACwAAAAACQAJAAAC' \ 'EYyPoivG614LAlg7ZZbxoR8UADs=') else: self.minus_icon=minus_icon # horizontal distance that subtrees are indented self.dist_x=dist_x # vertical distance between rows self.dist_y=dist_y # how far to offset text label self.text_offset=text_offset # flag controlling connecting line display self.line_flag=line_flag # called just before subtree expand/collapse self.get_contents_callback=get_contents_callback # called after drag'n'drop 
self.drop_callback=drop_callback # create root node to get the ball rolling self.root=node_class(parent_node=None, label=root_label, id=root_id, expandable_flag=1, collapsed_icon=self.collapsed_icon, expanded_icon=self.expanded_icon, x=dist_x, y=dist_y, parent_widget=self) # configure for scrollbar(s) x1, y1, x2, y2=self.bbox('all') self.configure(scrollregion=(x1, y1, x2+5, y2+5)) # add a cursor self.cursor_box=self.create_rectangle(0, 0, 0, 0) self.move_cursor(self.root) # make it easy to point to control self.bind('', self.PVT_mousefocus) # totally arbitrary yet hopefully intuitive default keybindings # stole 'em from ones used by microsoft tree control # page-up/page-down self.bind('', self.pagedown) self.bind('', self.pageup) # arrow-up/arrow-down self.bind('', self.next) self.bind('', self.prev) # arrow-left/arrow-right self.bind('', self.ascend) # (hold this down and you expand the entire tree) self.bind('', self.descend) # home/end self.bind('', self.first) self.bind('', self.last) # space bar self.bind('', self.toggle) # ----- PRIVATE METHODS (prefixed with "PVT_") ----- # these methods are subject to change, so please try not to use them def PVT_mousefocus(self, event): """Soak up event argument when moused-over""" self.focus_set() # ----- PUBLIC METHODS ----- def tag_bind(self, tag, seq, *args, **kw_args): """Keep track of callback bindings so we can delete them later. I shouldn't have to do this!!!!""" # pass args to superclass func_id=apply(Canvas.tag_bind, (self, tag, seq)+args, kw_args) # save references self.bindings[tag]=self.bindings.get(tag, [])+[(seq, func_id)] def add_list(self, list=None, name=None, id=None, flag=0, expanded_icon=None, collapsed_icon=None): """Add node construction info to list""" n=Struct() n.name=name n.id=id n.flag=flag if collapsed_icon: n.collapsed_icon=collapsed_icon else: if flag: # it's expandable, use closed folder icon n.collapsed_icon=self.collapsed_icon else: # it's not expandable, use regular file icon n.collapsed_icon=self.regular_icon if flag: if expanded_icon: n.expanded_icon=expanded_icon else: n.expanded_icon=self.expanded_icon else: # not expandable, don't need an icon n.expanded_icon=None if list == None: list=[] list.append(n) return list def add_node(self, name=None, id=None, flag=0, expanded_icon=None, collapsed_icon=None): """Add a node during get_contents_callback()""" self.add_list(self.new_nodes, name, id, flag, expanded_icon, collapsed_icon) def find_full_id(self, search): """Search for a node""" return self.root.PVT_find(search) def cursor_node(self, search): """Return node under cursor""" return self.pos def see(self, *items): """Scroll (in a series of nudges) so items are visible""" x1, y1, x2, y2=apply(self.bbox, items) while x2 > self.canvasx(0)+self.winfo_width(): old=self.canvasx(0) self.xview('scroll', 1, 'units') # avoid endless loop if we can't scroll if old == self.canvasx(0): break while y2 > self.canvasy(0)+self.winfo_height(): old=self.canvasy(0) self.yview('scroll', 1, 'units') if old == self.canvasy(0): break # done in this order to ensure upper-left of object is visible while x1 < self.canvasx(0): old=self.canvasx(0) self.xview('scroll', -1, 'units') if old == self.canvasx(0): break while y1 < self.canvasy(0): old=self.canvasy(0) self.yview('scroll', -1, 'units') if old == self.canvasy(0): break def move_cursor(self, node): """Move cursor to node""" self.pos=node x1, y1, x2, y2=self.bbox(node.symbol, node.label) self.coords(self.cursor_box, x1-1, y1-1, x2+1, y2+1) self.see(node.symbol, node.label) def 
toggle(self, event=None): """Expand/collapse subtree""" self.pos.toggle_state() def next(self, event=None): """Move to next lower visible node""" self.move_cursor(self.pos.next_visible()) def prev(self, event=None): """Move to next higher visible node""" self.move_cursor(self.pos.prev_visible()) def ascend(self, event=None): """Move to immediate parent""" if self.pos.parent_node: # move to parent self.move_cursor(self.pos.parent_node) def descend(self, event=None): """Move right, expanding as we go""" if self.pos.expandable_flag: self.pos.expand() if self.pos.child_nodes: # move to first subnode self.move_cursor(self.pos.child_nodes[0]) return # if no subnodes, move to next sibling self.next() def first(self, event=None): """Go to root node""" # move to root node self.move_cursor(self.root) def last(self, event=None): """Go to last visible node""" # move to bottom-most node self.move_cursor(self.root.PVT_last()) def pageup(self, event=None): """Previous page""" n=self.pos j=self.winfo_height()/self.dist_y for i in range(j-3): n=n.prev_visible() self.yview('scroll', -1, 'pages') self.move_cursor(n) def pagedown(self, event=None): """Next page""" n=self.pos j=self.winfo_height()/self.dist_y for i in range(j-3): n=n.next_visible() self.yview('scroll', 1, 'pages') self.move_cursor(n) # ----- functions for drag'n'drop support ----- def where(self, event): """Determine drag location in canvas coordinates. event.x & event.y don't seem to be what we want.""" # where the corner of the canvas is relative to the screen: x_org=self.winfo_rootx() y_org=self.winfo_rooty() # where the pointer is relative to the canvas widget, # including scrolling x=self.canvasx(event.x_root-x_org) y=self.canvasy(event.y_root-y_org) return x, y def dnd_accept(self, source, event): """Accept dnd messages, i.e. 
we're a legit drop target, and we do implement d&d functions.""" self.target=None return self def dnd_enter(self, source, event): """Get ready to drag or drag has entered widget (create drag object)""" # this flag lets us know there's been drag motion self.drag=1 x, y=self.where(event) x1, y1, x2, y2=source.widget.bbox(source.symbol, source.label) dx, dy=x2-x1, y2-y1 # create dragging icon if source.expanded_flag: self.dnd_symbol=self.create_image(x, y, image=source.expanded_icon) else: self.dnd_symbol=self.create_image(x, y, image=source.collapsed_icon) self.dnd_label=self.create_text(x+self.text_offset, y, text=source.get_label(), justify='left', anchor='w') def dnd_motion(self, source, event): """Move drag icon""" self.drag=1 x, y=self.where(event) x1, y1, x2, y2=self.bbox(self.dnd_symbol, self.dnd_label) self.move(self.dnd_symbol, x-x1+source.x_off, y-y1+source.y_off) self.move(self.dnd_label, x-x1+source.x_off, y-y1+source.y_off) def dnd_leave(self, source, event): """Finish dragging or drag has left widget (destroy drag object)""" self.delete(self.dnd_symbol) self.delete(self.dnd_label) def dnd_commit(self, source, event): """Object has been dropped here""" # call our own dnd_leave() to clean up self.dnd_leave(source, event) # process pending events to detect target node # update_idletasks() doesn't do the trick if source & target are # on different widgets self.update() if not self.target: # no target node return # we must update data structures based on the drop if self.drop_callback: try: # called with dragged node and target node # this is where a file manager would move the actual file # it must also move the nodes around as it wishes self.drop_callback(source, self.target) except: report_callback_exception() #------------------------------------------------------------------------------ # the good 'ol test/demo code if __name__ == '__main__': import os import sys # default routine to get contents of subtree # supply this for a different type of app # argument is the node object being expanded # should call add_node() def get_contents(node): path=apply(os.path.join, node.full_id()) for filename in os.listdir(path): full=os.path.join(path, filename) name=filename folder=0 if os.path.isdir(full): # it's a directory folder=1 elif not os.path.isfile(full): # but it's not a file name=name+' (special)' if os.path.islink(full): # it's a link name=name+' (link to '+os.readlink(full)+')' node.widget.add_node(name=name, id=filename, flag=folder) root=Tk() root.title(os.path.basename(sys.argv[0])) tree=os.sep if sys.platform == 'win32': # we could call the root "My Computer" and mess with get_contents() # to return "A:", "B:", "C:", ... etc. 
as it's children, but that # would just be terminally cute and I'd have to shoot myself tree='C:'+os.sep # create the control t=Tree(master=root, root_id=tree, root_label=tree, get_contents_callback=get_contents, width=300) t.grid(row=0, column=0, sticky='nsew') # make expandable root.grid_rowconfigure(0, weight=1) root.grid_columnconfigure(0, weight=1) # add scrollbars sb=Scrollbar(root) sb.grid(row=0, column=1, sticky='ns') t.configure(yscrollcommand=sb.set) sb.configure(command=t.yview) sb=Scrollbar(root, orient=HORIZONTAL) sb.grid(row=1, column=0, sticky='ew') t.configure(xscrollcommand=sb.set) sb.configure(command=t.xview) # must get focus so keys work for demo t.focus_set() # we could do without this, but it's nice and friendly to have Button(root, text='Quit', command=root.quit).grid(row=2, column=0, columnspan=2) # expand out the root t.root.expand() root.mainloop() ================================================ FILE: src/ccg2xml/arabic.ccg ================================================ ############################################################# # # # arabic.ccg # # # ############################################################# # Author: Ben Wing # Date: April 2006 # This is a grammar for a fragment of Arabic. It's particularly # useful for demonstrating the extended use of macros to handle # complicated morphological inflections. # See the `tiny' grammar (tiny.ccg) for more info about the format # of this file. feature { CASE<2>: nom, acc, gen; NUM<2>: sg, du, pl; GEND<2>: m, f; STATE<2>: cons, non-cons {indef, def}; ANIM<2>: hum, nonhum; PERS<2>: 1st, 2nd, 3rd; RESUMPTIVE<2>: nonres, res; SEM-NUM: sg-X, du-X, pl-X; SEM-PERS: 1st-X, 2nd-X, 3rd-X; TENSE: past, pres; MOOD: indic, subj, juss; # Here's a more complicated hierarchy, from the original tiny grammar. ontology: sem-obj { phys-obj { animate-being { person }, thing }, situation { change { action }, state } }; } rule { no typeraise; typeraise +: n => s; typeraise - $: n => s; typeraise - $: pp => s; typeraise - $: pp/n => s; typechange: s$1 | n[nom] => s$1 ; typechange: n<~2>[cons] => n<2>[3rd,def] /* n[gen,def] ; typechange: n<~2>[cons] => n<2>[3rd,indef] /* n[gen,indef] ; } ########################################################################## # Morphological entries # # (morph.xml) # ########################################################################## word wa:Conj; # "and" word anna:Comp; # "that", introducing sentential complements word inna:Comp; # same, but only after the verb qaal "say" word maa:InterrogPro(thing): 3rd; # "what" word man:InterrogPro(person): 3rd; # "who" word li:Prep; # "what" word fii:Prep; # "who" # This word means "this". word haadhaa { *: sg, m; haadhihi: sg, f; ha_ulaahi: pl; # Bizarrely, this word declines for case only in the dual. haadhaani: du, m, nom; haadhayni: du, m, acc; haadhayni: du, m, gen; haataani: du, f, nom; haatayni: du, f, acc; haatayni: du, f, gen; } # This word means "that". word dhaalik { *: sg, m; tilka: sg, f; ulaa_ika: pl; # Bizarrely, this word declines for case only in the dual. dhaanika: du, m, nom; dhaynika: du, m, acc; dhaynika: du, m, gen; taanika: du, f, nom; taynika: du, f, acc; taynika: du, f, gen; } # This is the relative pronoun. word al-ladhii { *: sg, m; al-latii: sg, f; al-ladhiina: pl, m; al-laati: pl, f; # Bizarrely, this word declines for case only in the dual. 
al-ladhaani: du, m, nom; al-ladhayni: du, m, acc; al-ladhayni: du, m, gen; al-lataani: du, f, nom; al-latayni: du, f, acc; al-latayni: du, f, gen; } word pro:Pro { ana: 1st, 1st-X, sg, sg-X; anta: 2nd, 2nd-X, sg, sg-X, m; anti: 2nd, 2nd-X, sg, sg-X, f; huwa: 3rd, 3rd-X, sg, sg-X, m; hiya: 3rd, 3rd-X, sg, sg-X, f; naHnu: 1st, 1st-X, pl, pl-X; antun: 2nd, 2nd-X, pl, pl-X, m; antunna: 2nd, 2nd-X, pl, pl-X, f; hum: 3rd, 3rd-X, pl, pl-X, m; hunna: 3rd, 3rd-X, pl, pl-X, f; } word ii:: 1st, 1st-X, sg, sg-X; word nii:: 1st, 1st-X, sg, sg-X; word ka:: 2nd, 2nd-X, sg, sg-X, m; word ki:: 2nd, 2nd-X, sg, sg-X, f; word hu:: 3rd, 3rd-X, sg, sg-X, m; word haa:: 3rd, 3rd-X, sg, sg-X, f; word naa:: 1st, 1st-X, pl, pl-X; word kum:: 2nd, 2nd-X, pl, pl-X, m; word kunna:: 2nd, 2nd-X, pl, pl-X, f; word hum:: 3rd, 3rd-X, pl, pl-X, m; word hunna:: 3rd, 3rd-X, pl, pl-X, f; ############################################# # Nouns # ############################################# # This shows how a reasonably complicated morphology can be accommodated. # It is certainly possible that some of this may (and probably should) # be offloaded into a separate morphology-processing engine. However, # even in that case there is often a good deal more to the lexicon. # We show a couple examples of complete paradigms, in order to make it # easier to understand what's going on below. # Here is a typical noun (kitaab "book") with a broken plural (kutub "books"). # For nouns with broken plurals, the plural is typically declined like # the singular. Note that Arabic nouns are conjugated for three numbers # (singular, dual, plural), three cases (nominative, accusative, dative), # and three states (indefinite, definite, construct). (The construct state # is used for nouns that are modified by other nouns -- e.g. "book" in # "the book of Mary".) # Form Nominative Accusative Dative # --------------------------------------------------------------- # sg.indef kitaabun kitaaban kitaabin # sg.def al-kitaabu al-kitaaba al-kitaabi # sg.cons kitaabu kitaaba kitaabi # # du.indef kitaabaani kitaabayni kitaabayni # du.def al-kitaabaani al-kitaabayni al-kitaabayni # du.cons kitaabaa kitaabay kitaabay # # pl.indef kutubun kutuban kutubin # pl.def al-kutubu al-kutuba al-kutubi # pl.cons kutubu kutuba kutubi # Here is a typical noun (mudarris "teacher") with a different kind of # plural, a so-called "strong masculine plural", which has its own declension. # Form Nominative Accusative Dative # --------------------------------------------------------------- # sg.indef mudarrisun mudarrisan mudarrisin # sg.def al-mudarrisu al-mudarrisa al-mudarrisi # sg.cons mudarrisu mudarrisa mudarrisi # # du.indef mudarrisaani mudarrisayni mudarrisayni # du.def al-mudarrisaani al-mudarrisayni al-mudarrisayni # du.cons mudarrisaa mudarrisay mudarrisay # # pl.indef mudarrisuuna mudarrisiina mudarrisiina # pl.def al-mudarrisuuna al-mudarrisiina al-mudarrisiina # pl.cons mudarrisuu mudarrisii mudarrisii # Here, we make heavy use of macros. # This macro says: Every time an expression of the form # three-form-decl(...) occurs, replace it with the text that comes after. # The parameters will be substituted into the text. The braces that # denote the macro's text do *NOT* form part of the text that is substituted. # Note that macro substitutions are processed recursively: If the text # of a macro substitution contains calls to other macros, they will also # be processed. This makes "inheritance" very easy to implement. 
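# To make the substitution mechanism concrete, here is a tiny, purely
# illustrative macro (it is NOT part of this grammar; the name and the
# forms are invented just for the example):
#
#   def nom-acc(form) {
#     form.u: nom;
#     form.a: acc;
#   }
#
# A later call nom-acc(kitaab) would be replaced, before parsing proper,
# by the two entries
#
#   kitaabu: nom;
#   kitaaba: acc;
#
# The "." operator glues the argument and the suffix into one form, and
# because expansion is recursive, the replacement text may itself call
# other macros -- which is exactly how the declension macros below are
# layered on top of one another.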
# This macro is used for a particular paradigm corresponding to a # particular number of a word. def three-different-form-decl(indef-form, def-form, cons-form, indef-nom, indef-acc, indef-gen, def-nom, def-acc, def-gen, cons-nom, cons-acc, cons-gen, morph-num, sem-num, gend) { indef-form.indef-nom: morph-num, sem-num, gend, nom, indef; indef-form.indef-acc: morph-num, sem-num, gend, acc, indef; indef-form.indef-gen: morph-num, sem-num, gend, gen, indef; add-al(def-form.def-nom): morph-num, sem-num, gend, nom, def; add-al(def-form.def-acc): morph-num, sem-num, gend, acc, def; add-al(def-form.def-gen): morph-num, sem-num, gend, gen, def; cons-form.cons-nom: morph-num, sem-num, gend, nom, cons; cons-form.cons-acc: morph-num, sem-num, gend, acc, cons; cons-form.cons-gen: morph-num, sem-num, gend, gen, cons; } # It's questionable whether we should do this. This assimilates al- # to a following coronal consonant, e.g. ar-rajul, as-sigaara, # ath-thalj, an-nuur, aDH-DHuhr, etc. def add-al(form) regsub('^al-([std]h|DH|[tdszrnTDSZL])', 'a\1-\1', al-.form) def three-form-decl(form, indef-nom, indef-acc, indef-gen, def-nom, def-acc, def-gen, cons-nom, cons-acc, cons-gen, morph-num, sem-num, gend) { three-different-form-decl(form, form, form, indef-nom, indef-acc, indef-gen, def-nom, def-acc, def-gen, cons-nom, cons-acc, cons-gen, morph-num, sem-num, gend) } # Using the above macro, we create two more macros to handle two common # paradigm types: Accusative and genitive are the same, and the # definite is either the same as the construct (two-form-decl-1) or # the same as the indefinite (two-form-decl-2). def two-form-decl-1(form, non-cons-nom, non-cons-obl, cons-nom, cons-obl, morph-num, sem-num, gend) { three-form-decl(form, non-cons-nom, non-cons-obl, non-cons-obl, non-cons-nom, non-cons-obl, non-cons-obl, cons-nom, cons-obl, cons-obl, morph-num, sem-num, gend) } def two-form-decl-2(form, indef-nom, indef-obl, non-indef-nom, non-indef-obl, morph-num, sem-num, gend) { three-form-decl(form, indef-nom, indef-obl, indef-obl, non-indef-nom, non-indef-obl, non-indef-obl, non-indef-nom, non-indef-obl, non-indef-obl, morph-num, sem-num, gend) } # In turn we create macros for particular paradigms: strong masculine ("uun"), # strong feminine ("aat"), dual, and basic triptote (the paradigm for # "kitaab" above and, in general, most singulars). # Note that an alternative to using braces is to put the macro text on # the same line as the `def' part of the macro (backslashes can be used # to join multiple lines together). def uun-plural(form) two-form-decl-1(form, uuna, iina, uu, ii, pl, pl-X, m) def aat-plural(form) \ two-form-decl-2(form, aatun, aatin, aatu, aati, pl, pl-X, f) def dual(form, gend) \ two-form-decl-1(form, aani, ayni, aa, ay, du, du-X, gend) def triptote(form, morph-num, sem-num, gend) \ three-form-decl(form, un, an, in, u, a, i, u, a, i, morph-num, sem-num, gend) # Here we define macros for full paradigms for words. Note how semicolons # are not used, because they are supplied by the macro text itself. # (Consult the text for three-form-decl() above, and remember that the # braces denoting the macro text are not actually part of the text. This # means that if you really want braces as the outermost thing in some # macro text, you'll need to supply two levels of braces.) 
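# As a rough sanity check on the machinery above (an illustrative
# expansion only, not additional entries): a call such as
# triptote(kitaab, sg, sg-X, m) passes through three-form-decl and
# three-different-form-decl and comes out as the nine singular entries
# of the kitaab table given earlier, roughly
#
#   kitaabun: sg, sg-X, m, nom, indef;
#   kitaaban: sg, sg-X, m, acc, indef;
#   kitaabin: sg, sg-X, m, gen, indef;
#   al-kitaabu: sg, sg-X, m, nom, def;
#   al-kitaaba: sg, sg-X, m, acc, def;
#   al-kitaabi: sg, sg-X, m, gen, def;
#   kitaabu: sg, sg-X, m, nom, cons;
#   kitaaba: sg, sg-X, m, acc, cons;
#   kitaabi: sg, sg-X, m, gen, cons;
#
# with the definite forms produced via add-al(). Since /k/ is not a
# coronal consonant the prefix stays al-; a stem like rajul would instead
# come out as ar-rajulu (compare the testbed sentences at the end of
# this file).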
def thing(sing, plur) { word sing:N(thing) { triptote(sing, sg, sg-X, m) dual(sing, m) triptote(plur, sg, pl-X, f) } } def fem-thing(sing, plur) { word sing:N(thing) { triptote(sing.t, sg, sg-X, f) dual(sing.t, f) triptote(plur, sg, pl-X, f) } } def person(sing, plur, gend) { word sing:N(person) { triptote(sing, sg, sg-X, gend) dual(sing, gend) triptote(plur, pl, pl-X, gend) } } def male(sing, plur) person(sing, plur, m) def female(sing, plur) person(sing, plur, f) def strong-male(sing) { word sing:N(person) { triptote(sing, sg, sg-X, m) dual(sing, m) uun-plural(sing) } } # Here we define the actual words. Note how short these definitions are, # specifying only what's unpredictable. thing(kitaab, kutub) thing(waqt, _awqaat) thing(Harf, Huruuf) thing(dars, duruus) thing(waqt, _awqaat) fem-thing(sigaara, sagaayir) fem-thing(madiina, mudun) male(rajul, rijaal) male(walad, _awlaad) male(Taalib, Tullaab) female(bint, _abnaat) strong-male(mudarris) word imra_a:N(person) { three-different-form-decl(imra_at, mar_at, imra_at, un, an, in, u, a, i, u, a, i, sg, sg-X, f) three-different-form-decl(imra_at, mar_at, imra_at, aani, ayni, ayni, aani, ayni, ayni, aa, ay, ay, du, du-X, f) triptote(nisaa_, pl, pl-X, f) } def extended_construct_word(stem, plur) { word stem:N(person) { three-form-decl(stem, un, an, in, u, a, i, uu, aa, ii, sg, sg-X, m) dual(stem, m) triptote(plur, pl, pl-X, m) } } extended_construct_word(_ax, _ixwaan) extended_construct_word(_ab, _abnaa_) # Typical paradigms: # 1sg 'aktaa 'aktaa 'akta | # 2sg.m taktaa taktaa takta | # 2sg.f taktayna taktay taktay | # 3sg.m yaktaa yaktaa yakta | # 3sg.f taktaa taktaa takta | # 2du taktayaani taktayaa taktayaa | # 3du.m yaktayaani yaktayaa yaktayaa | # 3du.f taktayaani taktayaa taktayaa | # 1pl naktaa naktaa nakta | # 2pl.m taktawna taktaw taktaw | # 2pl.f taktayna taktayna taktayna | # 3pl.m yaktawna yaktaw yaktaw | # 3pl.f yaktayna yaktayna yaktayna | # # 1sg 'aktuu 'aktuwa 'aktu | 'aktii 'aktiya 'akti # 2sg.m taktuu taktuwa taktu | taktii taktiya takti # 2sg.f taktiina taktii taktii | taktiina taktii taktii # 3sg.m yaktuu yaktuwa yaktu | yaktii yaktiya yakti # 3sg.f taktuu taktuwa taktu | taktii taktiya takti # 2du taktuwaani taktuwaa taktuwaa | taktiyaani taktiyaa taktiyaa # 3du.m yaktuwaani yaktuwaa yaktuwaa | yaktiyaani yaktiyaa yaktiyaa # 3du.f taktuwaani taktuwaa taktuwaa | taktiyaani taktiyaa taktiyaa # 1pl naktuu naktuwa naktu | naktii naktiya nakti # 2pl.m taktuuna taktuu taktuu | taktuuna taktuu taktuu # 2pl.f taktuuna taktuuna taktuuna | taktiina taktiina taktiina # 3pl.m yaktuuna yaktuu yaktuu | yaktuuna yaktuu yaktuu # 3pl.f yaktuuna yaktuuna yaktuuna | yaktiina yaktiina yaktiina def two-form-past(formv, formc) { formc.tu: past, 1st, sg; formc.ta: past, 2nd, m, sg; formc.ti: past, 2nd, f, sg; formv.a: past, 3rd, m, sg; formv.at: past, 3rd, f, sg; formc.tumaa: past, 2nd, du; formv.aa: past, 3rd, m, du; formv.ataa: past, 3rd, f, du; formc.naa: past, 1st, pl; formc.tum: past, 2nd, m, pl; formc.tunna: past, 2nd, f, pl; formv.uu: past, 3rd, m, pl; formc.na: past, 3rd, f, pl; } def 3rd-weak-past-ay(form) { form.ay.tu: past, 1st, sg; form.ay.ta: past, 2nd, m, sg; form.ay.ti: past, 2nd, f, sg; form.aa: past, 3rd, m, sg; form.at: past, 3rd, f, sg; form.ay.tumaa: past, 2nd, du; form.ay.aa: past, 3rd, m, du; form.ataa: past, 3rd, f, du; form.ay.naa: past, 1st, pl; form.ay.tum: past, 2nd, m, pl; form.ay.tunna: past, 2nd, f, pl; form.aw: past, 3rd, m, pl; form.ay.na: past, 3rd, f, pl; } def 3rd-weak-past-aw(form) { form.aw.tu: past, 1st, sg; 
form.aw.ta: past, 2nd, m, sg; form.aw.ti: past, 2nd, f, sg; form.aa: past, 3rd, m, sg; form.at: past, 3rd, f, sg; form.aw.tumaa: past, 2nd, du; form.aw.aa: past, 3rd, m, du; form.ataa: past, 3rd, f, du; form.aw.naa: past, 1st, pl; form.aw.tum: past, 2nd, m, pl; form.aw.tunna: past, 2nd, f, pl; form.aw: past, 3rd, m, pl; form.aw.na: past, 3rd, f, pl; } def 3rd-weak-past-ii(form) { form.ii.tu: past, 1st, sg; form.ii.ta: past, 2nd, m, sg; form.ii.ti: past, 2nd, f, sg; form.iya: past, 3rd, m, sg; form.iyat: past, 3rd, f, sg; form.ii.tumaa: past, 2nd, du; form.iy.aa: past, 3rd, m, du; form.iy.ataa: past, 3rd, f, du; form.ii.naa: past, 1st, pl; form.ii.tum: past, 2nd, m, pl; form.ii.tunna: past, 2nd, f, pl; form.uu: past, 3rd, m, pl; form.ii.na: past, 3rd, f, pl; } def strong-past(form) two-form-past(form, form) # In general, almost all Arabic present-tense verbs of a particular mood # can be defined using five forms. Verbs with a hamza in the first radical # have a problem in the first-singular; ideally this should be handled # automatically using a regexp or something of that sort, but we don't have # such support currently, so we use an optional param. def gen-pres(mood, fsing, fsing-fem, fdual, fplur-masc, fplur-fem) { # This shows how you can use regular expressions if need be. # regsub(string, regex, repl) is a special built-in that does regular- # expression substitution on STRING, replacing all occurrences of # REGEX with REPL. Regular-expression syntax is as in Python. # In this case, Arabic verbs have a phonetic rule that eliminates # two glottal stops occurring near each other at the beginning of a # word. For example, _a_kulu -> _aakulu, and _u_kalu -> _uukalu. # (That is, the vowel is lengthened.) # _ . regsub(foo, bar, fsing): pres, mood, 1st, sg; _ . regsub('^([aiu])_', '\1\1', fsing): pres, mood, 1st, sg; # _.fsing: pres, mood, 1st, sg; t.fsing: pres, mood, 2nd, m, sg; t.fsing-fem: pres, mood, 2nd, f, sg; y.fsing: pres, mood, 3rd, m, sg; t.fsing: pres, mood, 3rd, f, sg; t.fdual: pres, mood, 2nd, du; y.fdual: pres, mood, 3rd, m, du; t.fdual: pres, mood, 3rd, f, du; n.fsing: pres, mood, 1st, pl; t.fplur-masc: pres, mood, 2nd, m, pl; t.fplur-fem: pres, mood, 2nd, f, pl; y.fplur-masc: pres, mood, 3rd, m, pl; y.fplur-fem: pres, mood, 3rd, f, pl; } # The "two-form" present uses normal (non-3rd-weak) endings but may # have two forms of the root, one form vocalic endings (almost all of them) # and one for consonant endings (only the feminine plural). This # encompasses 2nd-weak verbs and doubled verbs, and (trivially) strong verbs. def two-form-pres-indic(formv, formc) { gen-pres(indic, formv.u, formv.iina, formv.aani, formv.uuna, formc.na) } def two-form-pres-subj(formv, formc) { gen-pres(subj, formv.a, formv.ii, formv.aa, formv.uu, formc.na) } # The jussive is different because the base form (fsing) has no ending. # This means that it may assume the consonant form instead of the vowel # form, or may have a number of variants (in particular, for doubled # verbs). So the base form needs to be given explicitly. 
def two-form-pres-juss(base, formv, formc) { gen-pres(juss, base, formv.ii, formv.aa, formv.uu, formc.na) } def strong-pres(form) { two-form-pres-indic(form, form) two-form-pres-subj(form, form) two-form-pres-juss(form, form, form) } def 2nd-weak-pres(formv, formc) { two-form-pres-indic(formv, formc) two-form-pres-subj(formv, formc) two-form-pres-juss(formc, formv, formc) } def doubled-pres(formv, formc) { two-form-pres-indic(formv, formc) two-form-pres-subj(formv, formc) two-form-pres-juss(formc, formv, formc) two-form-pres-juss(formv.a, formv, formc) two-form-pres-juss(formv.i, formv, formc) } # Verbs whose third radical is a /w/ or a /y/ have all manner of exceptional # forms; easiest just to list them. In general, there are three types, # depending on whether the base singular forms end in -aa, -ii, or -uu. def 3rd-weak-pres-aa(form) { gen-pres(indic, form.aa, form.ayna, form.ayaani, form.awna, form.ayna) gen-pres(subj, form.aa, form.ay, form.ayaa, form.aw, form.ayna) # Note the shortened vowel here. gen-pres(juss, form.a, form.ay, form.ayaa, form.aw, form.ayna) } def 3rd-weak-pres-ii(form) { gen-pres(indic, form.ii, form.iina, form.iyaani, form.uuna, form.iina) gen-pres(subj, form.iya, form.ii, form.iyaa, form.uu, form.iina) # Note the shortened vowel here. gen-pres(juss, form.i, form.ii, form.iyaa, form.uu, form.iina) } def 3rd-weak-pres-uu(form) { gen-pres(indic, form.uu, form.iina, form.uwaani, form.uuna, form.uuna) gen-pres(subj, form.uwa, form.ii, form.uwaa, form.uu, form.uuna) # Note the shortened vowel here. gen-pres(juss, form.u, form.ii, form.uwaa, form.uu, form.uuna) } def 2nd-weak-verb(pastv, props, pastc, presv, presc) { word pastv: props { two-form-past(pastv, pastc) 2nd-weak-pres(presv, presc) } } # Note the way that macro calls can be constructed as well. Here, the # value of PAST_TYPE is the suffix at the end of the macro name. def 3rd-weak-verb(past_stem, props, past_type, pres_stem, pres_type) { word past_stem . past_type: props { 3rd-weak-past- . past_type(past_stem) 3rd-weak-pres- . pres_type(pres_stem) } } def strong-verb(past, props, pres) { word past: props { strong-past(past) strong-pres(pres) } } 2nd-weak-verb(kaan, TransV(pred=be), kun, akuun, akun) 2nd-weak-verb(naam, IntransV(pred=sleep), nim, anaam, anam) 2nd-weak-verb(qaal, SayV(pred=say), qul, aquul, aqul) strong-verb(katab, TransV(pred=write), aktub) strong-verb(dhahab, IntransV(pred=go), adhhab) # Note that the following verb, which begins with a glottal stop, # will have a modification made to it in the first-person singular present. # (See above.) 
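# (A rough trace of that modification, for illustration only: for the
# present stem a_kul used just below, gen-pres builds the 1sg indicative
# as _ . regsub('^([aiu])_', '\1\1', a_kul.u); the regsub rewrites
# "a_kulu" to "aakulu", so the surface form comes out as _aakulu rather
# than *_a_kulu -- exactly the vowel lengthening described for gen-pres
# above.)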
strong-verb(_akal, IntransV TransV (pred=eat), a_kul) 3rd-weak-verb(ra_, TransV(pred=see), ay, ar, aa) 3rd-weak-verb(_a9T, DitransV(pred=give), ay, u9T, ii) 3rd-weak-verb(laq, TransV(pred=find), ii, alq, aa) strong-verb(9araf, ThinkV(pred=know), a9rif) # see also 9alam strong-verb(tafakkar, ThinkV(pred=think), atafakkar) strong-verb(ta9allam, ThinkV(pred=learn), ata9allam) family N { entry: n<2>[X, 3rd, nonres]: X:sem-obj(*); } family InterrogPro(Pro) { entry: s/*(s/n<2>[res]); entry: s/*(s|n<2>[nonres,nom]); entry: s/*(s/n<2>[nonres,acc]); member: maa, man; } family Pro { entry: n<2>[X, nom, def, nonres]: X:sem-obj(*); member: pro; } family Rel { entry: (n<~2>[CASE,nonres]\n<~2>[nonres])/*(s/n<2>[res]); entry: (n<~2>[CASE,nonres]\n<~2>[nonres])/*(s|n<2>[nonres,nom]); entry: (n<~2>[CASE,nonres]\n<~2>[nonres])/*(s/n<2>[nonres,acc]); member: al-ladhii; } family AndConj(Conj) { entry: n[pl, CASE, STATE] \* n[CASE, STATE] /* n[CASE, STATE]; entry: s$1 \* s$1 /* s$1; member: wa; } family Det(indexRel=det) { entry: n<2>[X, def, nonres] /^ n<2>[X]: X:sem-obj(*); member: haadhaa, dhaalik; } # good luck on this one! construct even more complicated ones! # #ar-rajulu al-ladhii kataba al-kutuba ra_aa wa _a9Taa li binti hu al-kilaaba al-latii akalat sagaayira mudarrisii al-waladi #"the man that wrote the books saw and gave to his daughter the dogs that ate the cigarettes of the boy's teachers." # Works, correctly: #ar-rajulu al-ladhii kataba al-kutuba ra_aa wa _a9Taa li binti hu as-sagaayira al-latii qultu inna al-waladu tafakkara anna al-mar_atu _a9Tat haa li ha_ulaahi al-mudarrisiina #Bad #ar-rajula al-ladhii katabat al-kutuba ra_aa wa _a9Taa li binti hu as-sagaayira al-latii qultu inna al-waladu tafakkara anna al-mar_atu _a9Tat haa li ha_ulaahi al-mudarrisiina #"the man that wrote the books saw and gave to his daughter the cigarettes that I said that the boy thought that the woman gave them to those teachers" # Fails, correctly: #ar-rajulu al-ladhii kataba al-kutuba ra_aa wa _a9Taa li binti hu as-sagaayira al-latii qultu inna al-waladu tafakkara anna al-mar_atu _a9Tat hu li ha-ulaahi al-mudarrisiina # Works: # ar-rajulu al-ladhii al-waladu _a9Taa as-sagaayira li binti hu dhahaba # ar-rajulu _a9Taa li binti hu as-sagaayira # ar-rajulu al-ladhii waladu hu _a9Taa as-sagaayira li al-binti dhahaba # Won't work: # ar-rajulu al-ladhii al-waladu _a9Taa li binti hu as-sagaayira dhahaba # ar-rajulu al-ladhii waladu hu _a9Taa li al-binti as-sagaayira dhahaba family PossClitic(Cli, indexRel=poss) { entry: n<~1>[X, def, nonres] \* n<1>[X, cons]: X:sem-obj(*); entry: (n<~1>[X, def, nonres] / n<2>[res]) \* n<1>[X, cons]: X:sem-obj(*); member: ii, ka, ki, hu, haa, naa, kum, kunna, hum, hunna; } family ObjClitic(Cli, indexRel=poss) { entry: (s$1 | n<3>[nonres]) \ (s$1 / n<2>[nonres] | n<3>[nom,nonres]); entry: (s$1 / n<~2>[res] | n<3>[nonres]) \ (s$1 / n<2>[nonres] | n<3>[nom,nonres]); member: nii, ka, ki, hu, haa, naa, kum, kunna, hum, hunna; } family Adj(indexRel=adj) { entry: n<2>[X, NUM, GEND, CASE, STATE] \ n<2>[X]: X:sem-obj(*); } family Prep-Nom(Prep, indexRel="*NoSem*") { # The pp<~3> notation generates an 'inheritsFrom' tag rather than # an 'id' tag for the feature structure. 
entry: pp<~3>[lex=*] /< n<3>[gen,nonres]; member: li, fii; } family Comp(indexRel="*NoSem*") { entry: sbar<~1>[lex=*] / s<1>; member: anna, inna; } family IntransV(V) { entry: s[E] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being); } family SayV(V) { entry: s[E] / sbar[Z, lex=inna] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Z); } family ThinkV(V) { entry: s[E] / sbar[Z, lex=anna] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Z); } family TransV(V) { entry: s[E] / n[Y,acc,nonres] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Y:sem-obj); } family DitransV(V) { # The first slash (on the pp) is marked with a mode allowing backward xcomp. entry: s[E] /< pp[Z,lex=li] / n[Y,acc,nonres] | n<2>[X,NUM,PERS,GEND,non-cons,nonres,nom]: E:action(* ^ X:animate-being ^ Y:sem-obj ^ Z:animate-being) ; } testbed { # different states of subject rajulun dhahaba: 1; ar-rajulu dhahaba: 1; rajulu dhahaba: 0; # subject-verb agreement rajulun dhahabuu: 0; ar-rajulu dhahabuu: 0; rajulu dhahabuu: 0; # gender agreement ar-rajulu dhahaba: 1; ar-rajulu dhahabat: 0; al-bintu dhahaba: 0; al-bintu dhahabat: 1; # possession rajulun hu dhahaba: 0; ar-rajulu hu dhahaba: 0; rajulu hu dhahaba: 1; # subject case ar-rajula dhahaba: 0; ar-rajuli dhahaba: 0; # case in construct phrase _axuu ar-rajuli dhahaba: 1; _axuu ar-rajula dhahaba: 0; _axuu ar-rajulu dhahaba: 0; # construct state in construct phrase _axun ar-rajuli dhahaba: 0; al-_axu ar-rajuli dhahaba: 0; # object case ar-rajulu ra_aa al-kitaaba: 1; ar-rajulu ra_aa al-kitaabi: 0; ar-rajulu ra_aa al-kitaabu: 0; # preposition case ar-rajulu _a9Taa al-kitaaba li al-waladi: 1; ar-rajulu _a9Taa al-kitaaba li al-waladu: 0; ar-rajulu _a9Taa al-kitaaba li al-walada: 0; # subcategorization ar-rajulu ra_aa al-kitaaba li al-waladi: 0; # backward xcomp ar-rajulu _a9Taa li al-waladi al-kitaaba: 1; _a9Taa ar-rajulu li al-waladi al-kitaaba: 1; # object clitics ana ra_aytu hu: 1; ra_aytu hu ana: 1; ra_aytu ana hu: 0; hu ra_aytu ana: 0; hu ana ra_aytu: 0; huwa ra_aa nii: 1; huwa ra_aa ii: 0; huwa ra_aa ana: 0; ar-rajulu _a9Taa haa li al-waladi: 1; _a9Taa haa ar-rajulu li al-waladi: 1; # relative clauses # "I gave it to the man that the girl saw him" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at hu: 3; # "I gave it to the man that the girl saw her" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at haa: 0; # "I gave it to the man that the girl saw" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at: 0; # "I gave it to the man that the girl saw the boy" _a9Taytu haa li ar-rajuli al-ladhii al-bintu ra_at al-walada: 0; } ================================================ FILE: src/ccg2xml/build.xml ================================================ ================================================ FILE: src/ccg2xml/ccg.ply ================================================ # Author: Ben Wing # Date: November 2006 ############################################################################# # # # ccg.ply # # # # Convert a .ccg file, specifying a CCG grammar, into files lexicon.xml, # # rules.xml, morph.xml, types.xml and grammar.xml. You can't actually # # run this file itself; you have to use convert-ply.py to convert it # # into a Python file (ccg2xml), which you then run to generate the XML # # files from the .ccg input. For a description of the format of this # # file, see the comments in convert-ply.py. 
# # # ############################################################################# import sys import re import optparse import copy import os import cStringIO # Local imports import lex, yacc # Get options usage = """%prog [OPTIONS] FILE ... Generate appropriate XML files for input to OpenCCG. """ parser = optparse.OptionParser(usage=usage) parser.add_option("-o", "--omit-output", help="""Omit the specified files from the output. Value should be a list separated by commas or spaces. The allowed values are grammar, morph, lexicon, rules, types, and testbed. If you put a + sign before the list, it means output *only* the specified files.""" ) parser.add_option("-p", "--prefix", help="""Optional prefix to attach to each of the generated files, so that output from different files can occur in the same directory. Defaults to the base name of the input file, minus any extension, plus a hyphen. If you want such a hyphen or similar char, add it yourself.""", metavar="DIR" ) parser.add_option("-d", "--dir", help="""Directory to store files in (defaults to current directory).""", metavar="DIR" ) parser.add_option("-q", "--quiet", action="store_true", help="Don't output explanatory messages, but only warnings and errors.") parser.add_option("-t", "--transformed-input", action="store_true", help="Output transformed input after macro substitutions have been applied.") parser.add_option("-y", "--yacc-debug", action="store_true", help="Show more output about the YACC parser generation. Also probably generate some extra files, e.g. parser.out, containing info about the generated parser.") parser.add_option("-m", "--macro-debug", action="store_true", help="Dump macro definitions at end of file.") parser.add_option("--super-macro-debug", action="store_true", help="Show copious output about macro expansions.") def parse_arguments(argv): global options, global_args (options, global_args) = parser.parse_args(argv) # Global variables used for debugging; we may move them into the # global-state variable global lex_debug global xml_debug global yacc_debug global macro_debug global super_macro_debug lex_debug = 0 xml_debug = 0 yacc_debug = options.yacc_debug macro_debug = options.macro_debug super_macro_debug = options.super_macro_debug ######################################################################## # Utility functions # ######################################################################## # CONVENTIONS: # # --------- XML ---------- # # Thankfully, the structure of XML is extremely simple. We represent # a single XML statement of the form # # # # ... # gurgle # # # as a list # # ['biteme', [('foo', '1'), ('blorp', 'baz')], # ['bitemetoo', ...], # 'gurgle' # ] # # i.e. an XML statement corresponds to a list where the first element # is the statement name, the second element lists any properties, and # the remaining elements list items inside the statement. # # ----------- Property lists ------------- # # The second element of an XML statement in list form is a "property list", # a list of two-element tuples (property and value). Some functions below # (e.g. `getprop', `putprop') manipulate property lists. # # FIXME: Just use a hash table. # # ---------- Abstract syntax trees ----------- # # We use classes to represent statements and blocks. Below this level, it's # simpler to just use the XML that we ultimately have to generate anyway. # The conventions for using XML are either to use property lists or lists of # XML statements in the list form outlined above. 
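# To make the list form above concrete (an illustrative value only; the
# element and attribute names are arbitrary), a nested list such as
#
#   ['biteme', [('foo', '1'), ('blorp', 'baz')],
#       ['bitemetoo', [('x', 'y')]],
#       'gurgle']
#
# is rendered by print_xml() below as XML along the lines of
#
#   <biteme foo="1" blorp="baz">
#     <bitemetoo x="y"/>
#     gurgle
#   </biteme>
#
# i.e. the element name comes first, then the property list of
# (attribute, value) pairs, then any child statements or literal text;
# elements with no children are emitted in self-closing form.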
############################# # Handling XML # ############################# def xml_sub(crap): if type(crap) is not str: crap = str(crap) crap = crap.replace('<', '<') crap = crap.replace('>', '>') return crap def print_xml_1(file, xml, indent=0): if xml_debug > 1: errout("%sPrinting: %s\n" % (' ' * indent, str(xml))) if type(xml) is not list: file.write('%s%s\n' % (' ' * indent, xml_sub(xml))) else: check_arg_type("XML statement", xml[0], str) file.write(' ' * indent) file.write('<%s' % xml_sub(xml[0])) for x in xml[1]: check_arg_type("XML statement", x, tuple) if len(x) != 2: raise TypeError("Bad tuple pair: " + str(x)) file.write(' %s="%s"' % (xml_sub(x[0]), xml_sub(x[1]))) subargs = xml[2:] if not subargs: file.write('/>\n') else: file.write('>\n') for x in subargs: print_xml_1(file, x, indent + 2) file.write(' ' * indent) file.write('\n' % xml_sub(xml[0])) # Pretty-print a section of XML, in the format above, to FILE. # Start at indent INDENT. def print_xml(file, xml): if xml_debug == 1: errout("Printing: %s\n" % str(xml)) print_xml_1(file, xml) # Return True if PROP is seen as a property in PROPLIST, a list of tuples # of (prop, value) def property_specified(prop, proplist): return not not ['foo' for (x,y) in proplist if x == prop] # Return value of property PROP in PROPLIST; signal an error if not found. def getprop(prop, proplist): for (x,y) in proplist: if x == prop: return y raise ValueError("Property %s not found in %s" % (prop, proplist)) # Return value of property PROP in PROPLIST, or DEFAULT. def getoptprop(prop, proplist, default=None): for (x,y) in proplist: if x == prop: return y return default # Replace value of property PROP with VALUE in PROPLIST. def putprop(prop, value, proplist): for i in xrange(len(proplist)): if proplist[i][0] == prop: proplist[i] = (prop, value) return else: proplist += [(prop, value)] # Replace property named PROP with NEW in PROPLIST. Often this is called with # with PROP equal to None; the None occurs when a PROP=VALUE clause is expected # but a bare value is supplied. The context will supply a particular default # property (e.g. 'name') to be used when the property name is omitted, but the # generic code to handle property-value clauses doesn't know what this is. # The surrounding code calls property_name_replace() to fill in the proper name. def property_name_replace(prop, new, proplist): for i in xrange(len(proplist)): if proplist[i][0] == prop: proplist[i] = (new, proplist[i][1]) ############################# # Error-handling # ############################# def init_errors(errors_to_string): # Count of number of errors seen so far. global error_count error_count = 0 global write_errors_to_string write_errors_to_string = errors_to_string global stdout_file, stderr_file if errors_to_string: stdout_file = cStringIO.StringIO() stderr_file = cStringIO.StringIO() else: stdout_file = sys.stdout stderr_file = sys.stderr global message_log message_log = [] def save_errors(cur): cur.error_count = error_count cur.write_errors_to_string = write_errors_to_string cur.stdout_file = stdout_file cur.stderr_file = stderr_file class InternalError(StandardError): pass def argformat(format, arg): if type(format) is str: return format % arg else: return str(format) # Throw an error, like fprintf(stderr, ...) def synerr(format, *arg): raise SyntaxError(argformat(format, arg)) # Output to stderr, maybe. But output to stdout if our input is being # output at the same time, so the two will stay in sync. 
def maybe_errout(str): # Force display of error # FIXME: Maybe we could dump all errors into a single # window display and show the messages together #showerror('Message', str) if options.transformed_input: stdout_file.write(str) else: stderr_file.write(str) def error_or_warning(title, lineno, format, *arg): formatted_arg = argformat(format, arg) if lineno: maybe_errout("%s, line %s: %s\n" % (title, lineno, formatted_arg)) else: maybe_errout("%s: %s\n" % (title, formatted_arg)) # Add the message as a tuple, for easy recall in the editor # Note: lineno being put in irrespective of the fact # of whether it exists or not # FIXME!! The purpose of errors_to_string and message_log duplicate # each other somewhat. Clean up. global message_log message_log += [(title, lineno, formatted_arg)] # Write formatted arguments to stderr, with Error: printed. def error(lineno, format, *arg): global error_count error_count += 1 error_or_warning('Error', lineno, format, *arg) # Write formatted arguments to stderr, with Warning: printed. def warning(lineno, format, *arg): global warning_count warning_count += 1 error_or_warning('Warning', lineno, format, *arg) # Write formatted arguments to stdout. def outout(format, *arg): stdout_file.write(argformat(format, arg)) # Write formatted arguments to stderr. def errout(format, *arg): stderr_file.write(argformat(format, arg)) # Debugging output: Always to sys.stderr. def debug(format, *arg): sys.stderr.write(argformat(format, arg)) def check_arg_type(errtype, arg, ty): if type(arg) is not ty: raise TypeError("%s: Type is not %s: %s" % (errtype, ty, arg)) ############################# # Abstract Syntax Trees # ############################# # Classes beginning with CS (= CCG Syntax) are used for constructing the # abstract syntax tree corresponding to a CCG source file. (An abstract # syntax tree, or AST, is a hierarchical representation of the syntax of a # piece of source code text, in this case a CCG-format file.) The source # file is made up out of blocks, each of which begins with an identifier # and is followed by one or more statements. # A CSNode corresponds to any unified section of source code -- a single # block or statement, a particular part of a statement (e.g. an # attribute-value list or a single attribute-value clause), or even the # whole file. The basic restriction is that it must correspond to a single # YACC production; hence it logically belongs in a unit and is the maximum # extent of text that belongs in the unit or possibly statement in a single # block. It has some corresponding source text with starting and ending # line numbers, a function to generate the XML, and a function to draw the # node. If the node is large enough to represent at least one XML # statement, it should be a list of XML statements in the XML-statement # form described above (a list [TAG, PROPLIST, CHILD ...]); otherwise, the # format is undefined, but most likely will be a property list. The CSNode # is initialized from the YaccProduction object (stored in variable `p', # usually, but accessed as $@) associated with a particular production, # which supplies the extent of source code associated with the production. class CSNode(object): def __init__(self, prod): self.prod = prod def xml(self): # In many cases, it's easiest just to build up the XML at creation time # and store it, rather than constructing it dynamically. Note that we # intentionally don't initialize self.static_xml, so we get an error # if it's not set. 
return self.static_xml # draw(self, parent, cfile, vars): Draw the node by returning a new widget # containing the drawn representation: Should be defined if node is # drawable. It should return a widget that is a child of PARENT, # also a widget. It is up to the caller to call pack() or grid() # so that the widget's geometry will be set; but the draw() function # should appropriately configure any child widgets that it creates. # VARS is an object containing Tkinter variables that may control the # way that the node is drawn. # A CSStatement corresponds to a single statement in a single block. Note # that, in the interests of simplicity, we don't currently create objects # for pieces of CCG code that are smaller than a statement; instead, we # just use the XML representation. We usually follow the convention that # if we have to make changes to the XML that make it not be in a one-to-one # correspondence with the original code, we do this at the level of the # statement or block. class CSStatement(CSNode): def __init__(self, prod): super(CSStatement, self).__init__(prod) # A CSBlock is a single block. class CSBlock(CSNode): def __init__(self, prod): super(CSBlock, self).__init__(prod) ############################# # Misc # ############################# # Is it identifier material? Input should be a character. def isalnumund(str): return str.isalnum() or str in '_+-' # Prior to Python 2.4, no sorted() def my_sorted(lyst): lystcopy = list(lyst) lystcopy.sort() return lystcopy ######################################################################## # Tokenizing # ######################################################################## # The following IDs have a special meaning to the OpenCCG tokenizer if a # token has the form [*ID*]. #magic_names = ('AMT', 'DATE', 'DUR', 'NUM', 'TIME') # It seems that the tokenizer does not require an [*ID*] token to be a known # magic thing (i.e. [*FOO*] is a legal surface form), so line 402 is commented # out. # If someone decides that [*ID*] tokens should be restricted to the ones # listed above, uncomment lines 397 and 503. # Directives -- These are particular words that are specially handled in # an appropriate position and hence need to be tokens for use in the # parser. However, they can also be part of a generic "word" -- in # other words, we have no "reserved words". directives = ( 'FAMILY', 'ENTRY', 'MEMBER', 'FEATURE', 'PROP', 'RULE', 'NO', 'APP', 'COMP', 'XCOMP', 'SUB', 'XSUB', 'TYPERAISE', 'TYPECHANGE', 'DEF', 'WORD', 'TESTBED', 'RELATION_SORTING' ) # Additional tokens that can form part of a word. A bare 'x' can form # part of a word as well, except for in a few circumstances. 
basic_word_no_x_tokens = ('ID', 'QUOTEDID') + directives word_no_x_tokens = ('NUMBER',) + basic_word_no_x_tokens word_no_number_tokens = ('X',) + basic_word_no_x_tokens word_tokens = ('NUMBER',) + word_no_number_tokens + ('MAGIC_ID',) bracket_tokens = ('LPAREN', 'RPAREN', 'LBRACKET', 'RBRACKET', 'LBRACE', 'RBRACE') other_tokens = ( # String tokens 'SLASH', 'BACKSLASH', 'LESS', 'GREATER', 'CARET', 'STAR', 'DOT', 'AT', 'EQUALS', 'GOESTO', 'PIPE', 'COMMA', 'SEMI', 'DOLLAR', 'COLON', 'BANG', 'TILDE', # Handled through t_ID 'PLUS', 'MINUS', 'PLUSMINUS', # Only in a def() 'NEWLINE', 'BOGUS_VALUE' # Kludge kludge kludge, fuck me harder ) tokens = word_tokens + bracket_tokens + other_tokens t_LPAREN = r'\(' t_RPAREN = r'\)' t_LBRACKET = r'\[' t_RBRACKET = r'\]' t_LBRACE = r'\{' t_RBRACE = r'\}' t_SLASH = r'/' t_BACKSLASH = r'\\' t_LESS = r'<' t_GREATER = r'>' t_CARET = r'\^' t_STAR = r'\*' t_DOT = r'\.' t_AT = r'@' t_EQUALS = r'=' t_GOESTO = r'=>' t_PIPE = r'\|' t_COMMA = r',' t_SEMI = r';' t_DOLLAR = r'\$' t_COLON = r':' t_BANG = r'!' t_TILDE = r'~' # Identifiers and directives directives_map = { } for r in directives: directives_map[r.lower().replace('_', '-')] = r directives_map['x'] = 'X'; # We handle +, -, and +- here because + and - can, in general, form part # of a token. directives_map['+'] = 'PLUS'; directives_map['-'] = 'MINUS'; directives_map['+-'] = 'PLUSMINUS'; def t_ID(t): r'''(([\-+%a-zA-Z_0-9]|[^\000-\177])+|"[^"\n]+"|\'[^'\n]+')''' # convert to directive, maybe if re.match(r'^\d+$', t.value): t.type = 'NUMBER' try: t.value = int(t.value) except ValueError: error(t.lineno, "Integer value too large: %s", t.value) t.value = 0 elif t.value in directives_map: t.type = directives_map[t.value] else: t.type = 'ID' # remove quotes if they're there if t.value[0] == '"' or t.value[0] == "'": t.type = 'QUOTEDID' t.value = t.value[1:-1] return t # The distinction from ordinary IDs is currently not really needed, i.e. t_ID # could to the job, too. However, it leaves open the possibility to handle # magic IDs differently from ordinary ones. def t_MAGIC_ID(t): r'''(\[\*[^*]+\*\])''' t.type = 'MAGIC_ID' return t #t_MAGIC_WORD.func_doc = '(\[\*(' + '|'.join(magic_names) + ')\*\])' t_ignore = " \t\r" #bracketmap = {'(': 'LPAREN', ')': 'RPAREN', # '[': 'LBRACKET', ']': 'RBRACKET', # '{': 'LBRACE', '}': 'RBRACE'} # #def t_LBRACKET(t): # r'[\[\(\{]' # global parendepth # parendepth += 1 # t.type = bracketmap[t.value] # return t # #def t_RBRACKET(t): # r'[\]\)\}]' # global parendepth # parendepth -= 1 # t.type = bracketmap[t.value] # return t def t_backslash_newline(t): r'\\\r?\n' t.lineno += 1 # If it's not a line continuation, it's just a normal backslash if not lexer_track_newlines: t.type = 'BACKSLASH' return t def t_newline(t): r'\n' t.lineno += 1 if lexer_track_newlines: t.type = 'NEWLINE' return t # Comments def t_comment(t): r'\#[^\n]*\n' t.lineno += 1 def t_error(t): error(t.lineno, "Illegal character '%s'", t.value[0]) t.skip(1) def init_lexer(): # This is a signal to us to go into "line mode", where we return a # newline as a token and treat backslash at the end of a line as a line # continuation device. global lexer_track_newlines lexer_track_newlines = 0 # Build the lexer. This does introspection, on all the t_*() functions. 
global globallexer globallexer = lex.lex(debug=lex_debug) def save_lexer(cur): cur.lexer_track_newlines = lexer_track_newlines cur.globallexer = globallexer ######################################################################## # Parsing # ######################################################################## def p_word(p): 'FILLED IN BELOW' p[0] = p[1] # fill in the documentation (i.e. the cfg rule) p_word.func_doc = 'word : ' + '\n| '.join(word_tokens) # hack, to deal with a reduce/reduce conflict def p_word_except_x(p): 'FILLED IN BELOW' p[0] = p[1] # fill in the documentation (i.e. the cfg rule) p_word_except_x.func_doc = 'word_except_x : ' + '\n| '.join(word_no_x_tokens) def p_word_no_numbers(p): 'FILLED IN BELOW' p[0] = p[1] # fill in the documentation (i.e. the cfg rule) p_word_no_numbers.func_doc = ( 'word_no_numbers : ' + '\n| '.join(word_no_number_tokens)) %y ############################# # Begin Yacc Declarations # ############################# ############################# # Word lists, etc # ############################# empty: : $$ = [] commas: COMMA : commas COMMA typed_word : word : word COLON word : $$ = '%s:%s' % ($1, $3) # Possibly empty list of words word_0: word : word commas word_list: word_0 * # Possibly empty list of words or * word_or_star: word | STAR word_or_star_0: word_or_star : word_or_star commas # Non-empty list of words #nonempty_word_list: word_0 : $$ = [$1] # : nonempty_word_list word_0 : $$ = $1 + [$2] # Attribute lists contain specifications of the form ATTR=VALUE. The # return value is a list of (attribute, value) tupes. attr : word EQUALS word : $$ = ($1, $3) attr_0: attr : attr commas attr_list: attr_0* opt_paren_attr_list: empty : LPAREN attr_list RPAREN : $$ = $2 # Extended attribute lists contain either VALUE or ATTR=VALUE. The return # value is a list of (attribute, value) tupes; when a bare value is given, # the attribute is None. ext_attr : word : $$ = (None, $1) : word EQUALS word : $$ = ($1, $3) ext_attr_0: ext_attr : ext_attr commas ext_attr_list: ext_attr_0* opt_paren_ext_attr_list: empty : LPAREN ext_attr_list RPAREN : $$ = $2 ############################# # Statements # ############################# top : statement* statement : family_block | feature_block | rule_block | macro_def | word_block | testbed_block | relation_sorting_block : SEMI ############################# # Macros # ############################# %p def init_macros(): # Used to turn off macro substitution while processing a macro definition. global no_macro_sub no_macro_sub = 0 # Needed to handle issue where macro def is immediately followed by # macro call. global return_bogus_value return_bogus_value = 0 # Mapping of macro definitions to parameter list and text. global macro_defs macro_defs = {} # It doesn't really matter what the parameter names are for built-ins. # There just have to be the right number of them. 
macro_defs['regsub'] = MacroDef(['fromre', 'totext', 'str'], regsub) macro_defs['ifmatch'] = MacroDef(['regex', 'string', 'doif', 'doelse'], ifmatch) macro_defs['ifmatch-nocase'] = MacroDef(['regex', 'string', 'doif', 'doelse'], ifmatch_nocase) def save_macros(cur): cur.no_macro_sub = no_macro_sub cur.return_bogus_value = return_bogus_value cur.macro_defs = macro_defs class MacroDef: def __init__(self, args, text): self.args = args self.text = text class CCGToken(lex.LexToken): def __init__(self, type, value): self.type = type self.value = value def arg_to_text(arg): return ''.join([str(x.value) for x in arg]) # Implementation of built-in 'regsub()': Concatenate the tokens into # text, then do regex substitution. def regsub(fromre, totext, string): return re.sub(arg_to_text(fromre), arg_to_text(totext), arg_to_text(string)) # If REGEX matches the beginning of STRING, return DOIF, else return DOELSE. def ifmatch(regex, string, doif, doelse): if re.match(arg_to_text(regex), arg_to_text(string)): return doif else: return doelse # Same as ifmatch() but case-insensitive. def ifmatch_nocase(regex, string, doif, doelse): if re.match(arg_to_text(regex), arg_to_text(string), re.IGNORECASE): return doif else: return doelse def print_macros(): for (key, value) in macro_defs.iteritems(): print "Macro: %s(%s): %s" % (key, value.args, value.text) # Given some text, expand the macros in it, recursively (i.e. apply # any macros, then apply macros to the resulting text, etc.). After # that, combine text that has the . operator applied to it. def macroexpand_text(text): if super_macro_debug: print "Text before expanding: %s" % arg_to_text(text) # Now recursively expand macros. The code to actually check for # macros is in MacroLexer. lexer = MacroLexer(None) lexer.pushstack(text) newtext = [] while True: tok = lexer.token() #print "Reading token: %s" % tok if not tok: break newtext.append(tok) text = newtext l = len(text) if super_macro_debug: print "Text after expanding: %s" % arg_to_text(text) # Now directly handle instances with the '.' operator, so that # the operator can be used to create new macro calls x = 1 while x < l - 1: if (text[x].type == 'DOT' and text[x-1].type in ['ID', 'QUOTEDID'] and text[x+1].type in ['ID', 'QUOTEDID']): tok = CCGToken(text[x-1].type, text[x-1].value + text[x+1].value) tok.lineno = text[x].lineno # If either is quoted, the result should be quoted. if text[x+1].type == 'QUOTEDID': tok.type = 'QUOTEDID' text[x-1] = tok text[x:x+2] = [] x -= 2 l -= 2 x += 1 return text # Return text of macro, with ARGS substituted for formal parameters of # the macro. def macrosub(macdef, args, lineno): text = macdef.text # If the text definition is a function (for builtins), # macro-expand the arguments, then call the function. if callable(text): args = [macroexpand_text(x) for x in args] text = text(*args) if type(text) is str: text = [CCGToken('QUOTEDID', text)] text[0].lineno = lineno return text else: return macroexpand_text(text) else: # Otherwise, make a copy of the text and substitute the arguments # into it. text = text[:] args = dict(zip(macdef.args, args)) l = len(text) x = 0 while x < l: if (text[x].type == 'ID' or text[x].type in directives) \ and text[x].value in args: newtext = args[text[x].value] text[x:x+1] = newtext l += len(newtext) - 1 x += len(newtext) - 1 x += 1 return macroexpand_text(text) %y # We need to do some hackery with BOGUS_VALUE in order to avoid problems # when a macro definition is immediately followed by a call to that same # macro. 
The problem is that generally the parser wants to read one token # ahead. As a result, by the time it's processed the token that ends a # macro definition, it's already read the following token -- and if that # token begins a macro call, we're screwed. To avoid this, we ensure that # there is an extra BOGUS_VALUE token returned after every macro definition. # To make this happen, we set a flag return_bogus_value just before the # parser processes the token ending the macro definition. At this point, # the parser has already read that token from the lexer, and before it # reduces that token, it reads the next token from the lexer -- which # returns a bogus token, as we instructed it. macro_def : macro_def_1 BOGUS_VALUE turn_off_macro_sub: : global no_macro_sub no_macro_sub = 1 return_bogus_value: : global return_bogus_value return_bogus_value = 1 global no_macro_sub no_macro_sub = 0 macro_def_1 : turn_off_macro_sub DEF word LPAREN macro_param_list turn_on_linetrack RPAREN macro_text : macdef = MacroDef($5, $8) macdef.args = $5 macdef.text = $8 if $3 in macro_defs: error($@.lineno(0), "Redefining macro %s" % $3) macro_defs[$3] = macdef #print_macros() macro_param_list : word_list macro_text: bracemacro_text | linemacro_text bracemacro_text: turn_off_linetrack LBRACE bracemacro_text_list return_bogus_value RBRACE: $$ = $3 bracemacro_text_list: empty : $$ = [] : bracemacro_text_list bracemacro_text_entry : $$ = $1 + $2 # The key thing about these is that they must be invoked BEFORE the # token that tells you whether to turn the mode on or off. If you # try to set the global variable after (even directly after) the # RPAREN or NEWLINE or whatever has been processed by a rule, it's too # late: The parser has already looked ahead, and any newline directly # following the token in question already processed the wrong way. turn_on_linetrack: : global lexer_track_newlines lexer_track_newlines = 1 turn_off_linetrack: : global lexer_track_newlines lexer_track_newlines = 0 %p def p_bracemacro_text_entry(p): '''bracemacro_text_entry : LPAREN bracemacro_text_list RPAREN | LBRACKET bracemacro_text_list RBRACKET | LBRACE bracemacro_text_list RBRACE''' p[0] = [p.slice[1]] + p[2] + [p.slice[3]] def p_bracemacro_text_entry_other(p): 'FILLED IN BELOW' p[0] = [p.slice[1]] # fill in the documentation (i.e. the cfg rule) p_bracemacro_text_entry_other.func_doc = ( 'bracemacro_text_entry : ' + '\n| '.join(other_tokens + word_tokens) ) %y linemacro_text: turn_off_linetrack return_bogus_value NEWLINE: $$ = [] linemacro_text: linemacro_begin linemacro_next* turn_off_linetrack return_bogus_value NEWLINE: $$ = [$1] + $2 %p def p_linemacro_begin(p): p[0] = p.slice[1] def p_linemacro_next(p): p[0] = p.slice[1] linemacro_begin_tokens = [x for x in tokens if x != 'NEWLINE' and x != 'LBRACE'] linemacro_next_tokens = [x for x in tokens if x != 'NEWLINE'] # fill in the documentation (i.e. the cfg rule) p_linemacro_begin.func_doc = ( 'linemacro_begin : ' + '\n| '.join(linemacro_begin_tokens) ) p_linemacro_next.func_doc = ( 'linemacro_next : ' + '\n| '.join(linemacro_next_tokens) ) %y ############################# # Feature blocks # ############################# %p def init_features(): # For each feature value, map its name to a CCGFeatval structure # describing it. global feature_values feature_values = {} # List of values for a particular feature; each value is a CCGFeatval, # listing a value, its parents, licensing info, macro info, and its # feature. 
global feature_to_values feature_to_values = {} # List of distributive features global distributive_features distributive_features = [] # List of XML for licensing features global licensing_feature_xml licensing_feature_xml = [] # Mapping of the names of feature values to the value inserted into the # XML 'val' attribute; usually the same as the name. (YUCK YUCK YUCK) global fv_names_to_values fv_names_to_values = {} def save_features(cur): cur.feature_values = feature_values cur.feature_to_values = feature_to_values cur.distributive_features = distributive_features cur.licensing_feature_xml = licensing_feature_xml cur.fv_names_to_values = fv_names_to_values # A feature value: The "name" of the feature value (corresponding to a # feature macro), the parents of this value, and any licensing info. Also # may include a .feature, which is the "feature" that this value is a value # for. class CCGFeatval: def __init__(self, name, parents, licensing): self.name = name self.parents = parents self.licensing = licensing def __str__(self): return "CCGFeatval(%s, parents=%s, licensing=%s)" % ( (self.name, self.parents, self.licensing)) def __repr__(self): return str(self) # Encapsulates directly obtained values and values obtained recursively, # so we can avoid needlessly adding parents to the latter kind. # Used temporarily when building the hierarchy. Both direct and recursive # are lists of CCGFeatvals. class CCGFeatvalList: def __init__(self, direct, recursive=None): recursive = recursive or [] # fuckme! self.direct = direct self.recursive = recursive def __str__(self): return "CCGFeatvalList(direct=%s,recursive=%s)" % (self.direct,self.recursive) def __repr__(self): return str(self) # For the given feature, and list of CCGFeatvals, convert the parents in # each CCGFeatval to a list of CCGFeatvals rather just strings, and clean # any excess parents. Basically, if a value has multiple parents and one # is reachable by following a path starting from another, it needs to be # removed. # FIXME!! Also output warnings when a featvar and featval have the same name. def install_feature(feature, lis, lineno): # Add names to reverse-feature list and check for duplicates. for x in lis: if x.name in feature_values: warning(lineno, "Duplicate feature value `%s' (feature `%s', previously in feature `%s')", x.name, feature, feature_values[x.name].feature) else: feature_values[x.name] = x x.feature = feature # Change the parents list of each value to point to actual featval objects # rather than just strings; check for unrecognized and duplicate values. for x in lis: newpar = [] for y in x.parents: if y in feature_values: if feature_values[y] in newpar: synerr("Duplicate feature value %s as parent of %s, feature %s", y, x.name, feature) else: newpar.append(feature_values[y]) else: synerr("Unrecognized feature value %s as parent of %s, feature %s", y, x.name, feature) x.parents = newpar # Check NODE and its parents to make sure it hasn't been seen before in # LIST, adding NODE to LIST as soon as it's seen. def check_cycles(node, list): if node in list: synerr("Cycle seen involving feature value %s", node.name) for x in node.parents: check_cycles(x, list + [node]) # Check for cycles. for x in lis: check_cycles(x, []) # Check NODE and its parents to make sure that ORIGNODE is not reachable. def check_reachable(node, orignode): if node == orignode: return True for x in node.parents: if check_reachable(x, orignode): return True return False # Clean excess parents. 
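# Worked illustration of this pruning step, as a self-contained sketch over
# plain strings instead of CCGFeatval objects (the value names are made up):
# suppose nom and acc each have parent case, and some value lists direct
# parents [nom, acc, case].  Then case is redundant, because it is already
# reachable from nom (and acc), and only nom and acc should be kept.

def _prune_redundant_parents_sketch(parents, parent_map):
    # parents: list of names; parent_map: name -> list of parent names
    def reachable(frm, target):
        if frm == target:
            return True
        return any(reachable(p, target) for p in parent_map.get(frm, []))
    kept = []
    for y in parents:
        # drop y if some *other* direct parent already reaches it
        if not any(z != y and reachable(z, y) for z in parents):
            kept.append(y)
    return kept

# _prune_redundant_parents_sketch(['nom', 'acc', 'case'],
#                                 {'nom': ['case'], 'acc': ['case']})
# => ['nom', 'acc']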
for x in lis: newpar = [] for y in x.parents: for z in x.parents: if z != y and check_reachable(z, y): break else: newpar.append(y) x.parents = newpar # Finally: Add to feature list. feature_to_values[feature] = lis # Return XML to go in types.xml. def make_feature_types_xml(): xml = [] for (x, featvals) in feature_to_values.iteritems(): # FIXME! Figure out what's going wrong here. # typename = x # print "fv_names_to_values: %s" % fv_names_to_values # if x in fv_names_to_values: # typename = fv_names_to_values[x] # xml += [['type', [('name', typename)]]] xml += [['type', [('name', x)]]] for y in featvals: if y.parents: xml += [['type', [('name', y.name), ('parents', ' '.join([z.name for z in y.parents]))]]] else: xml += [['type', [('name', y.name), ('parents', x)]]] return xml # Return XML to go in morph.xml. def make_feature_morph_xml(): xml = [] for x in my_sorted(feature_values): featval = feature_values[x] if featval.macrotie: entry = ['macro', [('name', '@%s' % x)]] for y in featval.macrotie: if type(y) is int: entry += [['fs', [('id', y)], ['feat', [('attr', featval.feature), ('val', fv_names_to_values[x])]]]] else: (wordtie, typename) = y entry += [['lf', [], ['satop', [('nomvar', wordtie)], ['diamond', [('mode', typename)], ['prop', [('name', fv_names_to_values[x])]]]]]] xml += [entry] return xml # Return XML to go in lexicon.xml. def make_feature_lexicon_xml(): xml = [] if distributive_features: xml.append(['distributive-features', [('attrs', ' '.join(distributive_features))]]) if licensing_feature_xml: xml.append(['licensing-features', []] + licensing_feature_xml) return xml %y # Allow you to override the value inserted by a feature macro, if you # really want to (requested by Fred). featval_2: word: $$ = ($1, $1) : word COLON word: $$ = ($1, $3) featval_1: featval_2 : $$ = $1 + ([], []) : featval_2 LBRACKET word_list RBRACKET : $$ = $1 + ($3, []) : featval_2 LPAREN attr_list RPAREN : $$ = $1 + ([], $3) : featval_2 LBRACKET word_list RBRACKET LPAREN attr_list RPAREN : $$ = $1 + ($3, $6) featval: featval_1: (name, value, parents, licensing) = $1 fv_names_to_values[name] = value $$ = CCGFeatval(name, parents, licensing) set_featval: featval: $$ = CCGFeatvalList([$1]) : featval LBRACE set_featval_list RBRACE: # The set_featval_list returns a CCGFeatvalList, where the direct entries # are those actually in the list itself, and the recursive entries # are descendants of them. First add ourself as parent to the direct # entries. Then move direct into recursive and put ourself as the only # direct entry. for x in $3.direct: x.parents += [$1.name] $$ = CCGFeatvalList([$1], $3.direct + $3.recursive) set_featval_0: set_featval : set_featval commas set_featval_list: set_featval_0 : set_featval_list set_featval_0: $1.direct += $2.direct; $1.recursive += $2.recursive; $$ = $1 featvar: NUMBER: $$ = int($1) : word_no_numbers: $$ = ($1, $1) : word_no_numbers COLON word: $$ = ($1, $3) featvar_0 : featvar : featvar commas opt_featspec: empty : $$ = (None, None) : LESS featvar_0+ GREATER : $$ = ($2, None) : LPAREN attr_list RPAREN : $$ = (None, $2) : LESS featvar_0+ GREATER LPAREN attr_list RPAREN: $$ = ($2, $5) opt_feature_bang: empty | BANG # We declare this in a slightly strange way to work around the awful bug # involving non-recognition of empty RHS rules. 
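# Hypothetical feature declaration and (roughly) the XML derived from it by the
# helpers above; the feature and value names are made up:
#
#   feature {
#     CASE<2>: case {nom acc};
#   }
#
# make_feature_types_xml() then emits, into types.xml:
#   <type name="CASE"/>
#   <type name="case" parents="CASE"/>
#   <type name="nom"  parents="case"/>
#   <type name="acc"  parents="case"/>
# and make_feature_morph_xml() emits macros such as
#   <macro name="@nom"><fs id="2"><feat attr="CASE" val="nom"/></fs></macro>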
feature_decl_tail: SEMI: $$ = [] : COLON set_featval_list SEMI: $$ = $2 feature_decl : opt_feature_bang word opt_featspec feature_decl_tail: if $1: global distributive_features distributive_features.append($2) if $4: feature_values = $4.direct + $4.recursive else: feature_values = [] install_feature($2, feature_values, $@.lineno(0)) (macrotie, licensing) = $3 # Add macro-tie info to each feature for x in feature_values: x.macrotie = macrotie # Handle licensing attributes on the feature values global licensing_feature_xml for x in feature_values: if x.licensing: licensing_feature_xml.append( ['feat', [('attr', $2), ('val', fv_names_to_values[x.name])] + x.licensing]) # Handle licensing attributes on the feature itself rather than # on a feature value if licensing: licensing_feature_xml.append(['feat', [('attr', $2)] + licensing]) feature_block : FEATURE LBRACE feature_decl* RBRACE ############################# # Atomic categories # ############################# unification_id: NUMBER: $$ = ('id', $1) : TILDE NUMBER: $$ = ('inheritsFrom', $2) unification_id_0: unification_id : unification_id commas unification_id_spec: LESS unification_id_0* GREATER: $$ = $2 atomcat_bracket_entry : word EQUALS word : $$ = ['feat', [('attr', $1), ('val', $3)]] atomcat_bracket_entry : word EQUALS STAR : $$ = ['feat', [('attr', $1), ('val', '[*DEFAULT*]')]] # FIXME!!!! Be more intelligent in determining how to separate nomvars # and featvars, instead of just using some isupper() hacks. We should # check to see if the nomvars are represented in the corresponding # hylo spec. We should also output warnings if a bare word occurs and # it is not identified anywhere as either a nomvar (should appear in hylo), # a featvar (should appear in feature {}), or a featval (likewise). atomcat_bracket_entry : word : if $1[0].isupper() and (len($1) == 1 or not $1[1].isupper()): $$ = ['feat', [('attr', 'index')], ['lf', [], ['nomvar', [('name', $1)]]]] elif $1 in feature_values: $$ = ['feat', [('attr', feature_values[$1].feature), ('val', $1)]] else: $$ = ['feat', [('attr', $1)], ['featvar', [('name', "%s" % $1.upper())]]] atomcat_bracket_entry : word COLON word: $$ = ['feat', [('attr', $1)], ['featvar', [('name', "%s:%s" % ($1.upper(), $3))]]] atomcat_bracket_entry_0: atomcat_bracket_entry : atomcat_bracket_entry commas atomcat_bracket : LBRACKET atomcat_bracket_entry_0* RBRACKET : $$ = $2 # The use of word_except_x here is a hack to avoid a reduce/reduce conflict # due to the use of x as an operator as well as a word. Without it, the # parser doesn't know, e.g., how to disambiguate something beginning # FOO/x(... -- is x an operator or a category? The parser only looks one # character ahead, so it can't figure this out. With this hack, you cannot # use a single lowercase x as a category name without putting it in quotes, # e.g. 'x'. atomcat : word_except_x unification_id_spec? atomcat_bracket? 
: $$ = ['atomcat', [('type', $1)], ['fs', $2] + $3] ############################# # Slashes # ############################# %p # Temporary switch to Python mode to insert the needed function slash_to_default_mode = {'/':'>', '\\':'<', '|':'.'} ability_to_ability_value = {'@': 'active', '!': 'inert', None:None} ability_value_to_ability = {'active': '@', 'inert': '!', None:None} def makeslash(direc, mode, ability): if not mode: mode = slash_to_default_mode[direc] if direc == '|': direc = None ability = ability_to_ability_value[ability] return ['slash'] + [(direc and [('dir', direc)] or []) + (ability and [('ability', ability)] or []) + (mode and [('mode', mode)] or [])] %y bareslash : SLASH | BACKSLASH | PIPE slash_ability : AT | BANG slash_mode : X GREATER : $$ = 'x>' : LESS X : $$ = '[E] \ np<2>[X nom] / np<3>[Y acc] # # # XML output: # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # ###################### # This is a basic attempt to create a compact (BNF-style) syntax for # representing legal XML for categories. The idea is that this could be # parsed and used to verify the XML, or perhaps to convert it to some other # form. It's not clear this is worth it -- there is already an XML schema # notation for describing legal XML (albeit it's extremely obnoxious and # verbose), and verifiers for verifying XML given a schema and a piece of # XML, and XSLT (again, obnoxiously verbose) for transforming XML. # category = ( atomcat | complexcat ) # complexcat { # atomcat # (slash (category | dollar) | dollar | setarg)+ # lf? # } # basicArg = ( slash | category ) # dollarArg = ( slash | dollar ) # dollar(name) # setarg { basicArg basicArg+ } # atomcat(!type=[NMTOKEN]) { # fs? # lf? # } # fs(id) { # (feat(attr='index') { # lf { nomvar(name) } } # | feat(attr, val) # )+ # } # slash(dir=('/', '\\', '|'), mode=('.', '*', '^', 'x', 'x<', ''), # varmodality, ability=('inert', 'active')) # fs(id): # ... %p ################## # Initialization # ################## # This maps face names to actual properties. A face is the complete # description for how a particular piece of text is to be displayed. The # properties can specify the font family, size, bold or not, italic or not, # subscript or not, "scale" (modify the size by the specified value), and # "inherit" to inherit from a specified face. If not otherwise given, # all faces inherit from the default. face_mapping_init = { # The default mapping should contain a value for all parameters 'default':{'family':'times', 'size':16, 'bold':False, 'italic':False, 'subscript':False, 'superscript':False, 'scale':100, 'foreground':None, 'background':None}, 'subscript':{'family':'helvetica', 'subscript':True, 'scale':70, }, 'category':{'bold':True, 'family':'helvetica'}, 'dollar':{}, 'slash':{}, 'slash mode':{'subscript':True, 'scale':65}, 'paren':{}, 'brace':{}, 'family name':{'foreground':'blue', 'scale':130, 'bold':True}, 'lexical item':{'italic':True}, 'numeric index':{'inherit':'subscript'}, 'subscript comma':{'inherit':'subscript'}, 'nomvar':{'inherit':'subscript', 'foreground':'forest green'}, 'feature':{'inherit':'subscript', 'bold':True, 'foreground':'red'}, # Ideally the following should be in small caps 'caret':{'scale':115}, 'semname':{'scale':115}, 'semrole':{'scale':85, 'bold':True}, 'member heading':{}, 'member comma':{}, 'member':{'bold':True}, } # Offset to be applied to the baseline to handle subscripts and superscripts, # relative to the size of the font of the subscripted/superscripted text. 
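# Worked example of the face tables above (following face_mapping_init and the
# fixup/merge helpers defined below): the 'nomvar' face inherits from
# 'subscript', which inherits from 'default', so after merging it carries
#   family='helvetica', subscript=True, scale=70, foreground='forest green'
# plus the remaining defaults; fixup_face_properties() then folds scale into
# size, size = int(16 * 70 / 100.0 + 0.5) = 11, and builds the Tk font from
# the family/size/bold/italic values.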
# FIXME: Maybe should be relative to the larger size of the base (non-offset) # text. This would require that the 'scale' option not be handled in # fixup_face_properties() but dealt with at the time that the offset is # computed, so that the original text size is still available. subscript_offset = -0.5 superscript_offset = 1 # Factor to scale all: FIXME, not currently working zoom_factor = 100 # Merged and fixed up equivalent of the above. This will also have a # 'font' property containing the Tk font item corresponding to the # family, size, bold, and italic properties. face_mapping = {} # Fix up a derived table of properties. Currently this only handles 'scale'. # This destructively modifies the property table. def fixup_face_properties(props): if 'scale' in props: scale=props['scale'] del props['scale'] assert 'size' in props # Consider the size to its scaling factor props['size'] = int(props['size'] * scale / 100.0 + 0.5) family = props['family'] size = props['size'] weight = props['bold'] and 'bold' or 'normal' slant = props['italic'] and 'italic' or 'roman' props['font'] = tkFont.Font(family=family, size=size, weight=weight, slant=slant) return props # Merge two tables of properties, with P2 overriding P1. Remove the # 'inherit' property in the process. Creates a new table, and does not # modify P1 or P2. def merge_face_properties(p1, p2): props = {} for x in p1: if x != 'inherit': props[x] = p1[x] for x in p2: if x != 'inherit': props[x] = p2[x] return props # Derive the complete list of properties associated with a face name. def face_properties(name): props = face_mapping_init[name] # If name is default, return properties directly if name == 'default': return merge_face_properties(props, {}) # Else, determine where to inherit from and merge properties with # recursively computed value if 'inherit' in props: inherit = props['inherit'] else: inherit = 'default' return merge_face_properties(face_properties(inherit), props) # Compute the merged properties for all faces. def compute_face_properties(): for x in face_mapping_init: props = fixup_face_properties(face_properties(x)) face_mapping[x] = props def late_init_draw_once(): compute_face_properties() ################################# # Drawing a section of text # ################################# # Create tags in a text widget corresponding to the faces and their # properties. FIXME: Maybe we should do this only when needed, for each # text widget. def create_tags(text): for x in face_mapping: props = face_mapping[x] offs = 0 if props['subscript']: offs = subscript_offset elif props['superscript']: offs = superscript_offset offs = offs*props['size'] offs = '%sp' % offs # Dimension in points text.tag_config(x, font=props['font'], offset=offs) fg = props['foreground'] bg = props['background'] if fg: text.tag_config(x, foreground=fg) if bg: text.tag_config(x, background=bg) # A "draw-into" object, used for incrementally building up some text # in various fonts. Initialized with a parent widget and some initial text. # Drawing into it is done by calls to text(). When done, call finish() # to return a widget containing the text (which can then be packed, gridded, # etc.). 
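# A minimal usage sketch (hypothetical; assumes a Tk root window exists and the
# face tags have been computed via late_init_graphics()):
#
#   d = draw_into(some_frame, width=40)
#   d.text('np', 'category')
#   d.text('3', 'numeric index')    # rendered as a subscript-style index
#   w = d.finish()                  # returns a read-only Text widget
#   w.pack(side=LEFT)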
class draw_into(object): def __init__(self, master, width=120): self.wid = Text(master, height=3, width=width, borderwidth=0, relief=FLAT, background='white') self.curface = None self.wid.slash_image = [] self.curtext = '' create_tags(self.wid) # Self.alltext maintains the length of the text printed # for the current widget self.alltext = 0 # FIXME: the tirgger for bigger height of the Text # widget is arbitrarily set to 95. This should be # driven by width of individual fonts and chars self.expandTrigger = 95 def finish_run(self): if self.curtext: self.wid.insert(INSERT, self.curtext, (self.curface,)) #props = face_mapping[self.curface] #Label(self.wid, text=self.curtext, # font=props['font']).pack(side=LEFT) self.curtext = '' def text(self, tex, face='default'): if self.curface == face: self.curtext += tex else: self.finish_run() self.curtext = tex self.curface = face # Increase recorded length of text self.alltext += len(tex) # Increase height if necessary if (self.alltext > self.expandTrigger): heightval = 3* (self.alltext/self.expandTrigger +1) self.wid.config(height= heightval) def finish(self): self.finish_run() self.wid.config(state=DISABLED) return self.wid def image(self, img): # When there is an image to be embedded self.finish_run() # Access the OPENCCG_HOME environment variable # to determine the correct path for the images openccg_home = os.environ['OPENCCG_HOME'] gifdir = openccg_home+"/images/slashes/" image = PhotoImage(file=gifdir+img) # We are creating an instantiated variable here # for the image, because the actual photo object is destroyed once # the execution leaves the __init__ code. Without building it this way, # the display was showing only a space for the image but not the image itself self.wid.slash_image += [image] self.wid.image_create(INSERT, image=image) def onHilite(self): self.wid.config(bg = '#E9FFE3') def offHilite(self): self.wid.config(bg = 'white') def category_draw_children(into, chils, depth, vars, need_initial_comma=False, sep='', sepface='default'): for x in chils: if sep and need_initial_comma: into.text(sep, sepface) need_initial_comma = True category_draw(into, x, depth=depth + 1, vars=vars) # Given the XML for a category, draw a graphical representation into the # widget INTO. 
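# The XML argument uses the nested-list convention found throughout this file,
# ['tag', [(attr, value), ...], child, child, ...].  For instance, a
# hypothetical category np<2>[num=sg] arrives here roughly as
#
#   ['atomcat', [('type', 'np')],
#     ['fs', [('id', 2)],
#       ['feat', [('attr', 'num'), ('val', 'sg')]]]]
#
# and category_draw() walks that structure, calling into.text()/into.image()
# for each piece.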
The drawing is done by calling into.text(TEXT, FACE) def category_draw(into, xml, depth, vars): ty = xml[0] props = xml[1] chils = xml[2:] if ty == 'complexcat': if depth > 0: into.text('(', 'paren') category_draw_children(into, chils, depth, vars) if depth > 0: into.text(')', 'paren') elif ty == 'atomcat': into.text(getprop('type', props), 'category') category_draw_children(into, chils, depth, vars) elif ty == 'setarg': into.text('{', 'brace') category_draw_children(into, chils, depth, vars) into.text('}', 'brace') elif ty == 'fs': needcomma = False if vars.show_feat_id.get(): idval = getoptprop('id', props) if idval: #into.text('<%s>' % idval, 'numeric index') into.text('%s' % idval, 'numeric index') needcomma = True if vars.show_feat_struct.get(): category_draw_children(into, chils, depth, vars, need_initial_comma=needcomma, sep=',', sepface='subscript comma') elif ty == 'feat': attr = getprop('attr', props) if attr == 'index': assert len(chils) == 1 assert chils[0][0] == 'lf' chils = chils[0][2:] assert len(chils) == 1 assert chils[0][0] == 'nomvar' into.text(getprop('name', chils[0][1]), 'nomvar') else: val = getoptprop('val', props, None) if val: if vars.show_full_features.get(): into.text("%s=%s" % (attr, getprop('val', props)), 'feature') else: into.text("%s" % getprop('val', props), 'feature') else: into.text("%s" % attr, 'feature') elif ty == 'slash': dir = getoptprop('dir', props, '|') mode = getoptprop('mode', props) ability = getoptprop('ability', props) # into.text('%s' % dir, 'slash') # into.text('%s%s' % (mode or '', # ability_to_ability_value[ability] or ''), # 'slash mode') # We create the file name here # By interpreting various parameters # and joiing them together as a string if dir == '\\': slash_string = 'bk' elif dir == '/': slash_string = 'fd' else: slash_string = 'str' #slash_mode : X GREATER : $$ = 'x>' # : LESS X : $$ = '':'cross_greater', '':'greater', '<':'lesser', 'x':'cross', '.':'dot', '*':'star', '^':'box'} if mode == None: image_string = slash_string + '.GIF' else: image_string = slash_string+ '_' + modelist[mode] + '.GIF' into.image(image_string) elif ty == 'dollar': name = getoptprop('name', props) into.text('$', 'dollar') into.text('%s' % name, 'numeric index') else: # Have commented the following assert Statement # and the debug statement # Because of validation errors #debug('ty??? 
%s\n' % ty) #assert False dummy = 1 %y complexcat_entry : atomcat : LPAREN complexcat RPAREN : $$ = $2 complexcat_postmod : DOLLAR NUMBER : $$ = [['slash', []], ['dollar', [('name', $2)]]] complexcat_postmod : slash complexcat_entry : $$ = [$1, $2] #complexcat_postmod : slash DOLLAR NUMBER : # $$ = [$1, ['dollar', [('name', $3)]]] complexcat : complexcat_entry : complexcat complexcat_postmod : if $1[0] != 'complexcat': $1 = ['complexcat', []] + [$1] $$ = $1 + $2 cat_set_entry : slash complexcat_entry : $$ = [$1, $2] cat_set_entry_0: cat_set_entry : cat_set_entry commas complexcat : complexcat LBRACE cat_set_entry_0+ RBRACE : # $3 comes as a list of lists of the form [slash, cat]; we need to # flatten the list, which is what reduce() does if $1[0] != 'complexcat': $1 = ['complexcat', []] + [$1] $$ = $1 + [['setarg', []] + reduce(lambda x,y:x+y, $3)] ############################# # Hybrid logic # ############################# # Example: # # Source: # # E:action(* X:animate-being Y:sem-obj) # # XML output: # # # # # # # # # # # # %p def hylo_draw_children(into, chils, need_initial_comma=False, sep='', sepface='caret'): for x in chils: if sep and need_initial_comma: into.text(sep, sepface) need_initial_comma = True hylo_draw(into, x) # Given the XML for a hylo, draw a graphical representation into the # widget INTO. The drawing is done by calling into.text(TEXT, FACE) def hylo_draw(into, xml): ty = xml[0] props = xml[1] chils = xml[2:] if ty == 'satop': into.text('@') into.text(getprop('nomvar', props), 'nomvar') into.text('(') hylo_draw_children(into, chils, sep=' ^ ') into.text(')') elif ty == 'prop': name = getprop('name', props) if name == '[*DEFAULT*]': into.text('*', 'semname') else: into.text(name, 'semname') elif ty == 'diamond': mode = getprop('mode', props) # FIXME: Instead of uppercasing, we really want small caps into.text('<%s>' % mode.upper(), 'semrole') assert len(chils) > 0 if len(chils) == 1: hylo_draw(into, chils[0]) else: into.text('(') hylo_draw_children(into, chils, sep='^') into.text(')') elif ty == 'nomvar': into.text(getprop('name', props), 'nomvar') else: assert False %y hylo_entry : STAR : $$ = ['prop', [('name', '[*DEFAULT*]')]] : typed_word : if $1[0].isupper(): $$ = ['nomvar', [('name', $1)]] else: $$ = ['prop', [('name', $1)]] hylo_entry : LESS word GREATER hylo_entry : $$ = ['diamond', [('mode', $2)], $4] hylo_entry : LESS word GREATER LPAREN hylo_list RPAREN : $$ = ['diamond', [('mode', $2)]] + $5 carets : CARET : carets CARET hylo_entry_0: hylo_entry : hylo_entry carets hylo_list : empty : hylo_list hylo_entry_0 : $$ = $1 + [$2] hylo_list_0 : hylo_list : carets: $$ = [] : carets hylo_list: $$ = $2 hylo_spec : typed_word LPAREN hylo_list_0 RPAREN : $$ = ['satop', [('nomvar', $1)]] + $3 hybrid_logic : hylo_spec* : AT hylo_spec*: $$ = $2 ############################# # Words # ############################# %p def init_morphology(): global morph_xml morph_xml = [] # List families/parts-of-speech of a word. This comes from the # families/parts-of-speech specified in a word {} declaration; hence we # can't really tell families from POS's. This also comes from any # member declarations inside of a family. global word_to_family_pos word_to_family_pos = {} # List word members of a family/part-of-speech; more or less the # inverse of the previous hash. (Not a perfect inverse because it # doesn't currently list any members that come from a member # declaration inside of a family, but only from word {} declarations.) 
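# Hypothetical hybrid-logic annotation and the list form the hylo_* rules above
# build for it (the nominal and role names are made up):
#
#   E:action(* <Actor>X ^ <Patient>Y)
#
#   ['satop', [('nomvar', 'E:action')],
#     ['prop',    [('name', '[*DEFAULT*]')]],
#     ['diamond', [('mode', 'Actor')],   ['nomvar', [('name', 'X')]]],
#     ['diamond', [('mode', 'Patient')], ['nomvar', [('name', 'Y')]]]]
#
# hylo_draw() renders this as @E:action(* ^ <ACTOR>X ^ <PATIENT>Y).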
global family_pos_to_word family_pos_to_word = {} # word->predicate mapping; this comes from pred=foo declarations in the # properties of a word. This is needed because this info must be added # to tags in a family. global word_to_predicate word_to_predicate = {} # Mapping of families to parts-of-speech; comes from family {} # declarations. global family_to_pos family_to_pos = {} # Contains a key for each part-of-speech seen in a family {} # declaration. global pos_hash pos_hash = {} # (XML for) list of word members explicitly specified using a member # statement. global family_members family_members = {} def save_morphology(cur): cur.morph_xml = morph_xml cur.word_to_family_pos = word_to_family_pos cur.family_pos_to_word = family_pos_to_word cur.word_to_predicate = word_to_predicate cur.family_to_pos = family_to_pos cur.pos_hash = pos_hash cur.family_members = family_members # Assume that hash[key] is a list, add VALUE to the list if not already there. def add_uniquely_to_hash_entry_list(hash, key, value): if key not in hash: hash[key] = [] if value not in hash[key]: hash[key] += [value] def note_family_member(word, families): for x in families: add_uniquely_to_hash_entry_list(word_to_family_pos, word, x) add_uniquely_to_hash_entry_list(family_pos_to_word, x, word) def make_word_morph_xml(): xml = [] for x in morph_xml: word_pos_list = [] word = getprop('stem', x[1]) # Each word needs to be listed as many times as it has parts of # speech. We collect together all families and POS's associated # with a word, either from word {} or member declarations, # and determine all POS's from them. for y in word_to_family_pos.get(word, []): if y in family_to_pos: pos = family_to_pos[y] elif y in pos_hash: pos = y else: error(None, 'Family/part-of-speech %s not found (word declaration %s)', y, word) if pos not in word_pos_list: word_pos_list += [pos] for y in word_pos_list: # Make a copy of the word's XML and set the POS appropriately. entry = x[:] putprop('pos', y, entry[1]) xml += [entry] return xml %y word_param: word_list: $$ = ($1, []) : word_list LPAREN ext_attr_list RPAREN: # WORD(VALUE) is equivalent to WORD(class=VALUE). property_name_replace(None, 'class', $3) $$ = ($1, $3) word_spec_1: WORD word COLON word_param: (families, params) = $4 note_family_member($2, families) pred = getoptprop('pred', params) if pred: word_to_predicate[$2] = pred $$ = ($2, [('pos', None), ('stem', $2)] + params) word_spec_1: WORD word COLON: $$ = ($2, [('pos', None), ('stem', $2)]) word_spec: WORD word: $$ = ($2, [('pos', None), ('stem', $2)]) : word_spec_1 word_block: word_spec SEMI: (word, params) = $1 morph_xml.append(['entry', [('word', word)] + params]) word_block: word_spec_1 COLON word_macros SEMI: (word, params) = $1 morph_xml.append(['entry', [('word', word)] + $3 + params]) word_macros: word_list: $$ = [('macros', ' '.join(['@%s' % x for x in $1]))] word_form: word_or_star SEMI: $$ = ($1, []) word_form: word_or_star COLON word_macros SEMI: $$ = ($1, $3) word_forms: : $$ = [] : word_forms word_form : $$ = $1 + [$2] word_block: word_spec LBRACE word_forms RBRACE: (word, params) = $1 for (form, macros) in $3: if form == '*': form = word morph_xml.append(['entry', [('word', form)] + macros + params]) ############################# # Family blocks # ############################# %p def init_lexicon(): global lexicon_xml lexicon_xml = [] def save_lexicon(cur): cur.lexicon_xml = lexicon_xml # lexicon_xml already contains XML for each family and its entries (i.e. # lexical insertion rules). 
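# Hypothetical illustration of that bookkeeping (names made up; assumes a
# family TransVerb declared elsewhere with pos V):
#
#   word bought: TransVerb(pred=buy);
#
# records word_to_family_pos['bought'] = ['TransVerb'] and
# word_to_predicate['bought'] = 'buy', yields a morph.xml entry roughly like
#   <entry word="bought" pos="V" stem="bought" pred="buy"/>
# and make_family_lexicon_xml() below adds
#   <member stem="bought" pred="buy"/>
# to the (closed) TransVerb family in lexicon.xml.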
We also need to add to each family the words # that are members of the family -- these come from both word {} # declarations and member statements. def make_family_lexicon_xml(): for x in lexicon_xml: # Make sure that open families don't have member entries, or otherwise # [*DATE*], [*NUM*], etc. won't work. closed = getprop('closed', x[1]) if closed == 'false': continue name = getprop('name', x[1]) words_seen = [] # Add each stem explicitly given in a member statement. The # predicate comes from any predicate given in the member statement # along with the stem, or from the word {} declaration as a backup. for y in family_members[name]: stem = getprop('stem', y[1]) words_seen += [stem] pred = getoptprop('pred', y[1]) if not pred: pred = word_to_predicate.get(stem, None) x += [['member', [('stem', stem)] + (pred and [('pred', pred)] or [])]] # Add each stem that specifies that it belongs to this family, # unless we already added it. for y in family_pos_to_word.get(name, []): if y not in words_seen: words_seen += [y] pred = word_to_predicate.get(y, None) x += [['member', [('stem', y)] + (pred and [('pred', pred)] or [])]] return lexicon_xml # A CSFamily is a `family {}' block. class CSFamily(CSBlock): def __init__(self, prod, name, props, statements): super(CSFamily, self).__init__(prod) self.name = name self.props = props self.statements = statements self.text = None self.homeButton = None self.btnFrame = None self.menuHolder = None self.childFrame = None self.cfile = None self.cf = None self.vars = None self.canvas = None self.mainFrame = None def draw(self, childFrame, cfile, vars, row, canvas, mainFrame): # Draw the family name f = Frame(childFrame, bd=1, relief=SUNKEN, background='white') cf = draw_into(f, width=20) cf.text('%s' % self.name, 'family name') child_widget=cf.finish() self.menuHolder = child_widget child_widget.pack(fill=BOTH, expand=YES) child_widget.bind("", self.editPopup) self.childFrame = childFrame self.cfile = cfile self.cf = cf self.vars = vars self.canvas = canvas self.mainFrame = mainFrame f.grid(row=row, column=0, sticky=NSEW) # Draw the various statements f = Frame(childFrame, bd=1, relief=SUNKEN, background='white') for x in self.statements: frame = x.draw(f, cfile, vars) if frame: frame.pack(fill=BOTH, expand=YES) f.grid(row=row, column=1, sticky=NSEW) childFrame.rowconfigure(row, weight=1) # Define the binding procedure for the right-click for editing an entry def editPopup(self, event): popup = Menu(self.menuHolder, tearoff =0) popup.add_command(label=' Edit ', command = lambda: self.editSection(self.childFrame, self.cfile, self.cf, self.vars, self.canvas, self.mainFrame)) try: popup.tk_popup(event.x_root+40, event.y_root, 0) finally: popup.grab_release() # Now bind the right-click to the saveSection buttons self.menuHolder.bind("", self.savePopup) # Define the right click binding for the save entry def savePopup(self, event): popup = Menu(self.menuHolder, tearoff = 0) popup.add_command(label = 'Done', command = lambda: self.saveSection(self.childFrame, self.cfile, self.cf, self.vars, self.canvas, self.mainFrame)) popup.add_command(label = 'Home', command = lambda: self.editHome(self.cfile)) fileData = self.cfile.getAllText() popup.add_command(label = 'Undo All', command = lambda: self.undoEdit(fileData, self.cfile)) try: popup.tk_popup (event.x_root+40, event.y_root, 0) finally: popup.grab_release() # Edit a section, i.e. 
a family of the grammar individually rather than the entire grammar # Note that this will have very preliminary editing capabilities and the complete grammar # editing should be done through the Edit global view def editSection(self, childFrame, cfile, hiliteText, vars, canvas, mainFrame): editFrame = Frame(mainFrame, bd=1, background='white') self.text = Text(editFrame, padx=5, wrap=None, undo = YES, background='white', height =10) vbar = Scrollbar(editFrame) hbar = Scrollbar(editFrame, orient='horizontal') self.text.config(yscrollcommand=vbar.set) # call vbar.set on text move self.text.config(xscrollcommand=hbar.set) vbar.config(command=self.text.yview) # call text.yview on scroll move hbar.config(command=self.text.xview) # or hbar['command']=text.xview # Changing the mode of the cfile object here, # so that once the uer clicks done, # the whole object is recompiled and redisplayed cfile.mode= 'Edit' # Highlight the row being edited hiliteText.onHilite() vbar.pack(side=RIGHT, fill=Y) hbar.pack(side=BOTTOM, fill=X) self.text.pack(fill= BOTH, expand= YES) # Set a mark at the beginning of the text self.text.mark_set("START", INSERT) self.text.mark_gravity("START", LEFT) # Push in the rest of the file's contents fileData = cfile.getAllText() self.text.insert(INSERT, fileData) # Move the insert position to the first occurence of the family name # FIXME: this is poor implementation # The positioning of the insert cursor should be happening by parsing the # CFG production rules, using CSFamily.prod.lineno and endlineno self.text.config(takefocus=True) idx= self.text.search('family '+ self.name, "START") self.text.mark_set(CURRENT, idx) self.text.see(CURRENT) #editFrame.grid(row=row+1, columnspan =3, sticky = NSEW) editFrame.grid(row=2, columnspan =2, sticky = NSEW) childFrame.update_idletasks() canvas.config(scrollregion=canvas.bbox("all")) # Finished editing #def saveSection(self, childFrame, cfile, hiliteText, varset, canvas, mainFrame, homeButton, undoButton): def saveSection(self, childFrame, cfile, hiliteText, varset, canvas, mainFrame): # We force the text contents of the cfile object to copy over # all that is presently in the current text-box cfile.setAllText(self.text.get(1.0,END)) # Undo the highlight of the row hiliteText.offHilite() # Recompile whatever was edited and redisplay # Note: changes are not saved hereby!! cfile.compile_if_needed() cfile.onLexicon() # Restore the right-click binding to the original self.menuHolder.bind("", self.editPopup) # Restore view to original place where you wanted to edit def editHome(self, cfile): # Move the insert position to the first occurence of the family name # FIXME: this is poor implementation # The positioning of the insert cursor should be happening by parsing the # CFG production rules, using CSFamily.prod.lineno and endlineno self.text.config(takefocus=True) idx= self.text.search('family '+ self.name, "START") if not idx: showwarning('Error', 'Original entry for '+self.name+ ' not found!') self.text.mark_set(CURRENT, idx) self.text.see(CURRENT) # Undo all editing done till now def undoEdit(self, fileData, cfile): askqn = askokcancel('Warning','Undo all changes till now?') if askqn: self.text.delete("START", END) self.text.insert(CURRENT, fileData) self.editHome(cfile) # CSFamilyEntry is an `entry' statement inside a `family' block. 
# # PROPS is a property list corresponding to the entry's name ('name') and # any other properties, deriving from the form # # entry NAME(PROP=VAL, ...): # # Either the name or properties, or both, may be omitted. # # CAT is the XML corresponding to the entry's category, and LF is the XML for # the logical form (hybrid logic). class CSFamilyEntry(CSStatement): def __init__(self, prod, props, cat, lf=None): super(CSFamilyEntry, self).__init__(prod) self.props = props # NOTE: self.cat is a single XML statement, but self.lf is a list # of XML statements. FIXME. self.cat = cat self.lf = lf def xml(self): if self.lf: lf = [['lf', []] + self.lf] else: lf = [] return [['entry', self.props, self.cat + lf]] def draw(self, parent, cfile, vars): name = getoptprop('name', self.props) f = Frame(parent, background='white') cf = draw_into(f) cf.text(' ') if name: cf.text('%s: ' % name) category_draw(cf, self.cat, depth=0, vars=vars) if self.lf and vars.show_semantics.get(): cf.text(' : ') hylo_draw_children(cf, self.lf) cf.finish().pack(fill=BOTH, expand=YES, side=LEFT) return f # CSFamilyMember is a `member' statement inside a `family' block. ITEMS # lists the items given, in property-list form: # # STEM --> [('stem', STEM)] # STEM(PRED) --> [('stem', STEM), ('pred', PRED)] class CSFamilyMember(CSStatement): def __init__(self, prod, items): super(CSFamilyMember, self).__init__(prod) self.items = items def xml(self): return [['member', x] for x in self.items] def draw(self, parent, cfile, vars): return None f = Frame(parent, background='white', bd=1, relief=SUNKEN) cf = draw_into(f) cf.text('Members: ', 'member heading') first = True for x in self.items: stem = getprop('stem', x) pred = getoptprop('pred', x) if not first: cf.text(', ', 'member comma') cf.text(' %s%s' % (stem, pred and "(pred=%s)" % pred or ''), 'member') first = False print len (self.items) cf.finish().pack(fill=BOTH, expand=YES) return f %y # Omitting the colon between entry category and hybrid logic doesn't # actually cause parsing problems, but it's probably not a good idea to # encourage this, because the syntax might change in the future. entry_name_1: opt_paren_attr_list # We shouldn't need the first entry below, but we do, due to the bugginess # in PLY in handling empty rules. entry_name: word: $$ = [('name', $1)] : word entry_name_1: $$ = [('name', $1)] + $2 : entry_name_1 entry : ENTRY entry_name COLON complexcat COLON hybrid_logic SEMI : $$ = CSFamilyEntry($@, props=$2, cat=$4, lf=$6) entry : ENTRY entry_name COLON complexcat SEMI : $$ = CSFamilyEntry($@, props=$2, cat=$4) member_entry : word : $$ = [('stem', $1)] member_entry : word LPAREN word RPAREN : $$ = [('stem', $1), ('pred', $3)] member_entry_0: member_entry : member_entry commas member : MEMBER COLON member_entry_0+ SEMI : $$ = CSFamilyMember($@, items=$3) family_statement : member | entry family_statement_list : empty : family_statement_list family_statement : $$ = $1 + [$2] family_block : FAMILY word opt_paren_ext_attr_list LBRACE family_statement_list RBRACE : # FAMILY(VALUE) is equivalent to FAMILY(pos=VALUE). property_name_replace(None, 'pos', $3) # Create the AST object -- before adding to $3. $$ = CSFamily($@, name=$2, props=$3, statements=$5) # 'pos' (part of speech) defaults to the family name; they would only # differ when more than one family is used to define a particular part of # speech, to handle related characteristics (family Prep-Nom vs. pos Prep). if not property_specified('pos', $3): $3 += [('pos', $2)] # Store mappings related to POS. 
pos = getprop('pos', $3) family_to_pos[$2] = pos pos_hash[pos] = True # Now construct the XML for the family xml = ['family', [('name', $2)] + $3] family_members[$2] = [] for x in $5: if type(x) is CSFamilyMember: family_members[$2].extend(x.xml()) else: xml.extend(x.xml()) # If members have been specified ('member' statements) and there is no # 'closed' property, make the family closed. # if family_members[$2] and not property_specified('closed', xml[1]): # xml[1] += [('closed', 'true')] # Actually, we *always* need classes closed, due to a bizarreness in # OpenCCG. if not property_specified('closed', xml[1]): xml[1] += [('closed', 'true')] # Add names to entries ('entry' statements) without them. primcount = 0 for x in xml[2:]: if not property_specified('name', x[1]): primcount += 1 x[1] = [('name', 'Entry-%s' % primcount)] + x[1] # For each specified member, note the family it's in so that its part # of speech can be calculated. for x in family_members[$2]: add_uniquely_to_hash_entry_list(word_to_family_pos, getprop('stem', x[1]), $2) lexicon_xml.append(xml) $$.static_xml = [xml] ############################# # Rule blocks # ############################# %p def init_rules(): global rules rules = { ('app', '+') : True, ('app', '-') : True, ('comp', '+') : True, ('comp', '-') : True, ('xcomp', '+') : True, ('xcomp', '-') : True, ('sub', '+') : False, ('sub', '-') : False, ('xsub', '+') : False, ('xsub', '-') : False, ('typeraise', '+') : [(False, True, True)], ('typeraise', '-') : [(True, True, True)], 'typechange' : [], } global rules_to_xml_mapping rules_to_xml_mapping = { 'app' : ['application', []], 'comp' : ['composition', [('harmonic', 'true')]], 'xcomp' : ['composition', [('harmonic', 'false')]], 'sub' : ['substitution', [('harmonic', 'true')]], 'xsub' : ['substitution', [('harmonic', 'false')]], } def save_rules(cur): cur.rules = rules cur.rules_to_xml_mapping = rules_to_xml_mapping def make_rules_xml(): xml = [] unique = 0 for (key, value) in my_sorted(rules.items()): if type(key) is tuple and key[0] in rules_to_xml_mapping: rx = copy.deepcopy(rules_to_xml_mapping[key[0]]) rx[1] += [('dir', key[1] == '+' and 'forward' or 'backward')] xml.append(rx) elif type(key) is tuple and key[0] == 'typeraise': for (dollar, arg, result) in value: xml.append(['typeraising', [('dir', key[1] == '+' and 'forward' or 'backward'), ('useDollar', dollar and 'true' or 'false')]] + (arg != True and [['arg', [], arg]] or []) + (result != True and [['result', [], result]] or [])) elif key == 'typechange': for (arg, result, lf) in value: unique += 1 if lf: lf = [['lf', []] + lf] else: lf = [] xml.append(['typechanging', [('name', 'typechange-%d' % unique)], ['arg', [], arg], ['result', [], result + lf]]) else: raise InternalError("Invalid element in rules hash: %s" % str(key)) return xml def dotyperaise(plusminus, dollarp, arg, result): if plusminus == '+' or plusminus == '+-': rules[('typeraise', '+')] += [(dollarp, arg, result)] if plusminus == '-' or plusminus == '+-': rules[('typeraise', '-')] += [(dollarp, arg, result)] def rulesreinit(): rules.clear() rules[('typeraise', '+')] = [] rules[('typeraise', '-')] = [] rules['typechange'] = [] %y ruletype : APP | COMP | XCOMP | SUB | XSUB opt_dollar : DOLLAR : $$ = True : empty : $$ = False opt_atomcat : atomcat : empty : $$ = True opt_complexcat : COLON complexcat : $$ = $2 : empty : $$ = True plusminus_spec : PLUS | MINUS | PLUSMINUS rule : NO SEMI : rulesreinit() : NO ruletype SEMI | NO ruletype PLUSMINUS SEMI : \ del rules[($2, '+')]; del rules[($2, 
'-')] : NO ruletype PLUS SEMI : del rules[($2, '+')] : NO ruletype MINUS SEMI : del rules[($2, '-')] : NO TYPERAISE SEMI | NO TYPERAISE PLUSMINUS SEMI : \ rules[('typeraise', '+')] = []; rules[('typeraise', '-')] = [] : NO TYPERAISE PLUS SEMI : rules[('typeraise', '+')] = [] : NO TYPERAISE MINUS SEMI : rules[('typeraise', '-')] = [] : NO TYPECHANGE SEMI : rules['typechange'] = [] : ruletype PLUSMINUS SEMI : \ rules[($1, '+')] = True; rules[($1, '-')] = True : ruletype PLUS SEMI : rules[($1, '+')] = True : ruletype MINUS SEMI : rules[($1, '-')] = True : TYPERAISE plusminus_spec opt_dollar opt_complexcat SEMI: dotyperaise($2, $3, $4, True) : TYPERAISE plusminus_spec opt_dollar COLON complexcat GOESTO opt_atomcat SEMI: dotyperaise($2, $3, $5, $7) : TYPECHANGE COLON complexcat GOESTO complexcat SEMI: \ rules['typechange'] += [($3, $5, None)] : TYPECHANGE COLON complexcat GOESTO complexcat COLON hybrid_logic SEMI: \ rules['typechange'] += [($3, $5, $7)] rule_list : rule_list rule : empty rule_block : RULE LBRACE rule_list RBRACE ############################# # Testbed # ############################# %p def init_testbed(): global testbed_statements testbed_statements = [] def save_testbed(cur): cur.testbed_statements = testbed_statements def add_testbed_statement(bang, words, number): testbed_statements.append(['item', [('string', ' '.join(words))] + bang + number]) def make_testbed_xml(): return testbed_statements %y opt_testbed_bang: BANG: $$ = [('known', 'true')] : empty testbed_entry: opt_testbed_bang word_list SEMI: \ add_testbed_statement($1, $2, []) : opt_testbed_bang word_list COLON NUMBER SEMI: \ add_testbed_statement($1, $2, [('numOfParses', $4)]) testbed_block: TESTBED LBRACE testbed_entry* RBRACE ############################# # Relation-sorting # ############################# %p def init_relation_sorting(): global relation_sorting relation_sorting = [] def save_relation_sorting(cur): cur.relation_sorting = relation_sorting def make_relation_sorting_lexicon_xml(): if relation_sorting: return [['relation-sorting', [('order', ' '.join(relation_sorting))]]] else: return [] %y relation_sorting_block: RELATION_SORTING COLON word_or_star_0 * SEMI: global relation_sorting relation_sorting += $3 ############################# # End Yacc Declarations # ############################# %p def p_error(p): if p: error(p.lineno, "Syntax error at '%s'", p.value) else: error(None, "Unexpected end of file") ############################# # Lexer classes # ############################# # A Lexer that allows for a list of tokens to be pushed onto the front of # the list of tokens to be returned. Any number of such lists can be # pushed. class StackLexer(object): def __init__(self, lexer): self.lexer = lexer self.tokenstack = [] self.tokenstackind = [] self.lineno = 1 def input(self, s): self.lexer.input(s) def pushstack(self, stack): self.tokenstack.append(stack) self.tokenstackind.append(0) def token(self): global return_bogus_value if return_bogus_value: return_bogus_value = 0 tok = CCGToken('BOGUS_VALUE', 'BOGUS_VALUE') tok.lineno = self.lineno return tok while self.tokenstack: try: tok = self.tokenstack[-1][self.tokenstackind[-1]] self.tokenstackind[-1] += 1 return tok except IndexError: self.tokenstack.pop() self.tokenstackind.pop() if self.lexer: tok = self.lexer.token() if tok: self.lineno = tok.lineno return tok return None # A Lexer that checks for macro calls and expands them appropriately. 
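# Hypothetical rule and testbed blocks and (roughly) the XML the helpers above
# derive from them; the categories and the test sentence are made up:
#
#   rule {
#     no sub;
#     no typeraise;
#     typeraise +: np => s;
#   }
#   testbed {
#     the dog barks: 1;
#   }
#
# make_rules_xml() drops the harmonic substitution combinators and emits
#   <typeraising dir="forward" useDollar="false">
#     <arg><atomcat type="np"> ... </atomcat></arg>
#     <result><atomcat type="s"> ... </atomcat></result>
#   </typeraising>
# while make_testbed_xml() yields
#   <item string="the dog barks" numOfParses="1"/>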
class MacroLexer(StackLexer): def __init__(self, lexer): self.last_token = None self.indentlevel = 0 super(MacroLexer, self).__init__(lexer) def simpletoken(self): return super(MacroLexer, self).token() def noeoftoken(self): tok = self.innertoken() if not tok: raise SyntaxError("Unexpected EOF") return tok def innertoken(self): macrotok = self.simpletoken() if not macrotok or no_macro_sub or \ not (macrotok.type == 'ID' and macrotok.value in macro_defs): return macrotok else: newtok = self.simpletoken() if not newtok or newtok.type != 'LPAREN': self.pushstack([newtok]) return macrotok macrodef = macro_defs[macrotok.value] args = [] stop = False while not stop: thisarg = [] parencount = 0 expect_rbrace = 0 newtok = self.noeoftoken() if newtok.type == 'LBRACE': parencount += 1 expect_rbrace = 1 newtok = self.noeoftoken() while True: if newtok.type in ['LBRACE', 'LBRACKET', 'LPAREN']: parencount += 1 if newtok.type in ['RBRACE', 'RBRACKET', 'RPAREN']: parencount -= 1 if parencount < 0: if newtok.type == 'RPAREN': stop = True break error(newtok.lineno, "Syntax error at %s", newtok.value) parencount = 0 if parencount == 0 and newtok.type == 'RBRACE' and \ expect_rbrace: expect_rbrace = 0 newtok = self.noeoftoken() continue if parencount == 0 and newtok.type == 'COMMA': break thisarg.append(newtok) newtok = self.noeoftoken() args.append(thisarg) # Allow extra trailing comma if len(args) == len(macrodef.args) + 1 and not args[-1]: args.pop() if len(args) != len(macrodef.args): error(macrotok.lineno, "Invalid number of arguments to macro %s", macrotok.value) else: if super_macro_debug: print "Processing macro: %s" % macrotok.value self.pushstack(macrosub(macrodef, args, self.lineno)) return self.innertoken() def token(self): def pretty_output_transformed(token): def newline(num=1): outout('\n' * num) outout(' ' * 2 * self.indentlevel) if tok.lineno and self.lineno < tok.lineno: if tok.lineno - self.lineno == 1: newline() else: newline(2) elif self.last_token and (self.last_token.type == 'RBRACE' or self.last_token.type == 'SEMI'): newline() elif tok.type == 'LBRACE': newline(2) value = str(tok.value) lastval = self.last_token and str(self.last_token.value) if value and lastval and ((isalnumund(lastval[0]) and isalnumund(value[0])) or self.last_token.type in ('COLON', 'COMMA')): outout(' ') if tok.type == 'QUOTEDID': outout('"%s"', value) else: outout('%s', value) if tok.type == 'LBRACE': self.indentlevel += 1 elif tok.type == 'RBRACE': self.indentlevel -= 1 return tok # Beginning of actual function tok = self.innertoken() if options.transformed_input and self.lexer and \ tok and tok.type != 'BOGUS_VALUE': pretty_output_transformed(tok) self.last_token = tok # print "Saw token: %s" % tok return tok ############################# # Parsing # ############################# def init_parse_once(): # Initialize the parser once, at beginning. This does introspection on # the rules (i.e. p_*() functions) in this file. yacc.yacc(start='top', debug=yacc_debug, method='LALR', write_tables=0) # Parse a .CCG file whose contents are in STR. class parse_results: pass def parse_string(str): retval = parse_results() if str: retval.parse = yacc.parse(str, lexer=MacroLexer(globallexer)) else: retval.parse = [] save_global_state(retval) return retval ############################# # Graphics # ############################# # Given the return value from parsing (a list of abstract syntax tree-related # objects), draw them into the given frame. 
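# Minimal programmatic sketch of the parsing entry points defined in this file
# (hypothetical; assumes a readable .ccg file such as the tiny sample grammar):
#
#   init_global_state_once()    # build the LALR tables via yacc.yacc()
#   init_global_state()         # reset the per-file global tables
#   res = parse_string(open('ccg-format-grammars/tiny/tiny.ccg').read())
#   # res.parse holds the AST-like objects (e.g. CSFamily) built by the
#   # grammar actions; the feature, morph, lexicon, rule and testbed tables
#   # are copied onto res by save_global_state().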
def draw_parse(parse, cfile, childFrame, vars, canvas, mainFrame): row = 0 if parse: for x in parse: if hasattr(x, 'draw'): x.draw(childFrame, cfile, vars, row, canvas, mainFrame) row += 1 # Make the column containing the lexical entries expand as necessary childFrame.columnconfigure(1, weight=1) #frame.grid(column=0) ############################# # Initialization # ############################# # We encapsulate all global-variable initialization into a function that # can be called repeatedly so we can reinitialize our state and parse more # than one file. ARGV is the command-line arguments to parse (normally # sys.argv[1:]) and ERRORS_TO_STRING indicates whether to write stdout and # stderr output to strings or to the normal output locations. def init_global_state(errors_to_string=False): init_errors(errors_to_string) init_lexer() init_macros() init_features() init_morphology() init_lexicon() init_testbed() init_rules() init_relation_sorting() # When we're finished parsing, save the global state to the specified # object, so we can track the parse results for more than one file. def save_global_state(cur): save_errors(cur) save_lexer(cur) save_macros(cur) save_features(cur) save_morphology(cur) save_lexicon(cur) save_testbed(cur) save_rules(cur) save_relation_sorting(cur) def init_global_state_once(): init_parse_once() late_init_graphics_done = 0 # Graphics-related initialization that must be done late, after the first # Tk top-level window has been created. def late_init_graphics(): global late_init_graphics_done if not late_init_graphics_done: late_init_draw_once() late_init_graphics_done = 1 ############################# # Main driver # ############################# # Function to output a particular XML file def output_xml_file(prefix, grammar_name, filebase, top_level_tag, xml): xml_file = os.path.join(options.dir, '%s%s.xml' % (prefix, filebase)) if not options.quiet: errout('Outputting XML file: %s\n' % xml_file) xml = [top_level_tag, [('name', grammar_name), ('xmlns:xsi', 'http://www.w3.org/2001/XMLSchema-instance'), ('xsi:noNamespaceSchemaLocation', '../%s.xsd' % filebase)]] + xml fil = open(xml_file, 'w') fil.write('\n') print_xml(fil, xml) fil.close() def make_grammar_xml(prefix): return [['lexicon', [('file', '%slexicon.xml' % prefix)]], ['morphology', [('file', '%smorph.xml' % prefix)]], ['rules', [('file', '%srules.xml' % prefix)]], ['types', [('file', '%stypes.xml' % prefix)]]] # Map saying how to output the specified XML file output_file_map = { 'lexicon': ('ccg-lexicon', lambda pref: make_feature_lexicon_xml() + make_relation_sorting_lexicon_xml() + make_family_lexicon_xml()), 'rules': ('rules', lambda pref: make_rules_xml()), 'morph': ('morph', lambda pref: make_word_morph_xml() + make_feature_morph_xml()), 'types': ('types', lambda pref: make_feature_types_xml()), 'grammar': ('grammar', make_grammar_xml), 'testbed': ('regression', lambda pref: make_testbed_xml()), } # Process the --omit-output list. 
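# For example (hypothetical invocations, following the logic in main() below;
# the grammar file name is made up):
#
#   ccg2xml tiny.ccg                         # writes tiny-lexicon.xml, tiny-morph.xml,
#                                            # tiny-rules.xml, tiny-types.xml,
#                                            # tiny-grammar.xml and tiny-testbed.xml
#   ccg2xml --omit-output testbed tiny.ccg   # everything except tiny-testbed.xml
#   ccg2xml --omit-output +morph tiny.ccg    # only tiny-morph.xml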
def split_output_files(arg): files = re.split('[,\s]+', arg) for x in files: if x not in output_file_map: parser.error('Unknown file in --omit-output argument') return files def main(): parse_arguments(sys.argv[1:]) init_global_state_once() init_global_state() if options.omit_output: if options.omit_output[0] == '+': output_files = split_output_files(options.omit_output[1:]) else: suppress_output = split_output_files(options.omit_output) output_files = [] for x in output_file_map: if x not in suppress_output: output_files.append(x) else: output_files = [x for x in output_file_map] # Now actually parse the input arguments prefix = options.prefix lastfile = '-' args = global_args or ['-'] for arg in args: if arg == '-': if not options.quiet: errout("ccg2xml: Processing standard input\n") fil = sys.stdin else: if not options.quiet: errout("ccg2xml: Processing %s\n" % arg) fil = file(arg) lastfile = arg if prefix == None: (phead, ptail) = os.path.split(arg) (pbase, pext) = os.path.splitext(ptail) prefix = '%s-' % pbase retval = parse_string(fil.read()) # print "Retval: %s\n" % retval if macro_debug: print_macros() # Make output directory if needed, and output files if error_count > 0: if not options.quiet: maybe_errout('Errors during compilation, files not output.\n') sys.exit(1) else: if options.dir: if not os.path.isdir(options.dir): os.makedirs(options.dir) else: options.dir = '.' for x in output_files: file_info = output_file_map[x] output_xml_file(prefix, lastfile, x, file_info[0], file_info[1](prefix)) if __name__ == '__main__': # when run as a script main() # Local Variables: # mode: python # end: ================================================ FILE: src/ccg2xml/ccg_editor.py ================================================ #!/usr/bin/python # Author: Ben Wing # Date: April 2006 ############################################################################# # # # ccg_editor.ply # # # # Edit a CCG-format file, graphically. Will have a mode for displaying # # CCG files in a friendly fashion and allowing for editing of parts or # # all of the file. Will also have a mode for testing a CCG grammar, and # # allow for compilation and error-finding under control of the editor. # # # ############################################################################# # This code is based on PyEdit version 1.1, from Oreilly's Programming # Python, 2nd Edition, 2001, by Mark Lutz. from Tkinter import * # base widgets, constants from tkFileDialog import * # standard dialogs from tkMessageBox import * from tkSimpleDialog import * from tkColorChooser import askcolor from string import split, atoi import sys, os, string, md5 import ccg2xml import Tree import re START = '1.0' # index of first char: row=1,col=0 SEL_FIRST = SEL + '.first' # map sel tag to index SEL_LAST = SEL + '.last' # same as 'sel.last' FontScale = 0 # use bigger font on linux if sys.platform[:3] != 'win': # and other non-windows boxes FontScale = 3 # Initial top-level window; it's not clear we need this. # FIXME: It sucks that we have to call Tk() to get the first top-level window # but Toplevel() for all others. We should be able to call Tk() initially, # and then Toplevel() to create all top-level windows, including the first. root = None # List of all open CFile objects openfiles = {} filenames = [] class CTab(Frame): # Initialize this tab. Usually called from a subclass. 
PARENT is # the parent widget, CFILE the CFile object associated with the # top-level window, and TABNAME is the name of this tab (that tab # will be removed from the toolbar). def __init__(self, parent, cfile, tabname): Frame.__init__(self, parent) self.parent = parent self.cfile = cfile self.toolbar = None self.checkbar = None self.menubar = [ ('File', 0, [('Open...', 0, self.cfile.onOpen), ('New', 0, self.cfile.onNew), ('Save', 0, self.onSave), ('Save As...', 5, self.onSaveAs), ('Close', 0, self.cfile.onClose), 'separator', ('Quit VisCCG', 0, self.cfile.onQuit)] ), ('Tools', 0, [('Font List', 0, self.cfile.onFontList), ('Pick Bg...', 4, self.cfile.onPickBg), ('Pick Fg...', 0, self.cfile.onPickFg), ('Color List', 0, self.cfile.onColorList), 'separator', ('Info...', 0, self.cfile.onInfo)] ) ] self.toolbar = [ # ('Display', self.cfile.onDisplay, {'side': LEFT}), ('Edit', self.cfile.onEdit, {'side': LEFT}), ('Lexicon', self.cfile.onLexicon, {'side': LEFT}), ('Testbed', self.cfile.onTestbed, {'side': LEFT}), ('Features', self.cfile.onFeatures, {'side': LEFT}), ('Words', self.cfile.onWords, {'side': LEFT}), ('Rules', self.cfile.onRules, {'side': LEFT}), ('Quit', self.cfile.onClose, {'side': RIGHT}), ('Help', self.cfile.help, {'side': RIGHT}), ('Save', self.onSave, {'side': RIGHT}), ] # self.remove_toolbar_button(tabname) # Add MENU (a tuple corresponding to a single top-level menu item) # after the item with the name AFTER. def add_menu(self, after, menu): newmenu = [] for x in self.menubar: newmenu += [x] if x[0] == after: newmenu += [menu] self.menubar = newmenu # Remove the toolbar button named NAME. def remove_toolbar_button(self, name): newtoolbar = [] for x in self.toolbar: if x[0] != name: newtoolbar += [x] self.toolbar = newtoolbar def reinit(self): pass ##################### # File menu commands ##################### def onSave(self): self.onSaveAs(self.cfile.currfile) # may be None def onSaveAs(self, forcefile=None): file = forcefile or self.cfile.my_asksaveasfilename() if file: text = self.cfile.getAllText() try: open(file, 'w').write(text) except: showerror('CCG Editor', 'Could not write file ' + file) else: self.cfile.setFileName(file) # may be newly created self.cfile.edit_modified(NO) self.cfile.last_save_signature = self.cfile.getSignature(text) class CEdit(CTab): def __init__(self, parent, cfile): CTab.__init__(self, parent, cfile, 'Edit') self.debugFrame= None # Add a frame here, so that debug mode can be enabled # by embedding other objects within this frame editFrame = Frame(self, bd=1, bg= 'white') editFrame.pack(fill=BOTH, expand=YES, side=TOP) # Add a button frame, embed the button and # link to command for the debug mode btnFrame = Frame(editFrame, bd = 1) btnFrame.grid (row=0, columnspan=3, sticky=NSEW) vldButton = Button (btnFrame, text='Validate', command = lambda: self.onValidate(editFrame, cfile)) vldButton.pack(side=RIGHT) # Put the main edit window in the row below this vbar = Scrollbar(editFrame) hbar = Scrollbar(editFrame, orient='horizontal') self.text = Text(editFrame, padx=5, wrap='none', undo=YES) vbar.grid(row=1, column=2, sticky=NS) hbar.grid(row=2, columnspan=2, sticky=EW) # pack text last self.text.grid(row=1, column=1, sticky=NSEW) # else sbars clipped editFrame.columnconfigure(1, weight=1) editFrame.rowconfigure(1, weight=1) # Add a list containing line numbers self.lineList = Text(editFrame, relief=SUNKEN, bg='white', bd=2, yscrollcommand = vbar.set, width=3) self.lineList.grid(row=1, column=0, sticky=NS) 
self.lineList.config(font=self.cfile.fonts[0], bg=self.cfile.colors[0]['bg'], fg=self.cfile.colors[0]['fg']) # TODO: The first time the display of the line numbers # strangely doesn't go through --- somehow cfile # isn't initialized. However, it works properly in the display. # Need to understand why this happens. try: self.showLineNums() except KeyError: self.text.config(yscrollcommand=vbar.set) # call vbar.set on text move self.text.config(xscrollcommand=hbar.set) #vbar.config(command=text.yview) # call text.yview on scroll move hbar.config(command=self.text.xview) # or hbar['command']=text.xview self.text.config(font=self.cfile.fonts[0], bg=self.cfile.colors[0]['bg'], fg=self.cfile.colors[0]['fg']) #Setting the movement of the listbox and the text #together to be controlled by the scrollbar vbar.config(command=self.scrollSet) self.add_menu('File', ('Edit', 0, [('Cut', 0, self.onCut), ('Copy', 1, self.onCopy), ('Paste', 0, self.onPaste), 'separator', ('Delete', 0, self.onDelete), ('Select All', 0, self.onSelectAll)] )) self.add_menu('Edit', ('Search', 0, [('Goto...', 0, self.cfile.onGoto), ('Find...', 0, self.cfile.onFind), ('Refind', 0, self.cfile.onRefind), ('Change...', 0, self.onChange)] )) def scrollSet(self, *args): self.lineList.yview(*args) self.text.yview(*args) def reinit(self): self.showLineNums() self.text.focus() def showLineNums(self): #Make the list of lines editable self.lineList.config(state=NORMAL) textData = self.cfile.getAllText() listOfLines = textData.split('\n') for num in range(1,len(listOfLines)): self.lineList.insert(END,"%s\n" % num) #Now that we are done changing the number of lines, #we reset the text to be uneditable self.lineList.config(state=NORMAL) def onValidate(self, editFrame, cfile): #showwarning(title= 'Sorry', message='Validate and debug feature coming soon!') # Destroy previous display of debug or error messages # if present if self.debugFrame: self.debugFrame.grid_forget() # Compile if file signature has changed cfile.compile_if_needed() # Now, call the error debug routine if errors are found if (ccg2xml.error_count > 0): self.debugError(editFrame, cfile) else: showinfo(title='VisCCG: Success', message='No validation errors!') def debugError(self, editFrame, cfile): self.debugFrame = Frame(editFrame, bg='white', bd=2) self.debugFrame.grid(row=3, columnspan=2, sticky=NSEW) # Create Listbox and scrollbars sbar = Scrollbar(self.debugFrame) list = Listbox(self.debugFrame, relief=SUNKEN, bg='white', bd=2, yscrollcommand = sbar.set) sbar.config(command=list.yview) list.pack(fill=BOTH, side=LEFT, expand=YES) sbar.pack(fill=Y, side=RIGHT) # Display each message in the log for mesg in ccg2xml.message_log: type = mesg[0] lineno = mesg[1] errwarn = mesg[2] if lineno: dispError = type+' at Line '+str(lineno)+': '+errwarn else: dispError = type+': '+errwarn list.insert(END, dispError) ##################### # Edit menu commands ##################### def onCopy(self): # get text selected by mouse,etc if not self.text.tag_ranges(SEL): # save in cross-app clipboard showerror('CCG Editor', 'No text selected') else: text = self.text.get(SEL_FIRST, SEL_LAST) self.clipboard_clear() self.clipboard_append(text) def onDelete(self): # delete selected text, no save if not self.text.tag_ranges(SEL): showerror('CCG Editor', 'No text selected') else: self.text.delete(SEL_FIRST, SEL_LAST) def onCut(self): if not self.text.tag_ranges(SEL): showerror('CCG Editor', 'No text selected') else: self.onCopy() # save and delete selected text self.onDelete() def onPaste(self): try: text 
= self.selection_get(selection='CLIPBOARD') except TclError: showerror('CCG Editor', 'Nothing to paste') return self.text.insert(INSERT, text) # add at current insert cursor self.text.tag_remove(SEL, '1.0', END) self.text.tag_add(SEL, INSERT+'-%dc' % len(text), INSERT) self.text.see(INSERT) # select it, so it can be cut def onSelectAll(self): self.text.tag_add(SEL, '1.0', END+'-1c') # select entire text self.text.mark_set(INSERT, '1.0') # move insert point to top self.text.see(INSERT) # scroll to top ####################### # Search menu commands ####################### def onChange(self): new = Toplevel(self) Label(new, text='Find text:').grid(row=0, column=0) Label(new, text='Change to:').grid(row=1, column=0) self.change1 = Entry(new) self.change2 = Entry(new) self.change1.grid(row=0, column=1, sticky=EW) self.change2.grid(row=1, column=1, sticky=EW) Button(new, text='Find', command=self.onDoFind).grid(row=0, column=2, sticky=EW) Button(new, text='Apply', command=self.onDoChange).grid(row=1, column=2, sticky=EW) new.columnconfigure(1, weight=1) # expandable entrys def onDoFind(self): self.onFind(self.change1.get()) # Find in change box def onDoChange(self): if self.text.tag_ranges(SEL): # must find first self.text.delete(SEL_FIRST, SEL_LAST) # Apply in change self.text.insert(INSERT, self.change2.get()) # deletes if empty self.text.see(INSERT) self.onFind(self.change1.get()) # goto next appear self.text.update() # force refresh #################################### # Others, useful outside this class #################################### def isEmpty(self): return not self.getAllText() def getAllText(self): return self.text.get('1.0', END+'-1c') # extract text as a string def setAllText(self, text): self.text.delete('1.0', END) # store text string in widget self.text.insert(END, text) # or '1.0' self.text.mark_set(INSERT, '1.0') # move insert point to top self.text.see(INSERT) # scroll to top, insert set self.cfile.edit_modified(NO) def clearAllText(self): self.text.delete('1.0', END) # clear text in widget class CWords(CTab): def __init__(self, parent, cfile): CTab.__init__(self, parent, cfile, 'Words') self.child=None self.wordList = None self.cfile = cfile # Called when we switch to this mode using the toolbar at top. def reinit(self): if self.child: self.child.pack_forget() self.child = Frame(self, background='white') self.child.pack(expand=YES, fill=BOTH) scrollbar = Scrollbar(self.child, orient=VERTICAL) self.wordList = Listbox(self.child, yscrollcommand=scrollbar.set) self.wordList.grid(row=0, column=0, sticky=N+S+E+W) scrollbar.config(command= self.wordList.yview) scrollbar.grid(row=0, column=1, sticky=N+S) self.child.grid_rowconfigure(0, weight=1) self.child.grid_columnconfigure(0, weight=1) #If the data hasn't been compiled yet, then do so try: dummy = ccg2xml.morph_xml except: self.cfile.compile_if_needed() #Adding dummy code for all words for x in ccg2xml.morph_xml: assert x[0] == 'entry' self.wordList.insert (END, ccg2xml.getprop('word', x[1])) #print ccg2xml.getprop('word', x[1]) class CLexicon(CTab): class lexicon_vars(object): def __init__(self): self.show_feat_id = IntVar() self.show_feat_id.set(1) self.show_feat_struct = IntVar() self.show_feat_struct.set(1) self.show_full_features = IntVar() self.show_full_features.set(0) self.show_semantics = IntVar() self.show_semantics.set(1) def __init__(self, parent, cfile): CTab.__init__(self, parent, cfile, 'Lexicon') self.child = None self.cnv = None self.mainFrame = None self.vars = self.lexicon_vars() # FIXME? 
It's a bit awkward that ccg.ply has references to the # variables below scattered throughout it. But I'm not sure what # a better solution would be. self.checkbar = [ ("Show feature ID's", self.vars.show_feat_id), ("Show features", self.vars.show_feat_struct), ('Full-form features', self.vars.show_full_features), ('Show semantics', self.vars.show_semantics), ] # Called when we switch to this mode using the toolbar at top. def reinit(self): self.redraw() def redraw(self): self.cfile.compile_if_needed() if self.child: self.child.pack_forget() if self.mainFrame: self.mainFrame.pack_forget() self.mainFrame = Frame(self, bd=1, bg='white') self.mainFrame.pack_propagate(0) self.mainFrame.pack(expand=YES, fill=BOTH) self.mainFrame.grid_rowconfigure(0, weight=1) self.mainFrame.grid_columnconfigure(0, weight=1) xscrollbar = Scrollbar(self.mainFrame, orient=HORIZONTAL) xscrollbar.grid(row=1, column=0, sticky=E+W) yscrollbar = Scrollbar(self.mainFrame) yscrollbar.grid(row=0, column=1, sticky=N+S) self.cnv = Canvas(self.mainFrame, bd=2, xscrollcommand=xscrollbar.set, yscrollcommand=yscrollbar.set, width = 847, height=369) xscrollbar.config(command= self.cnv.xview) yscrollbar.config(command= self.cnv.yview) self.child = Frame(self.cnv, bd=2, relief=SUNKEN, background='white') self.cnv.create_window(0, 0, anchor='nw', window=self.child) ccg2xml.draw_parse(self.cfile.curparse.parse, self.cfile, self.child, self.vars, self.cnv, self.mainFrame) self.child.update_idletasks() self.cnv.config(scrollregion=self.cnv.bbox("all")) self.cnv.grid(row=0, column=0, sticky='NSEW') class CRules(CTab): def __init__(self, parent, cfile): CTab.__init__(self, parent, cfile, 'Rules') class CFeatures(CTab): def __init__(self, parent, cfile): CTab.__init__(self, parent, cfile, 'Features') self.child=None self.checkbar=None self.edit=None self.text=None # Called when we switch to this mode using the toolbar at top. 
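    # Illustrative note (not part of the original source): reinit() below keeps
    # the feature hierarchy in a plain dict, self.tree, mapping each node label
    # to the list of its child labels, rooted at the grammar file's base name.
    # For a hypothetical grammar "tiny.ccg" with a feature "num" whose values
    # are "sg" and "pl", it would look roughly like:
    #
    #     self.tree = {'tiny': ['num'], 'num': ['sg', 'pl'], 'sg': [], 'pl': []}
    #
    # get_treedata() then hands each node's children to the Tree widget on
    # demand, and expand_tree() walks the result recursively.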
def reinit(self): if self.child: self.child.pack_forget() self.child = Frame(self, background='white', width = 847, height = 369) self.child.pack(expand=YES, fill=BOTH) butframe = Frame(self.child, cursor='hand2', relief=SUNKEN, bd=2) butframe.pack(fill=X) but1 = Button(butframe, text='Expand All', command=self.expand_all) but1.pack(side=LEFT) but2 = Button(butframe, text='Contract All', command=self.contract_all) but2.pack(side=LEFT) # Force editing in the same frame: but a lower view: # pass self.child as the parent frame self.edit = Button(butframe, text='Edit', command= lambda:self.edit_tree(self.child)) self.edit.pack(side=RIGHT) featframe = Frame(self.child, bd=2, relief=SUNKEN, background='white') featframe.pack(expand=YES, fill=BOTH) self.cfile.compile_if_needed() # Build the tree self.tree={} self.root_name = re.sub(r'^(.*)\.(.*)$', r'\1', self.cfile.file) self.tree[self.root_name]=[] for feat in self.cfile.curparse.feature_to_values: self.tree[self.root_name] += [str(feat)] for feat in self.cfile.curparse.feature_to_values: self.tree[feat] = [] for x in self.cfile.curparse.feature_to_values[feat]: if x.name not in self.tree: self.tree[x.name] = [] for x in self.cfile.curparse.feature_to_values[feat]: if x.parents: par = x.parents[0] self.tree[par.name] += [x.name] else: self.tree[feat] += [x.name] # Define the images for opened and closed categories shut_icon=PhotoImage(data='R0lGODlhCQAQAJH/AMDAwAAAAGnD/wAAACH5BAEAAAAALAAA' 'AAAJABAAQAIdhI8hu2EqXIroyQrb\nyRf0VG0UxnSZ5jFjulrhaxQ' 'AO6olVwAAOw==') open_icon=PhotoImage(data='R0lGODlhEAAJAJH/AMDAwAAAAGnD/wAAACH5BAEAAAAALAAA' 'AAAQAAkAQAIahI+pyyEPg3KwPrko\nTqH7/yGUJWxcZTapUQAAO8b' 'yUgAAOw==') # Create the tree self.t=Tree.Tree(master=featframe, root_id='', root_label=self.root_name, collapsed_icon=shut_icon, expanded_icon=open_icon, get_contents_callback=self.get_treedata, line_flag=False) self.t.grid(row=0, column=0, sticky = 'nsew') featframe.grid_rowconfigure(0, weight=1) featframe.grid_columnconfigure(0, weight=1) sb=Scrollbar(featframe) sb.grid(row=0, column=1, sticky='ns') self.t.configure(yscrollcommand=sb.set) sb.configure(command=self.t.yview) sb=Scrollbar(featframe, orient=HORIZONTAL) sb.grid(row=1, column=0, sticky='ew') self.t.configure(xscrollcommand=sb.set) sb.configure(command=self.t.xview) # Expand the whole tree out self.expand_tree(self.t.root) # Returns the nodes rooted at the node passed and adds them to the tree def get_treedata(self,node): lbl = str(node.get_label()) children = self.tree[lbl] for x in children: if self.tree[x]: expands=1 else: expands=0 self.t.add_node(name=x,flag=expands) # Expand the tree rooted at node recursively def expand_tree(self, node): node.expand() for child in node.children(): if child.expandable(): self.expand_tree(child) def expand_all(self): self.expand_tree(self.t.root) def contract_all(self): self.t.root.collapse() def edit_tree(self, parent): editFrame = Frame(parent, bd=1, background='white') self.text = Text(editFrame, padx=5, wrap=None, undo = YES, background='white') vbar = Scrollbar(editFrame) hbar = Scrollbar(editFrame, orient='horizontal') self.text.config(yscrollcommand=vbar.set) # call vbar.set on text move self.text.config(xscrollcommand=hbar.set) vbar.config(command=self.text.yview) # call text.yview on scroll move hbar.config(command=self.text.xview) # or hbar['command']=text.xview # Change the text on the button, and also pass the rest # of the arguments so that the grid for the statements can be reset self.edit.config(text='Done', command= 
lambda:self.save_tree(parent)) # Changing the mode of the cfile object here, # so that once the user clicks done, # the whole object is recompiled and redisplayed self.cfile.mode= 'Edit' vbar.pack(side=RIGHT, fill=Y) hbar.pack(side=BOTTOM, fill=X) self.text.pack(fill= BOTH, expand= YES) # Set a mark at the beginning of the text self.text.mark_set("START", INSERT) self.text.mark_gravity("START", LEFT) # Push in the rest of the file's contents fileData = self.cfile.getAllText() self.text.insert(INSERT, fileData) # Move the insert position to the first occurence of the family name # FIXME: this is poor implementation # The positioning of the insert cursor should be happening by parsing the # CFG production rules, using CSFamily.prod.lineno and endlineno self.text.config(takefocus=True) idx= self.text.search('feature', "START") if idx: self.text.mark_set(CURRENT, idx) self.text.see(CURRENT) else: showwarning('Warning','Features not located in text') editFrame.pack(expand=YES, fill=BOTH) def save_tree(self, parent): # We force the text contents of the cfile object to copy over # all that is presently in the current text-box self.cfile.setAllText(self.text.get(1.0,END)) self.edit.config(text='Edit', command= lambda:self.edit_tree(parent)) # Recompile whatever was edited and redisplay # Note: changes are not saved hereby!! self.cfile.compile_if_needed() self.cfile.onFeatures() class CTestbed(CTab): def __init__(self, parent, cfile): CTab.__init__(self, parent, cfile, 'Testbed') self.child = None self.edit = None self.text = None self.editFrame = None self.cnv = None self.mainFrame = None self.newInsert = None def makelab(self, text, row, col, **props): lab = Label(self.child, text=text, background='white', **props) # Make the label grow to fill all space allocated for the column lab.grid(row=row, column=col, sticky='NSEW') # Called when we switch to this mode using the toolbar at top. 
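    # Illustrative note (not part of the original source): reinit() below draws
    # one row per entry of curparse.testbed_statements.  Each entry is expected
    # to look roughly like
    #
    #     ['item', [('numOfParses', 1), ('string', 'the dog barks')]]
    #
    # (the sentence is a made-up example); ccg2xml.getprop() pulls out the
    # 'numOfParses' and 'string' properties, and sentences with zero parses are
    # prefixed with '*'.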
def reinit(self): if self.child: self.child.pack_forget() if self.mainFrame: self.mainFrame.pack_forget() self.mainFrame = Frame(self, bd=1, bg='white') self.mainFrame.pack(expand=YES, fill=BOTH) self.mainFrame.grid_rowconfigure(0, weight=1) self.mainFrame.grid_columnconfigure(0, weight=1) xscrollbar = Scrollbar(self.mainFrame, orient=HORIZONTAL) xscrollbar.grid(row=1, column=0, sticky=E+W) yscrollbar = Scrollbar(self.mainFrame) yscrollbar.grid(row=0, column=1, sticky=N+S) self.cnv= Canvas(self.mainFrame, bd=2, xscrollcommand=xscrollbar.set, yscrollcommand=yscrollbar.set, width = 847, height=369) xscrollbar.config(command=self.cnv.xview) yscrollbar.config(command=self.cnv.yview) self.child = Frame(self.cnv, bd=2, relief=SUNKEN, background='white') self.child.rowconfigure(1, weight=1) self.child.columnconfigure(1, weight=1) self.child.pack(expand=YES, fill=BOTH) butnFrame = Frame(self.child, relief=SUNKEN, bd=2) butnFrame.grid(row=0, sticky='NSEW', columnspan=2) self.edit = Button(butnFrame, text='Edit', command= self.edit_testbed) self.edit.pack(side=RIGHT) self.newInsert = Button(butnFrame, text='New Sentence', command= self.new_sentence) self.newInsert.pack(side=RIGHT) self.cfile.compile_if_needed() self.makelab("Num Parses", 1, 0, bd=1, relief=SUNKEN, fg="#77AA77", font = ("Helvetica", FontScale +12)) self.makelab("Sentence", 1, 1, bd=1, relief=SUNKEN, fg="#77AA77", font = ("Helvetica", FontScale +12)) # Make the column containing the sentences grow to include all # extra space self.child.columnconfigure(1, weight=1) for i in xrange(len(self.cfile.curparse.testbed_statements)): x = self.cfile.curparse.testbed_statements[i] assert x[0] == 'item' x = x[1] # Left-justify the text numparse = ccg2xml.getprop('numOfParses', x) string = ccg2xml.getprop('string', x) # How many parses of the sentence are produced? 
self.makelab('%s' % numparse, i+2, 0) # Print the sentence itself self.makelab('%s%s' % (numparse == 0 and '*' or '', string), i+2, 1, anchor=W) self.cnv.create_window(0, 0, anchor='nw', window=self.child) self.child.update_idletasks() #self.child.grid(row=0, column=0, sticky=NSEW) self.cnv.config(scrollregion=self.cnv.bbox("all")) self.cnv.grid(row=0, column=0, sticky='NSEW') # Edit the testbed def edit_testbed(self): self.editFrame = Frame(self.mainFrame, bd=1, background='white') #self.editFrame.grid(row=len(self.cfile.curparse.testbed_statements)+3, columnspan=2, sticky='NSEW') self.editFrame.grid(row=2, columnspan=2, sticky='NSEW') self.text = Text(self.editFrame, padx=5, wrap=None, undo = YES, background='white') vbar = Scrollbar(self.editFrame) hbar = Scrollbar(self.editFrame, orient='horizontal') self.text.config(yscrollcommand=vbar.set) # call vbar.set on text move self.text.config(xscrollcommand=hbar.set) vbar.config(command=self.text.yview) # call text.yview on scroll move hbar.config(command=self.text.xview) # or hbar['command']=text.xview # Change the text on the button, and also pass the rest # of the arguments so that the grid for the statements can be reset self.edit.config(text='Done', command= self.save_testbed) # Changing the mode of the cfile object here, # so that once the user clicks done, # the whole object is recompiled and redisplayed self.cfile.mode= 'Edit' vbar.pack(side=RIGHT, fill=Y) hbar.pack(side=BOTTOM, fill=X) self.text.pack(fill= BOTH, expand= YES) # Set a mark at the beginning of the text self.text.mark_set("START", INSERT) self.text.mark_gravity("START", LEFT) # Push in the rest of the file's contents fileData = self.cfile.getAllText() self.text.insert(INSERT, fileData) # Move the insert position to the first occurence of the family name # FIXME: this is poor implementation # The positioning of the insert cursor should be happening by parsing the # CFG production rules, using CSFamily.prod.lineno and endlineno self.text.config(takefocus=True) idx= self.text.search('testbed', "START") if idx: self.text.mark_set(CURRENT, idx) self.text.see(CURRENT) else: showwarning(title= 'VisCCG: Warning', message='No initial testbed found') #self.editFrame.pack(expand=YES, fill=BOTH) self.child.update_idletasks() self.cnv.config(scrollregion=self.cnv.bbox("all")) # Save the edited text def save_testbed(self): # We force the text contents of the cfile object to copy over # all that is presently in the current text-box self.cfile.setAllText(self.text.get(1.0,END)) self.edit.config(text='Edit', command= self.edit_testbed) self.editFrame.pack_forget() # Recompile whatever was edited and redisplay # Note: changes are not saved hereby!! 
self.cfile.compile_if_needed() self.cfile.onTestbed() # Enter a new sentence def new_sentence(self): master = Tk() master.title('VisCCG: New Sentence for the testbed') sent = Entry(master, bg='#FFFFFF', width = 100) nParses = Entry(master, bg='#FFFFFF', width = 2) sLabel = Label (master, text = 'Sentence:') nLabel = Label (master, text = 'Number of parses:') sent.focus_set() b = Button(master, text="Add sentence", width=10, command= lambda:self.editNew(master, sent, nParses)) c = Button(master, text="Cancel", command= master.destroy) sent.grid (row=1, column=0, sticky = W) nParses.grid (row=1, column=1, sticky= W) sLabel.grid (row=0, column=0, sticky=W) nLabel.grid (row=0, column=1, sticky = W) b.grid (row=2, column = 0) c.grid (row=2, column = 1) # Print from the new sentence def editNew(self, master, sent, nParses): # Prepare the file's contents for editing fileData = self.cfile.getAllText() self.text = Text(master) self.text.mark_set("START", INSERT) self.text.mark_gravity("START", LEFT) self.text.insert(INSERT, fileData) testSent = sent.get() npSent = nParses.get() self.text.config(takefocus=True) idx= self.text.search('testbed', "START") if idx: self.text.mark_set("START", idx) idx = self.text.search('{', "START", forwards = True) self.text.mark_set("START", idx) idx = self.text.search('\n', "START", forwards = True) # FIXME: really poor search for locating the right position # to insert text here. Needs correction! self.text.mark_set(INSERT, idx) self.text.mark_gravity(INSERT, RIGHT) self.text.insert (INSERT, '\n\t'+ testSent+ ':\t'+ npSent+ ';') else: showwarning(title= 'VisCCG: Warning', message='No initial testbed found, creating new') self.text.mark_set(INSERT, END) self.text.mark_gravity(INSERT, RIGHT) self.text.insert (INSERT, ' testbed {\n') self.text.insert (INSERT, '\n\t'+ testSent+ ':\t'+ npSent+ ';') self.text.insert (INSERT, '}\n') # Set the original file's data to be this fileData= self.text.get(1.0, END) self.cfile.setAllText(fileData) # Destroy the entry window master.destroy() # Update the display self.cfile.mode= 'Edit' self.cfile.compile_if_needed() self.cfile.onTestbed() # Creates the top-level window and populates the widgets below it. class CFile(object): #### NOTE NOTE NOTE! Variables declared like this, in the class itself, #### are class variables (not instance variables) until they are #### assigned to. If you want pure instance variables, you need to #### initialize them inside of __init__(). # Hash table describing modes and the associated class modelist = {'Edit':CEdit, 'Lexicon':CLexicon, 'Features':CFeatures, 'Words':CWords, 'Testbed':CTestbed, 'Rules':CRules} startfiledir = '.' 
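    # Illustrative aside (not part of the original source), expanding on the
    # NOTE above: attributes assigned at class level, such as startfiledir or
    # the colors/fonts lists below, are shared by every CFile instance until an
    # instance rebinds them.  In particular, mutating a shared list (as
    # onFontList() does with append/del on self.fonts) is visible in all open
    # windows, e.g.:
    #
    #     a, b = CFile(), CFile()
    #     a.fonts.append(('courier', 30, 'normal'))   # b.fonts sees this too
    #     a.startfiledir = '/tmp'                     # rebinds only a's attribute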
ftypes = [('All files', '*'), # for file open dialog ('Text files', '.txt'), # customize in subclass ('Python files', '.py')] # or set in each instance colors = [{'fg':'black', 'bg':'white'}, # color pick list {'fg':'yellow', 'bg':'black'}, # first item is default {'fg':'white', 'bg':'blue'}, # tailor me as desired {'fg':'black', 'bg':'beige'}, # or do PickBg/Fg chooser {'fg':'yellow', 'bg':'purple'}, {'fg':'black', 'bg':'brown'}, {'fg':'lightgreen', 'bg':'darkgreen'}, {'fg':'darkblue', 'bg':'orange'}, {'fg':'orange', 'bg':'darkblue'}] fonts = [('courier', 9+FontScale, 'normal'), # platform-neutral fonts ('courier', 12+FontScale, 'normal'), # (family, size, style) ('courier', 10+FontScale, 'bold'), # or popup a listbox ('courier', 10+FontScale, 'italic'), # make bigger on linux ('times', 10+FontScale, 'normal'), ('helvetica', 10+FontScale, 'normal'), ('ariel', 10+FontScale, 'normal'), ('system', 10+FontScale, 'normal'), ('courier', 20+FontScale, 'normal')] def __init__(self, file=None): self.file = file self.openDialog = None self.saveDialog = None self.lastfind = None self.current_parse = None self.mode = None self.last_save_signature = None self.last_compile_signature = None # First top-level window is Tk(); rest are Toplevel() global root if not root: root = Tk() self.top = root else: self.top = Toplevel(root) ccg2xml.late_init_graphics() openfiles[self] = True self.top.protocol('WM_DELETE_WINDOW', self.onClose) # We create an outer frame to hold the toolbar and the main widget. # Create all the different kinds of main widget. # FIXME: Maybe outer isn't necessary? self.outer = Frame(self.top) self.outer.pack(expand=YES, fill=BOTH) # make frame stretchable self.modes = {} for mode in self.modelist: self.modes[mode] = self.modelist[mode](self.outer, self) self.main = None self.toolbar_widget = None self.checkbar_widget = None #self.switch_to('Edit') self.setFileName(None) if file: self.onFirstOpen(file) else: # When the user has just opened a new file # Need to load template from the src folder openccg_home = os.environ['OPENCCG_HOME'] template = open(openccg_home + '/src/ccg2xml/grammar_template.ccg', 'r').read() self.setAllText(template) # Save the MD5 signature for future comparison self.last_save_signature = self.getSignature(self.getAllText()) self.switch_to('Edit') def switch_to(self, mode): # Switch to a different mode (display, edit, test). Remove the # existing main and toolbar widgets, if existing. Redo the menubar # and toolbar widgets according to the new mode and then display # the new widgets. # # FIXME: We should probably create the menubar and toolbar widgets # only once, and remember them. if self.mode != mode: if self.main: self.main.pack_forget() if self.toolbar_widget: self.toolbar_widget.pack_forget() if self.checkbar_widget: self.checkbar_widget.pack_forget() self.mode = mode self.main = self.modes[mode] self.makeMenubar() self.makeToolbar(mode) self.makeCheckbar() #print "Reinit being called now... " self.main.reinit() # Pack the main widget after the toolbar, so it goes below it. self.main.pack(side=TOP, expand=YES, fill=BOTH) # Create the menubar; assumes that self.menubar has been set to the # appropriate menubar description. Note that the menubar has to be a # child of the top-level window itself rather than any child of it, so # that it can be correctly displayed at the top of the window -- or # possibly in its decoration (Windows) or at top of screen (Mac). # # From PP2E guimaker.py. 
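    # Illustrative note (not part of the original source): the menubar
    # description consumed here is the nested list built in CTab.__init__: a
    # list of (label, underline-index, items) triples whose items are command
    # tuples of the form (label, underline-index, callback) or the string
    # 'separator'; addMenuItems() below also accepts a nested items list to
    # build a cascading submenu.  For example:
    #
    #     [('File', 0, [('Open...', 0, self.onOpen), 'separator',
    #                   ('Quit VisCCG', 0, self.onQuit)])]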
def makeMenubar(self): menubar = Menu(self.top) self.top.config(menu=menubar) for (name, key, items) in self.main.menubar: pulldown = Menu(menubar) self.addMenuItems(pulldown, items) menubar.add_cascade(label=name, underline=key, menu=pulldown) if sys.platform[:3] == 'win': menubar.add_command(label='Help', command=self.help) else: pulldown = Menu(menubar) # linux needs real pulldown pulldown.add_command(label='About', command=self.help) menubar.add_cascade(label='Help', menu=pulldown) # Add items to a menu or submenu. From PP2E guimaker.py. def addMenuItems(self, menu, items): for item in items: # scan nested items list if item == 'separator': # string: add separator menu.add_separator({}) elif type(item) is list: # list: disabled item list for num in item: menu.entryconfig(num, state=DISABLED) elif type(item[2]) is not list: menu.add_command(label = item[0], # command: underline = item[1], # add command command = item[2]) # cmd=callable else: pullover = Menu(menu) self.addMenuItems(pullover, item[2]) # sublist: menu.add_cascade(label = item[0], # make submenu underline = item[1], # add cascade menu = pullover) def makeToolbar(self, selected): """ make toolbar (of buttons) at top, if any expand=no, fill=x so same width on resize """ if self.main.toolbar: self.toolbar_widget = Frame(self.outer, cursor='hand2', relief=SUNKEN, bd=2) self.toolbar_widget.pack(side=TOP, fill=X) for (name, action, where) in self.main.toolbar: but = Button(self.toolbar_widget, text=name, command=action) if name == selected: but.config(relief=SUNKEN) but.pack(where) def makeCheckbar(self): """ make check-button bar at top, if any expand=no, fill=x so same width on resize """ if self.main.checkbar: self.checkbar_widget = Frame(self.outer, cursor='hand2', relief=SUNKEN, bd=2) self.checkbar_widget.pack(side=TOP, fill=X) for (name, var) in self.main.checkbar: Checkbutton(self.checkbar_widget, text=name, variable=var, command=self.main.redraw).pack(side=LEFT) def getAllText(self): return self.modes['Edit'].getAllText() def setAllText(self, text): self.modes['Edit'].setAllText(text) #self.modes['Display'].setAllText(text) def _getints(self, string): """Internal function.""" if string: if type(string) is str: textwid = self.modes['Edit'].text return tuple(map(getint, textwid.tk.splitlist(string))) else: return string def edit(self, *args): """Internal method This method controls the undo mechanism and the modified flag. The exact behavior of the command depends on the option argument that follows the edit argument. The following forms of the command are currently supported: edit_modified, edit_redo, edit_reset, edit_separator and edit_undo """ textwid = self.modes['Edit'].text return self._getints( textwid.tk.call((textwid._w, 'edit') + args)) or () def edit_modified(self, arg=None): """Get or Set the modified flag If arg is not specified, returns the modified flag of the widget. The insert, delete, edit undo and edit redo commands or the user can set or clear the modified flag. If boolean is specified, sets the modified flag of the widget to arg. 
""" # Added to use md5 functionality to watch for changed data if arg is None: alltext = self.getAllText() if (self.last_save_signature != self.getSignature(alltext)): return YES return self.edit("modified", arg) def onInfo(self): text = self.getAllText() # added on 5/3/00 in 15 mins bytes = len(text) # words uses a simple guess: lines = len(string.split(text, '\n')) # any separated by whitespace words = len(string.split(text)) index = self.main.text.index(INSERT) where = tuple(string.split(index, '.')) showinfo('CCG Editor Information', 'Current location:\n\n' + 'line:\t%s\ncolumn:\t%s\n\n' % where + 'File text statistics:\n\n' + 'Modified: %s\n\n' % self.edit_modified()+ 'bytes:\t%d\nlines:\t%d\nwords:\t%d\n' % (bytes, lines, words)) ####################### # Search menu commands ####################### def onGoto(self, line=None): if not line: line = askinteger('CCG Editor', 'Enter line number') self.main.text.update() self.main.text.focus() if line is not None: maxindex = self.main.text.index(END+'-1c') maxline = atoi(split(maxindex, '.')[0]) if line > 0 and line <= maxline: self.main.text.mark_set(INSERT, '%d.0' % line) # goto line self.main.text.tag_remove(SEL, '1.0', END) # delete selects self.main.text.tag_add(SEL, INSERT, 'insert + 1l') # select line self.main.text.see(INSERT) # scroll to line else: showerror('CCG Editor', 'Bad line number') def onFind(self, lastkey=None): key = lastkey or askstring('CCG Editor', 'Enter search string') self.main.text.update() self.main.text.focus() self.lastfind = key if key: where = self.main.text.search(key, INSERT, END) # don't wrap if not where: showerror('CCG Editor', 'String not found') else: pastkey = where + '+%dc' % len(key) # index past key self.main.text.tag_remove(SEL, '1.0', END) # remove any sel self.main.text.tag_add(SEL, where, pastkey) # select key self.main.text.mark_set(INSERT, pastkey) # for next find self.main.text.see(where) # scroll display def onRefind(self): self.onFind(self.lastfind) ###################### # Tools menu commands ###################### def onFontList(self): self.fonts.append(self.fonts[0]) # pick next font in list del self.fonts[0] # resizes the text area self.modes['Edit'].text.config(font=self.fonts[0]) self.modes['Display'].text.config(font=self.fonts[0]) def onColorList(self): self.colors.append(self.colors[0]) # pick next color in list del self.colors[0] # move current to end self.modes['Edit'].text.config(fg=self.colors[0]['fg'], bg=self.colors[0]['bg']) self.modes['Display'].text.config(fg=self.colors[0]['fg'], bg=self.colors[0]['bg']) def onPickFg(self): self.pickColor('fg') def onPickBg(self): self.pickColor('bg') def pickColor(self, part): (triple, hexstr) = askcolor() if hexstr: apply(self.modes['Edit'].text.config, (), {part: hexstr}) apply(self.modes['Display'].text.config, (), {part: hexstr}) # def onRunCode(self, parallelmode=1): # """ # run Python code being edited--not an ide, but handy; # tries to run in file's dir, not cwd (may be pp2e root); # inputs and adds command-line arguments for script files; # code's stdin/out/err = editor's start window, if any; # but parallelmode uses start to open a dos box for i/o; # """ # from PP2E.launchmodes import System, Start, Fork # filemode = 0 # thefile = str(self.getFileName()) # cmdargs = askstring('CCG Editor', 'Commandline arguments?') or '' # if os.path.exists(thefile): # filemode = askyesno('CCG Editor', 'Run from file?') # if not filemode: # run text string # namespace = {'__name__': '__main__'} # run as top-level # sys.argv = [thefile] + 
string.split(cmdargs) # could use threads # exec self.getAllText() + '\n' in namespace # exceptions ignored # elif askyesno('CCG Editor', 'Text saved in file?'): # mycwd = os.getcwd() # cwd may be root # os.chdir(os.path.dirname(thefile) or mycwd) # cd for filenames # thecmd = thefile + ' ' + cmdargs # if not parallelmode: # run as file # System(thecmd, thecmd)() # block editor # else: # if sys.platform[:3] == 'win': # spawn in parallel # Start(thecmd, thecmd)() # or use os.spawnv # else: # Fork(thecmd, thecmd)() # spawn in parallel # os.chdir(mycwd) ##################### # File menu commands ##################### def getSignature(self, contents): return md5.md5(contents).digest() def my_askopenfilename(self): # objects remember last result dir/file if not self.openDialog: self.openDialog = Open(initialdir=self.startfiledir, filetypes=self.ftypes) return self.openDialog.show() def my_asksaveasfilename(self): # objects remember last result dir/file if not self.saveDialog: self.saveDialog = SaveAs(initialdir=self.startfiledir, filetypes=self.ftypes) self.last_save_signature = self.getSignature(self.getAllText()) return self.saveDialog.show() def onOpen(self): file = self.my_askopenfilename() # FIXME! Only create new window if file exists and is readable if file: CFile(file) def onFirstOpen(self, file): try: text = open(file, 'r').read() except: showerror('CCG Editor', 'Could not open file ' + file) else: self.setAllText(text) self.setFileName(file) def compile_if_needed(self): # Compare the last compiled MD5 signature and present one # and compile if needed. # To force compilation, set this signature to None text = self.getAllText() textSign = self.getSignature(text) if textSign != self.last_compile_signature: # Now compile ccg2xml.init_global_state(errors_to_string=True) ccg2xml.options.quiet = True self.curparse = ccg2xml.parse_string(text) self.last_compiled_signature = textSign def onDisplay(self): self.switch_to('Display') def onEdit(self): self.switch_to('Edit') def onLexicon(self): self.switch_to('Lexicon') def onTestbed(self): self.switch_to('Testbed') def onRules(self): self.switch_to('Rules') def onWords(self): self.switch_to('Words') def onFeatures(self): self.switch_to('Features') def onNew(self): CFile() def getFileName(self): return self.currfile def setFileName(self, name): self.currfile = name # for save if name: title = 'VisCCG Editor: %s' % name else: title = 'VisCCG Editor' self.top.title(title) self.top.iconname(title) def help(self): showinfo('Help', 'Sorry, no help for ' + self.__class__.__name__) # Close this window; if this is the last window, quit def onClose(self): assert self in openfiles if len(openfiles) == 1 or self.top == root: self.onQuit() # If we got this far, the user refused to quit, so do nothing else: ccg2xml.debug("fooooo\n") del openfiles[self] self.top.destroy() def onQuit(self): modfiles = False for f in openfiles: if f.edit_modified() == YES: modfiles = True break if not modfiles or askyesno('CCG Editor', 'Files are modified, Really quit?'): self.top.quit() def main(): ccg2xml.parse_arguments(sys.argv[1:]) ccg2xml.init_global_state_once() if ccg2xml.global_args and len(ccg2xml.global_args) > 0: # file name: fname = ccg2xml.global_args[0] else: fname = None CFile(fname) mainloop() if __name__ == '__main__': # when run as a script main() ================================================ FILE: src/ccg2xml/convert-ply.py ================================================ #!/usr/bin/python import sys import re import optparse import os.path import time # 
Author: Ben Wing # Date: April 2006 ############################################################################# # # # convert-ply.py # # # # Convert a .ply file into a .py file which can be run to generate a # # compiler for a language and use it to parse a specified file. This # # program is something like a compiler-compiler-compiler -- it uses a # # script to generate a compiler-compiler script, which in turn is used # # to generate a compiler, which in turn processes a program in some # # arbitrary syntax and does whatever it wants with it. Very meta!!! # # # ############################################################################# # The format of a .ply file is that of a .py file with YACC-like directives # interspersed. %y on a line by itself switches to YACC mode, and %p # switches back to Python mode. In YACC mode, lines should look like this: # lhs: rhs1 rhs2 ...: python code # first alternative # : rhs3 rhs4 ...: more python code # second alternative # : rhs5 rhs6 # alternative with default code of $$ = $1 # : rhs7 rhs8 ...: # # If no code follows the colon on the same line, the code consists # # of all following indented lines. # python code # python code # ... # # This represents a context-free rule where LHS expands to one or more # rules (e.g. RHS1 RHS2 ... or RHS3 RHS4 ... or RHS5 RHS6 ... etc.). # Associated with each rule is some code, which will be invoked when the # parser finds an appropriately matching right-hand side somewhere in # the stream of tokens being parsed and proceeds to "reduce" the relevant # tokens into the single left-hand side token LHS. Associated with each # token is a value. For terminal tokens, the value is the string in the # text that produced this token. For non-terminal tokens, the value is # determined by the code associated with the rule that produced this token # (i.e. one of the rules with this token as its LHS). The value of a # non-terminal token is set by assigning to $$; values of RHS tokens are # referenced using $1, $2, etc., where $1 is the first RHS token, $2 # is the second, etc. Hence, the default code of $$ = $1 assigns the # value of the first RHS token to the LHS token. The code can also # reference $@, which is a structure encapsulating all RHS values; this # is mostly useful for getting at $@.lineno, a function referring to the # starting line number of a particular token (especially $@.lineno(0), the # starting line number of the set of RHS tokens). # WARNING!!! Currently, PLY has a serious bug in its handling of empty # RHS rules; often it reports a syntax error in place of properly reducing # an empty RHS rule. If this happens, you must rewrite the appropriate # rules without the use of an empty RHS rule. (As of yet, I'm not sure # exactly what the circumstances are that trigger this bug.) # # Note that this bug exists as of PLY 1.6, which is what we are currently # using. It's quite possible that later versions of PLY (especially the # new PLY 2.x series) fix the bug. ########################################################################### # # Command-line options and usage # ########################################################################### usage = """%prog [OPTIONS] FILE ... Convert from .ply format to a .py file, for lex/yacc. """ parser = optparse.OptionParser(usage=usage) parser.add_option("-o", "--outfile", default=None, help="""Specify the output file. 
Default is y.INFILE.py, where INFILE is the source file's name minus any .ply extension.""", metavar="FILE") (options, args) = parser.parse_args() def syntax_error(err, line): global errors errors += 1 if errors > maxerr: raise SyntaxError("Too many errors (more than %s) when compiling" % maxerr) sys.stderr.write("%s in file %s, line %d: %s\n" % (err, current_file, current_lineno, line)) wordrange = r'\-a-zA-Z0-9_%' operrange = r'\+\*\|\?' wordre = '[%s]+' % wordrange def make_name_python_safe(name): return re.sub('[^A-Za-z0-9_]', '_', name) # Replace dollar signs in CODE to point to the actual array of RHS values. # If RENUMBER_AT is given, however, convert $RENUMBER_AT to None, and # subtract one from all $ references above this value. def replace_dollar_signs(code, renumber_at=None): newcode = "" prevright = 0 for match in re.finditer(r"""('''([^\\\n]|\\(.|\n))*?'''| \"\"\"([^\\\n]|\\(.|\n))*?\"\"\"| '([^\\\n]|\\(.|\n))*?'| \"([^\\\n]|\\(.|\n))*?\"| [#][^\n]*\n?| ([^\\'\"#]|\\(.|\n))*)""", code, re.VERBOSE): errored = 0 if prevright != match.start(0): syntax_error("Apparent syntax error in code at position %d" % prevright, code) errored = 1 newcode += code[prevright:match.start(0)] prevright = match.end(0) matchstr = match.group(0) if not matchstr: if match.start(0) == len(code): break elif not errored: errored = 1 syntax_error("Apparent syntax error in code at position %d" % match.start(0), code) elif matchstr[0] in '\'"#': # A comment or literal; don't substitute in it pass elif renumber_at: # Renumber def replace_dollar_def(match): str = match.group() ref = int(str[1:]) if ref == renumber_at: return '[]' elif ref > renumber_at: return '$%d' % (ref - 1) else: return str matchstr = re.sub(r'\$([0-9]+)', replace_dollar_def, matchstr) else: # Apply substitutions matchstr = re.sub(r'\$@', 'p', matchstr) matchstr = re.sub(r'\$\$', 'p[0]', matchstr) matchstr = re.sub(r'\$([0-9]+)', r'p[\1]', matchstr) newcode += matchstr return newcode # Output a single CFG rule def output_python_cfg_rule(fil, lhs, rhs, code): global unique_no # Look for occurrences of foo* or foo+; handle all of them by # adding appropriate list rules. newrhs = "" prevright = 0 for match in re.finditer(r'(%s)\s*([+*])' % wordre, rhs): matchstr = match.group(1) newrhs += rhs[prevright:match.start(0)] prevright = match.end(0) unique_no += 1 matchlhs = '%s_%s_list_%s' % (lhs, matchstr, unique_no) newrhs += matchlhs if match.group(2) == '+': output_python_cfg_rule(fil, matchlhs, matchstr, " $$ = [$1]\n") else: output_python_cfg_rule(fil, matchlhs, " ", " $$ = []\n") output_python_cfg_rule(fil, matchlhs, '%s %s' % (matchlhs, matchstr), " $$ = $1 + [$2]\n") if prevright: newrhs += rhs[prevright:] rhs = newrhs # Look for occurrences of foo?; handle by splitting into two rules # (It should be possible to handle by using empty rules, but this # is currently broken in PLY and more-or-less randomly doesn't work, # with the empty rule not being recognized and a syntax error # resulting) match = re.search(r'(%s)\s*[?]' % wordre, rhs) if match: matchtoken = match.group(1) # matched token, w/o following '?' leftrhs = rhs[0:match.start(0)] # text before match rightrhs = rhs[match.end(0):] # text after match # Output the "it's there" alternative output_python_cfg_rule(fil, lhs, leftrhs + matchtoken + rightrhs, code) # compute token ID, as would be referenced by a $# reference, based # on the text before the match. first delete everything before a # '|' (alternatives) then count the number of words. 
tokennum = len(re.sub('.*\|', '', leftrhs).split()) + 1 code = replace_dollar_signs(code, tokennum) # Output the "it's not there" alternative, with the dollar references # renumbered. We should *not* attempt any tail-recursion elimination # here, in case there are further foo? occurrences later in the file. output_python_cfg_rule(fil, lhs, leftrhs + rightrhs, code) else: unique_no += 1 print >> fil, "def p_%s_%d(p):" % (make_name_python_safe(lhs), unique_no) rhs = rhs.strip() rhs = re.sub(r'\s*\|\s*', r'\n | ', rhs) rhs = re.sub(r'\n\s*\n', '\n', rhs) if rhs.find('\n') >= 0: print >> fil, " '''%s : %s'''" % (lhs, rhs) else: print >> fil, " '%s : %s'" % (lhs, rhs) code = replace_dollar_signs(code) print >> fil, code def output_default_python_cfg_rule(fil, lhs, rhs): output_python_cfg_rule(fil, lhs, rhs, " $$ = $1\n") def finish_any_cfg(fil): global curlhs, currhs, yacc_python_mode, yacc_python_code if currhs: # A RHS not yet finished; finish it if yacc_python_code: output_python_cfg_rule(fil, curlhs, currhs, yacc_python_code) else: output_default_python_cfg_rule(fil, curlhs, currhs) clear_rule_context() def clear_rule_context(): global curlhs, currhs, yacc_python_mode, yacc_python_code curlhs = None currhs = None yacc_python_mode = False yacc_python_code = None ## Process file(s) args = args or ['-'] for arg in args: global current_file current_file = arg # Open input and output files if arg == '-': fil = sys.stdin else: fil = open(arg) if options.outfile: outarg = options.outfile else: if arg == '-': outarg = 'y.ccg.py' else: fname = arg if fname.endswith('.ply'): fname = fname[0:-4] (fdir, ffile) = os.path.split(fname) outarg = os.path.join(fdir, 'y.%s.py' % ffile) outfil = open(outarg, 'w') # Initialize state errors = 0 maxerr = 5 unique_no = 0 clear_rule_context() mode = 'python' contline = None print >> outfil, """#!/usr/bin/python ################## NOTE NOTE NOTE ################## # # This file (%s) was automatically generated from %s. # Generated by %s at %s. # # DO NOT MODIFY THIS FILE DIRECTLY. YOUR CHANGES WILL BE LOST. # Instead, modify the file `%s' that generated this file, and then # rerun `%s%s %s'. 
# ################## NOTE NOTE NOTE ################## """ % (outarg, current_file, sys.argv[0], time.asctime(), current_file, sys.argv[0], options.outfile and " -o %s" % options.outfile or "", current_file) global current_lineno current_lineno = 0 for line in fil: current_lineno += 1 line = line.rstrip("\r\n") if contline: line = contline + line contline = None if line == '%y': mode = 'yacc' elif line == '%p': mode = 'python' finish_any_cfg(outfil) elif line == '%l': mode = 'lex' else: if mode == 'python': print >> outfil, line else: if yacc_python_mode: if re.match(r'\S', line): yacc_python_mode = False else: yacc_python_code += line + '\n' continue if re.match(r'\s*#.*$', line): print >> outfil, line continue elif line and line[-1] == '\\': contline = line[0:-1] continue elif re.match(r'\s*$', line): print >> outfil, line continue # Eliminate comments, but conservatively, to avoid any # possibility of removing comments inside of quotes (which # should occur only in Python code, anyway, in which case # it doesn't really matter) line = re.sub(r'''^([^\'\"#]*)#.*$''', '\1', line) if mode == 'yacc': match = re.match(r'(%s)\s*(:.*)$' % wordre, line) if match: # We are starting a new rule finish_any_cfg(outfil) curlhs = match.group(1) line = ' ' + match.group(2) match = re.match(r'(\s*)(:)?([^:]*)(:.*)?$', line) if not match or not match.group(1): syntax_error("Unrecognized rule beginning", line) if re.match(r'^.*[^%s\s%s].*$' % (operrange, wordrange), match.group(3)): syntax_error("Illegal characters in RHS", line) if not match.group(2): # We are continuing an RHS if currhs == None: syntax_error("Invalid RHS continuation", line) else: currhs += match.expand(r' \3') else: # We are starting an RHS if curlhs == None: syntax_error("Invalid RHS without LHS", line) else: if currhs: output_default_python_cfg_rule(outfil, curlhs, currhs) currhs = match.group(3) if match.group(4): # strip colon, spaces code = match.group(4)[1:].strip() if not code: # Start eating the rest of the code until new rule yacc_python_mode = True yacc_python_code = "" else: code = ' ' + code + '\n' output_python_cfg_rule(outfil, curlhs, currhs, code) currhs = None finish_any_cfg(outfil) fil.close() outfil.close() ================================================ FILE: src/ccg2xml/grammar_template.ccg ================================================ ############ [name of the grammar] #################### # ## [a brief description of your grammar] ## [your name, Month Year] # # # For help on how to create grammars with VisCCG, see the tutorial: # http://comp.ling.utexas.edu/wiki/doku.php/openccg/ccggui_tut ####################### Features ####################### feature { } ######################### Words ######################### ######################### Rules ######################### ################## Lexicon/Categories #################### ####################### Testbed ######################### testbed { } ================================================ FILE: src/ccg2xml/lex.py ================================================ #----------------------------------------------------------------------------- # ply: lex.py # # Author: David M. Beazley (dave@dabeaz.com) # # Copyright (C) 2001-2005, David M. 
Beazley # # $Header: /cvsroot/openccg/openccg/src/ccg2xml/lex.py,v 1.1 2006/09/30 08:11:29 benwing Exp $ # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # See the file COPYING for a complete copy of the LGPL. # # # This module automatically constructs a lexical analysis module from regular # expression rules defined in a user-defined module. The idea is essentially the same # as that used in John Aycock's Spark framework, but the implementation works # at the module level rather than requiring the use of classes. # # This module tries to provide an interface that is closely modeled after # the traditional lex interface in Unix. It also differs from Spark # in that: # # - It provides more extensive error checking and reporting if # the user supplies a set of regular expressions that can't # be compiled or if there is any other kind of a problem in # the specification. # # - The interface is geared towards LALR(1) and LR(1) parser # generators. That is tokens are generated one at a time # rather than being generated in advanced all in one step. # # There are a few limitations of this module # # - The module interface makes it somewhat awkward to support more # than one lexer at a time. Although somewhat inelegant from a # design perspective, this is rarely a practical concern for # most compiler projects. # # - The lexer requires that the entire input text be read into # a string before scanning. I suppose that most machines have # enough memory to make this a minor issues, but it makes # the lexer somewhat difficult to use in interactive sessions # or with streaming data. # #----------------------------------------------------------------------------- r""" lex.py This module builds lex-like scanners based on regular expression rules. To use the module, simply write a collection of regular expression rules and actions like this: # lexer.py import lex # Define a list of valid tokens tokens = ( 'IDENTIFIER', 'NUMBER', 'PLUS', 'MINUS' ) # Define tokens as functions def t_IDENTIFIER(t): r' ([a-zA-Z_](\w|_)* ' return t def t_NUMBER(t): r' \d+ ' return t # Some simple tokens with no actions t_PLUS = r'\+' t_MINUS = r'-' # Initialize the lexer lex.lex() The tokens list is required and contains a complete list of all valid token types that the lexer is allowed to produce. Token types are restricted to be valid identifiers. This means that 'MINUS' is a valid token type whereas '-' is not. Rules are defined by writing a function with a name of the form t_rulename. Each rule must accept a single argument which is a token object generated by the lexer. This token has the following attributes: t.type = type string of the token. This is initially set to the name of the rule without the leading t_ t.value = The value of the lexeme. 
t.lineno = The value of the line number where the token was encountered For example, the t_NUMBER() rule above might be called with the following: t.type = 'NUMBER' t.value = '42' t.lineno = 3 Each rule returns the token object it would like to supply to the parser. In most cases, the token t is returned with few, if any modifications. To discard a token for things like whitespace or comments, simply return nothing. For instance: def t_whitespace(t): r' \s+ ' pass For faster lexing, you can also define this in terms of the ignore set like this: t_ignore = ' \t' The characters in this string are ignored by the lexer. Use of this feature can speed up parsing significantly since scanning will immediately proceed to the next token. lex requires that the token returned by each rule has an attribute t.type. Other than this, rules are free to return any kind of token object that they wish and may construct a new type of token object from the attributes of t (provided the new object has the required type attribute). If illegal characters are encountered, the scanner executes the function t_error(t) where t is a token representing the rest of the string that hasn't been matched. If this function isn't defined, a LexError exception is raised. The .text attribute of this exception object contains the part of the string that wasn't matched. The t.skip(n) method can be used to skip ahead n characters in the input stream. This is usually only used in the error handling rule. For instance, the following rule would print an error message and continue: def t_error(t): print "Illegal character in input %s" % t.value[0] t.skip(1) Of course, a nice scanner might wish to skip more than one character if the input looks very corrupted. The lex module defines a t.lineno attribute on each token that can be used to track the current line number in the input. The value of this variable is not modified by lex so it is up to your lexer module to correctly update its value depending on the lexical properties of the input language. To do this, you might write rules such as the following: def t_newline(t): r' \n+ ' t.lineno += t.value.count("\n") To initialize your lexer so that it can be used, simply call the lex.lex() function in your rule file. If there are any errors in your specification, warning messages or an exception will be generated to alert you to the problem. (dave: this needs to be rewritten) To use the newly constructed lexer from another module, simply do this: import lex import lexer plex.input("position = initial + rate*60") while 1: token = plex.token() # Get a token if not token: break # No more tokens ... do whatever ... Assuming that the module 'lexer' has initialized plex as shown above, parsing modules can safely import 'plex' without having to import the rule file or any additional imformation about the scanner you have defined. 
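For reference, the pieces described above can be combined into a single,
self-contained rule file along the following lines (a minimal sketch only;
the token names, patterns, and sample input are invented for illustration
and follow the Python 2 idiom used throughout this file):

    # calclex.py: illustrative sketch of a complete lex specification
    import lex

    tokens = ('NUMBER', 'PLUS', 'MINUS')

    # Simple tokens defined as strings
    t_PLUS  = r'\+'
    t_MINUS = r'-'

    # Characters skipped without producing tokens
    t_ignore = ' \t'

    # Tokens defined as functions; the docstring holds the regular expression
    def t_NUMBER(t):
        r'\d+'
        return t

    # Track line numbers; returning nothing discards the token
    def t_newline(t):
        r'\n+'
        t.lineno += t.value.count("\n")

    def t_error(t):
        print "Illegal character '%s'" % t.value[0]
        t.skip(1)

    # Build the lexer and run it over a sample string
    lexer = lex.lex()
    lexer.input("1 + 2 - 3")
    while 1:
        tok = lexer.token()
        if not tok: break
        print tok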
""" # ----------------------------------------------------------------------------- __version__ = "1.6" import re, types, sys, copy # Exception thrown when invalid token encountered and no default class LexError(Exception): def __init__(self,message,s): self.args = (message,) self.text = s # Token class class LexToken: def __str__(self): return "LexToken(%s,%r,%d)" % (self.type,self.value,self.lineno) def __repr__(self): return str(self) def skip(self,n): try: self._skipn += n except AttributeError: self._skipn = n # ----------------------------------------------------------------------------- # Lexer class # # input() - Store a new string in the lexer # token() - Get the next token # ----------------------------------------------------------------------------- class Lexer: def __init__(self): self.lexre = None # Master regular expression self.lexdata = None # Actual input data (as a string) self.lexpos = 0 # Current position in input text self.lexlen = 0 # Length of the input text self.lexindexfunc = [ ] # Reverse mapping of groups to functions and types self.lexerrorf = None # Error rule (if any) self.lextokens = None # List of valid tokens self.lexignore = None # Ignored characters self.lineno = 1 # Current line number self.debug = 0 # Debugging mode self.optimize = 0 # Optimized mode self.token = self.errtoken def __copy__(self): c = Lexer() c.lexre = self.lexre c.lexdata = self.lexdata c.lexpos = self.lexpos c.lexlen = self.lexlen c.lexindexfunc = self.lexindexfunc c.lexerrorf = self.lexerrorf c.lextokens = self.lextokens c.lexignore = self.lexignore c.debug = self.debug c.lineno = self.lineno c.optimize = self.optimize c.token = c.realtoken return c # ------------------------------------------------------------ # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self,s): if not isinstance(s, (unicode, types.StringType)): raise ValueError, "Expected a string" self.lexdata = s self.lexpos = 0 self.lexlen = len(s) self.token = self.realtoken # Change the token routine to point to realtoken() global token if token == self.errtoken: token = self.token # ------------------------------------------------------------ # errtoken() - Return error if token is called with no data # ------------------------------------------------------------ def errtoken(self): raise RuntimeError, "No input string given with input()" # ------------------------------------------------------------ # token() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast # as possible. 
Don't make changes unless you really know what # you are doing # ------------------------------------------------------------ def realtoken(self): # Make local copies of frequently referenced attributes lexpos = self.lexpos lexlen = self.lexlen lexignore = self.lexignore lexdata = self.lexdata while lexpos < lexlen: # This code provides some short-circuit code for whitespace, tabs, and other ignored characters if lexdata[lexpos] in lexignore: lexpos += 1 continue # Look for a regular expression match m = self.lexre.match(lexdata,lexpos) if m: i = m.lastindex lexpos = m.end() tok = LexToken() tok.value = m.group() tok.lineno = self.lineno tok.lexer = self func,tok.type = self.lexindexfunc[i] if not func: self.lexpos = lexpos return tok # If token is processed by a function, call it self.lexpos = lexpos newtok = func(tok) self.lineno = tok.lineno # Update line number # Every function must return a token, if nothing, we just move to next token if not newtok: continue # Verify type of the token. If not in the token map, raise an error if not self.optimize: if not self.lextokens.has_key(newtok.type): raise LexError, ("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( func.func_code.co_filename, func.func_code.co_firstlineno, func.__name__, newtok.type),lexdata[lexpos:]) return newtok # No match. Call t_error() if defined. if self.lexerrorf: tok = LexToken() tok.value = self.lexdata[lexpos:] tok.lineno = self.lineno tok.type = "error" tok.lexer = self oldpos = lexpos newtok = self.lexerrorf(tok) lexpos += getattr(tok,"_skipn",0) if oldpos == lexpos: # Error method didn't change text position at all. This is an error. self.lexpos = lexpos raise LexError, ("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) if not newtok: continue self.lexpos = lexpos return newtok self.lexpos = lexpos raise LexError, ("No match found", lexdata[lexpos:]) # No more input data self.lexpos = lexpos + 1 return None # ----------------------------------------------------------------------------- # validate_file() # # This checks to see if there are duplicated t_rulename() functions or strings # in the parser input file. This is done using a simple regular expression # match on each line in the filename. # ----------------------------------------------------------------------------- def validate_file(filename): import os.path base,ext = os.path.splitext(filename) if ext != '.py': return 1 # No idea what the file is. Return OK try: f = open(filename) lines = f.readlines() f.close() except IOError: return 1 # Oh well fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') counthash = { } linen = 1 noerror = 1 for l in lines: m = fre.match(l) if not m: m = sre.match(l) if m: name = m.group(1) prev = counthash.get(name) if not prev: counthash[name] = linen else: print "%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev) noerror = 0 linen += 1 return noerror # ----------------------------------------------------------------------------- # _read_lextab(module) # # Reads lexer table from a lextab file instead of using introspection. 
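#
# Usage note (illustrative, not part of the original comments): a rule file that calls
# lex.lex(optimize=1) builds its tables normally on the first run and writes them out
# to lextab.py (the default value of the lextab argument); on subsequent runs the
# import performed here succeeds and the precomputed master regular expression and
# index table are reused, with fdict mapping rule-function names back to the live
# functions in the calling module.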
# ----------------------------------------------------------------------------- def _read_lextab(lexer, fdict, module): exec "import %s as lextab" % module lexer.lexre = re.compile(lextab._lexre, re.VERBOSE) lexer.lexindexfunc = lextab._lextab for i in range(len(lextab._lextab)): t = lexer.lexindexfunc[i] if t: if t[0]: lexer.lexindexfunc[i] = (fdict[t[0]],t[1]) lexer.lextokens = lextab._lextokens lexer.lexignore = lextab._lexignore if lextab._lexerrorf: lexer.lexerrorf = fdict[lextab._lexerrorf] # ----------------------------------------------------------------------------- # lex(module) # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- def lex(module=None,debug=0,optimize=0,lextab="lextab"): ldict = None regex = "" error = 0 files = { } lexer = Lexer() lexer.debug = debug lexer.optimize = optimize global token,input if module: # User supplied a module object. if isinstance(module, types.ModuleType): ldict = module.__dict__ elif isinstance(module, types.InstanceType): _items = [(k,getattr(module,k)) for k in dir(module)] ldict = { } for (i,v) in _items: ldict[i] = v else: raise ValueError,"Expected a module or instance" else: # No module given. We might be able to get information from the caller. try: raise RuntimeError except RuntimeError: e,b,t = sys.exc_info() f = t.tb_frame f = f.f_back # Walk out to our calling function ldict = f.f_globals # Grab its globals dictionary if optimize and lextab: try: _read_lextab(lexer,ldict, lextab) if not lexer.lexignore: lexer.lexignore = "" token = lexer.token input = lexer.input return lexer except ImportError: pass # Get the tokens map if (module and isinstance(module,types.InstanceType)): tokens = getattr(module,"tokens",None) else: try: tokens = ldict["tokens"] except KeyError: tokens = None if not tokens: raise SyntaxError,"lex: module does not define 'tokens'" if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): raise SyntaxError,"lex: tokens must be a list or tuple." # Build a dictionary of valid token names lexer.lextokens = { } if not optimize: # Utility function for verifying tokens def is_identifier(s): for c in s: if not (c.isalnum() or c == '_'): return 0 return 1 for n in tokens: if not is_identifier(n): print "lex: Bad token name '%s'" % n error = 1 if lexer.lextokens.has_key(n): print "lex: Warning. Token '%s' multiply defined." % n lexer.lextokens[n] = None else: for n in tokens: lexer.lextokens[n] = None if debug: print "lex: tokens = '%s'" % lexer.lextokens.keys() # Get a list of symbols with the t_ prefix tsymbols = [f for f in ldict.keys() if f[:2] == 't_'] # Now build up a list of functions and a list of strings fsymbols = [ ] ssymbols = [ ] for f in tsymbols: if callable(ldict[f]): fsymbols.append(ldict[f]) elif isinstance(ldict[f], types.StringType): ssymbols.append((f,ldict[f])) else: print "lex: %s not defined as a function or string" % f error = 1 # Sort the functions by line number fsymbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) # Sort the strings by regular expression length ssymbols.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1]))) # Check for non-empty symbols if len(fsymbols) == 0 and len(ssymbols) == 0: raise SyntaxError,"lex: no rules of the form t_rulename are defined." 
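    # Illustrative note: the loop below assembles one master regular expression in
    # which every rule becomes a named group, e.g. roughly
    #     (?P<t_NUMBER>\d+)|(?P<t_PLUS>\+)
    # Function rules are added first, in order of their definition line numbers;
    # string rules follow, longest pattern first, so that longer literals are preferred.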
# Add all of the rules defined with actions first for f in fsymbols: line = f.func_code.co_firstlineno file = f.func_code.co_filename files[file] = None ismethod = isinstance(f, types.MethodType) if not optimize: nargs = f.func_code.co_argcount if ismethod: reqargs = 2 else: reqargs = 1 if nargs > reqargs: print "%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__) error = 1 continue if nargs < reqargs: print "%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__) error = 1 continue if f.__name__ == 't_ignore': print "%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__) error = 1 continue if f.__name__ == 't_error': lexer.lexerrorf = f continue if f.__doc__: if not optimize: try: c = re.compile(f.__doc__, re.VERBOSE) except re.error,e: print "%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e) error = 1 continue if debug: print "lex: Adding rule %s -> '%s'" % (f.__name__,f.__doc__) # Okay. The regular expression seemed okay. Let's append it to the master regular # expression we're building if (regex): regex += "|" regex += "(?P<%s>%s)" % (f.__name__,f.__doc__) else: print "%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__) # Now add all of the simple rules for name,r in ssymbols: if name == 't_ignore': lexer.lexignore = r continue if not optimize: if name == 't_error': raise SyntaxError,"lex: Rule 't_error' must be defined as a function" error = 1 continue if not lexer.lextokens.has_key(name[2:]): print "lex: Rule '%s' defined for an unspecified token %s." % (name,name[2:]) error = 1 continue try: c = re.compile(r,re.VERBOSE) except re.error,e: print "lex: Invalid regular expression for rule '%s'. %s" % (name,e) error = 1 continue if debug: print "lex: Adding rule %s -> '%s'" % (name,r) if regex: regex += "|" regex += "(?P<%s>%s)" % (name,r) if not optimize: for f in files.keys(): if not validate_file(f): error = 1 try: if debug: print "lex: regex = '%s'" % regex lexer.lexre = re.compile(regex, re.VERBOSE) # Build the index to function map for the matching engine lexer.lexindexfunc = [ None ] * (max(lexer.lexre.groupindex.values())+1) for f,i in lexer.lexre.groupindex.items(): handle = ldict[f] if type(handle) in (types.FunctionType, types.MethodType): lexer.lexindexfunc[i] = (handle,handle.__name__[2:]) else: # If rule was specified as a string, we build an anonymous # callback function to carry out the action lexer.lexindexfunc[i] = (None,f[2:]) # If a lextab was specified, we create a file containing the precomputed # regular expression and index table if lextab and optimize: lt = open(lextab+".py","w") lt.write("# %s.py. This file automatically created by PLY. Don't edit.\n" % lextab) lt.write("_lexre = %s\n" % repr(regex)) lt.write("_lextab = [\n"); for i in range(0,len(lexer.lexindexfunc)): t = lexer.lexindexfunc[i] if t: if t[0]: lt.write(" ('%s',%s),\n"% (t[0].__name__, repr(t[1]))) else: lt.write(" (None,%s),\n" % repr(t[1])) else: lt.write(" None,\n") lt.write("]\n"); lt.write("_lextokens = %s\n" % repr(lexer.lextokens)) lt.write("_lexignore = %s\n" % repr(lexer.lexignore)) if (lexer.lexerrorf): lt.write("_lexerrorf = %s\n" % repr(lexer.lexerrorf.__name__)) else: lt.write("_lexerrorf = None\n") lt.close() except re.error,e: print "lex: Fatal error. Unable to compile regular expression rules. %s" % e error = 1 if error: raise SyntaxError,"lex: Unable to build lexer." if not lexer.lexerrorf: print "lex: Warning. no t_error rule is defined." 
if not lexer.lexignore: lexer.lexignore = "" # Create global versions of the token() and input() functions token = lexer.token input = lexer.input return lexer # ----------------------------------------------------------------------------- # run() # # This runs the lexer as a main program # ----------------------------------------------------------------------------- def runmain(lexer=None,data=None): if not data: try: filename = sys.argv[1] f = open(filename) data = f.read() f.close() except IndexError: print "Reading from standard input (type EOF to end):" data = sys.stdin.read() if lexer: _input = lexer.input else: _input = input _input(data) if lexer: _token = lexer.token else: _token = token while 1: tok = _token() if not tok: break print "(%s,'%s',%d)" % (tok.type, tok.value, tok.lineno) ================================================ FILE: src/ccg2xml/yacc.py ================================================ #----------------------------------------------------------------------------- # ply: yacc.py # # Author(s): David M. Beazley (dave@dabeaz.com) # # Copyright (C) 2001-2005, David M. Beazley # # $Header: /cvsroot/openccg/openccg/src/ccg2xml/yacc.py,v 1.1 2006/09/30 08:11:29 benwing Exp $ # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # See the file COPYING for a complete copy of the LGPL. # # # This implements an LR parser that is constructed from grammar rules defined # as Python functions. Roughly speaking, this module is a cross between # John Aycock's Spark system and the GNU bison utility. # # The current implementation is only somewhat object-oriented. The # LR parser itself is defined in terms of an object (which allows multiple # parsers to co-exist). However, most of the variables used during table # construction are defined in terms of global variables. Users shouldn't # notice unless they are trying to define multiple parsers at the same # time using threads (in which case they should have their head examined). # # This implementation supports both SLR and LALR(1) parsing. LALR(1) # support was implemented by Elias Ioup (ezioup@alumni.uchicago.edu) # and hacked abit by Dave to run faster. # # :::::::: WARNING ::::::: # # Construction of LR parsing tables is fairly complicated and expensive. # To make this module run fast, a *LOT* of work has been put into # optimization---often at the expensive of readability and what might # consider to be good Python "coding style." Modify the code at your # own risk! # ---------------------------------------------------------------------------- __version__ = "1.6" #----------------------------------------------------------------------------- # === User configurable parameters === # # Change these to modify the default behavior of yacc (if you wish) #----------------------------------------------------------------------------- yaccdebug = 1 # Debugging mode. 
If set, yacc generates a # a 'parser.out' file in the current directory debug_file = 'parser.out' # Default name of the debugging file tab_module = 'parsetab' # Default name of the table module default_lr = 'SLR' # Default LR table generation method error_count = 3 # Number of symbols that must be shifted to leave recovery mode import re, types, sys, cStringIO, md5, os.path # Exception raised for yacc-related errors class YaccError(Exception): pass #----------------------------------------------------------------------------- # === LR Parsing Engine === # # The following classes are used for the LR parser itself. These are not # used during table construction and are independent of the actual LR # table generation algorithm #----------------------------------------------------------------------------- # This class is used to hold non-terminal grammar symbols during parsing. # It normally has the following attributes set: # .type = Grammar symbol type # .value = Symbol value # .lineno = Starting line number # .endlineno = Ending line number (optional, set automatically) class YaccSymbol: def __str__(self): return self.type def __repr__(self): return str(self) # This class is a wrapper around the objects actually passed to each # grammar rule. Index lookup and assignment actually assign the # .value attribute of the underlying YaccSymbol object. # The lineno() method returns the line number of a given # item (or 0 if not defined). The linespan() method returns # a tuple of (startline,endline) representing the range of lines # for a symbol. class YaccProduction: def __init__(self,s): self.slice = s self.pbstack = [] def __getitem__(self,n): return self.slice[n].value def __setitem__(self,n,v): self.slice[n].value = v def __len__(self): return len(self.slice) def lineno(self,n): return getattr(self.slice[n],"lineno",0) def linespan(self,n): startline = getattr(self.slice[n],"lineno",0) endline = getattr(self.slice[n],"endlineno",startline) return startline,endline def pushback(self,n): if n <= 0: raise ValueError, "Expected a positive value" if n > (len(self.slice)-1): raise ValueError, "Can't push %d tokens. Only %d are available." % (n,len(self.slice)-1) for i in range(0,n): self.pbstack.append(self.slice[-i-1]) # The LR Parsing engine. This is defined as a class so that multiple parsers # can exist in the same process. A user never instantiates this directly. # Instead, the global yacc() function should be used to create a suitable Parser # object. class Parser: def __init__(self,magic=None): # This is a hack to keep users from trying to instantiate a Parser # object directly. if magic != "xyzzy": raise YaccError, "Can't instantiate Parser. Use yacc() instead." 
# Reset internal state self.productions = None # List of productions self.errorfunc = None # Error handling function self.action = { } # LR Action table self.goto = { } # LR goto table self.require = { } # Attribute require table self.method = "Unknown LR" # Table construction method used def errok(self): self.errorcount = 0 def restart(self): del self.statestack[:] del self.symstack[:] sym = YaccSymbol() sym.type = '$' self.symstack.append(sym) self.statestack.append(0) def parse(self,input=None,lexer=None,debug=0): lookahead = None # Current lookahead symbol lookaheadstack = [ ] # Stack of lookahead symbols actions = self.action # Local reference to action table goto = self.goto # Local reference to goto table prod = self.productions # Local reference to production list pslice = YaccProduction(None) # Production object passed to grammar rules pslice.parser = self # Parser object self.errorcount = 0 # Used during error recovery # If no lexer was given, we will try to use the lex module if not lexer: import lex as lexer pslice.lexer = lexer # If input was supplied, pass to lexer if input: lexer.input(input) # Tokenize function get_token = lexer.token statestack = [ ] # Stack of parsing states self.statestack = statestack symstack = [ ] # Stack of grammar symbols self.symstack = symstack errtoken = None # Err token # The start state is assumed to be (0,$) statestack.append(0) sym = YaccSymbol() sym.type = '$' symstack.append(sym) while 1: # Get the next symbol on the input. If a lookahead symbol # is already set, we just use that. Otherwise, we'll pull # the next token off of the lookaheadstack or from the lexer if not lookahead: if not lookaheadstack: lookahead = get_token() # Get the next token else: lookahead = lookaheadstack.pop() if not lookahead: lookahead = YaccSymbol() lookahead.type = '$' if debug: errorlead = ("%s . %s" % (" ".join([xx.type for xx in symstack][1:]), str(lookahead))).lstrip() # Check the action table s = statestack[-1] ltype = lookahead.type t = actions.get((s,ltype),None) if t is not None: if t > 0: # shift a symbol on the stack if ltype == '$': # Error, end of input sys.stderr.write("yacc: Parse error. EOF\n") return statestack.append(t) if debug > 1: sys.stderr.write("%-60s shift state %s\n" % (errorlead, t)) symstack.append(lookahead) lookahead = None # Decrease error count on successful shift if self.errorcount > 0: self.errorcount -= 1 continue if t < 0: # reduce a symbol on the stack, emit a production p = prod[-t] pname = p.name plen = p.len # Get production function sym = YaccSymbol() sym.type = pname # Production name sym.value = None if debug > 1: sys.stderr.write("%-60s reduce %d\n" % (errorlead, -t)) if plen: targ = symstack[-plen-1:] targ[0] = sym try: sym.lineno = targ[1].lineno sym.endlineno = getattr(targ[-1],"endlineno",targ[-1].lineno) except AttributeError: sym.lineno = 0 del symstack[-plen:] del statestack[-plen:] else: sym.lineno = 0 targ = [ sym ] pslice.slice = targ pslice.pbstack = [] # Call the grammar rule with our special slice object p.func(pslice) # If there was a pushback, put that on the stack if pslice.pbstack: lookaheadstack.append(lookahead) for _t in pslice.pbstack: lookaheadstack.append(_t) lookahead = None symstack.append(sym) statestack.append(goto[statestack[-1],pname]) continue if t == 0: n = symstack[-1] return getattr(n,"value",None) sys.stderr.write(errorlead, "\n") if t == None: if debug: sys.stderr.write(errorlead + "\n") # We have some kind of parsing error here. 
To handle # this, we are going to push the current token onto # the tokenstack and replace it with an 'error' token. # If there are any synchronization rules, they may # catch it. # # In addition to pushing the error token, we call call # the user defined p_error() function if this is the # first syntax error. This function is only called if # errorcount == 0. if not self.errorcount: self.errorcount = error_count errtoken = lookahead if errtoken.type == '$': errtoken = None # End of file! if self.errorfunc: global errok,token,restart errok = self.errok # Set some special functions available in error recovery token = get_token restart = self.restart tok = self.errorfunc(errtoken) del errok, token, restart # Delete special functions if not self.errorcount: # User must have done some kind of panic # mode recovery on their own. The # returned token is the next lookahead lookahead = tok errtoken = None continue else: if errtoken: if hasattr(errtoken,"lineno"): lineno = lookahead.lineno else: lineno = 0 if lineno: sys.stderr.write("yacc: Syntax error at line %d, token=%s\n" % (lineno, errtoken.type)) else: sys.stderr.write("yacc: Syntax error, token=%s" % errtoken.type) else: sys.stderr.write("yacc: Parse error in input. EOF\n") return else: self.errorcount = error_count # case 1: the statestack only has 1 entry on it. If we're in this state, the # entire parse has been rolled back and we're completely hosed. The token is # discarded and we just keep going. if len(statestack) <= 1 and lookahead.type != '$': lookahead = None errtoken = None # Nuke the pushback stack del lookaheadstack[:] continue # case 2: the statestack has a couple of entries on it, but we're # at the end of the file. nuke the top entry and generate an error token # Start nuking entries on the stack if lookahead.type == '$': # Whoa. We're really hosed here. Bail out return if lookahead.type != 'error': sym = symstack[-1] if sym.type == 'error': # Hmmm. Error is on top of stack, we'll just nuke input # symbol and continue lookahead = None continue t = YaccSymbol() t.type = 'error' if hasattr(lookahead,"lineno"): t.lineno = lookahead.lineno t.value = lookahead lookaheadstack.append(lookahead) lookahead = t else: symstack.pop() statestack.pop() continue # Call an error function here raise RuntimeError, "yacc: internal parser error!!!\n" # ----------------------------------------------------------------------------- # === Parser Construction === # # The following functions and variables are used to implement the yacc() function # itself. This is pretty hairy stuff involving lots of error checking, # construction of LR items, kernels, and so forth. Although a lot of # this work is done using global variables, the resulting Parser object # is completely self contained--meaning that it is safe to repeatedly # call yacc() with different grammars in the same application. # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- # validate_file() # # This function checks to see if there are duplicated p_rulename() functions # in the parser module file. Without this function, it is really easy for # users to make mistakes by cutting and pasting code fragments (and it's a real # bugger to try and figure out why the resulting parser doesn't work). Therefore, # we just do a little regular expression pattern matching of def statements # to try and detect duplicates. 
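#
# For instance (an illustrative sketch, not taken from this file), a grammar rule is
# normally supplied as a p_ function whose docstring carries the BNF and whose body
# assigns the result through the production slice:
#
#     def p_expression_plus(p):
#         'expression : expression PLUS term'
#         p[0] = p[1] + p[3]
#
# Pasting a second def p_expression_plus() into the same module silently rebinds the
# name, so only the last copy is seen; that is exactly the mistake checked for below.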
# ----------------------------------------------------------------------------- def validate_file(filename): base,ext = os.path.splitext(filename) if ext != '.py': return 1 # No idea. Assume it's okay. try: f = open(filename) lines = f.readlines() f.close() except IOError: return 1 # Oh well # Match def p_funcname( fre = re.compile(r'\s*def\s+(p_[a-zA-Z_0-9]*)\(') counthash = { } linen = 1 noerror = 1 for l in lines: m = fre.match(l) if m: name = m.group(1) prev = counthash.get(name) if not prev: counthash[name] = linen else: sys.stderr.write("%s:%d: Function %s redefined. Previously defined on line %d\n" % (filename,linen,name,prev)) noerror = 0 linen += 1 return noerror # This function looks for functions that might be grammar rules, but which don't have the proper p_suffix. def validate_dict(d): for n,v in d.items(): if n[0:2] == 'p_' and type(v) in (types.FunctionType, types.MethodType): continue if n[0:2] == 't_': continue if n[0:2] == 'p_': sys.stderr.write("yacc: Warning. '%s' not defined as a function\n" % n) if 1 and isinstance(v,types.FunctionType) and v.func_code.co_argcount == 1: try: doc = v.__doc__.split(" ") if doc[1] == ':': sys.stderr.write("%s:%d: Warning. Possible grammar rule '%s' defined without p_ prefix.\n" % (v.func_code.co_filename, v.func_code.co_firstlineno,n)) except StandardError: pass # ----------------------------------------------------------------------------- # === GRAMMAR FUNCTIONS === # # The following global variables and functions are used to store, manipulate, # and verify the grammar rules specified by the user. # ----------------------------------------------------------------------------- # Initialize all of the global variables used during grammar construction def initialize_vars(): global Productions, Prodnames, Prodmap, Terminals global Nonterminals, First, Follow, Precedence, LRitems global Errorfunc, Signature, Requires # LALR(1) globals global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical Productions = [None] # A list of all of the productions. The first # entry is always reserved for the purpose of # building an augmented grammar Prodnames = { } # A dictionary mapping the names of nonterminals to a list of all # productions of that nonterminal. Prodmap = { } # A dictionary that is only used to detect duplicate # productions. Terminals = { } # A dictionary mapping the names of terminal symbols to a # list of the rules where they are used. Nonterminals = { } # A dictionary mapping names of nonterminals to a list # of rule numbers where they are used. First = { } # A dictionary of precomputed FIRST(x) symbols Follow = { } # A dictionary of precomputed FOLLOW(x) symbols Precedence = { } # Precedence rules for each terminal. Contains tuples of the # form ('right',level) or ('nonassoc', level) or ('left',level) LRitems = [ ] # A list of all LR items for the grammar. These are the # productions with the "dot" like E -> E . PLUS E Errorfunc = None # User defined error handler Signature = md5.new() # Digital signature of the grammar rules, precedence # and other information. Used to determined when a # parsing table needs to be regenerated. 
Requires = { } # Requires list # LALR(1) Initialization Prodempty = { } # A dictionary of all productions that have an empty rule # of the form P : TReductions = { } # A dictionary of precomputer reductions from # nonterminals to terminals NTReductions = { } # A dictionary of precomputed reductions from # nonterminals to nonterminals GotoSetNum = { } # A dictionary that remembers goto sets based on # the state number and symbol Canonical = { } # A list of LR item sets. A LR item set is a list of LR # items that represent the state of the parser # File objects used when creating the parser.out debugging file global _vf, _vfc _vf = cStringIO.StringIO() _vfc = cStringIO.StringIO() # ----------------------------------------------------------------------------- # class Production: # # This class stores the raw information about a single production or grammar rule. # It has a few required attributes: # # name - Name of the production (nonterminal) # prod - A list of symbols making up its production # number - Production number. # # In addition, a few additional attributes are used to help with debugging or # optimization of table generation. # # file - File where production action is defined. # lineno - Line number where action is defined # func - Action function # prec - Precedence level # lr_next - Next LR item. Example, if we are ' E -> E . PLUS E' # then lr_next refers to 'E -> E PLUS . E' # lr_index - LR item index (location of the ".") in the prod list. # lookaheads - LALR lookahead symbols for this item # len - Length of the production (number of symbols on right hand side) # ----------------------------------------------------------------------------- class Production: def __init__(self,**kw): for k,v in kw.items(): setattr(self,k,v) self.lr_index = -1 self.lr0_added = 0 # Flag indicating whether or not added to LR0 closure self.lr1_added = 0 # Flag indicating whether or not added to LR1 self.usyms = [ ] self.lookaheads = { } self.lk_added = { } self.setnumbers = [ ] def __str__(self): if self.prod: s = "%s -> %s" % (self.name," ".join(self.prod)) else: s = "%s -> " % self.name return s def __repr__(self): return str(self) # Compute lr_items from the production def lr_item(self,n): if n > len(self.prod): return None p = Production() p.name = self.name p.prod = list(self.prod) p.number = self.number p.lr_index = n p.lookaheads = { } p.setnumbers = self.setnumbers p.prod.insert(n,".") p.prod = tuple(p.prod) p.len = len(p.prod) p.usyms = self.usyms # Precompute list of productions immediately following try: p.lrafter = Prodnames[p.prod[n+1]] except (IndexError,KeyError),e: p.lrafter = [] try: p.lrbefore = p.prod[n-1] except IndexError: p.lrbefore = None return p class MiniProduction: pass # Utility function def is_identifier(s): for c in s: if not (c.isalnum() or c == '_'): return 0 return 1 # ----------------------------------------------------------------------------- # add_production() # # Given an action function, this function assembles a production rule. # The production rule is assumed to be found in the function's docstring. # This rule has the general syntax: # # name1 ::= production1 # | production2 # | production3 # ... # | productionn # name2 ::= production1 # | production2 # ... # ----------------------------------------------------------------------------- def add_production(f,file,line,prodname,syms): if Terminals.has_key(prodname): sys.stderr.write("%s:%d: Illegal rule name '%s'. 
Already defined as a token.\n" % (file,line,prodname)) return -1 if prodname == 'error': sys.stderr.write("%s:%d: Illegal rule name '%s'. error is a reserved word.\n" % (file,line,prodname)) return -1 if not is_identifier(prodname): sys.stderr.write("%s:%d: Illegal rule name '%s'\n" % (file,line,prodname)) return -1 for s in syms: if not is_identifier(s) and s != '%prec': sys.stderr.write("%s:%d: Illegal name '%s' in rule '%s'\n" % (file,line,s, prodname)) return -1 # See if the rule is already in the rulemap map = "%s -> %s" % (prodname,syms) if Prodmap.has_key(map): m = Prodmap[map] sys.stderr.write("%s:%d: Duplicate rule %s.\n" % (file,line, m)) sys.stderr.write("%s:%d: Previous definition at %s:%d\n" % (file,line, m.file, m.line)) return -1 p = Production() p.name = prodname p.prod = syms p.file = file p.line = line p.func = f p.number = len(Productions) Productions.append(p) Prodmap[map] = p if not Nonterminals.has_key(prodname): Nonterminals[prodname] = [ ] # Add all terminals to Terminals i = 0 while i < len(p.prod): t = p.prod[i] if t == '%prec': try: precname = p.prod[i+1] except IndexError: sys.stderr.write("%s:%d: Syntax error. Nothing follows %%prec.\n" % (p.file,p.line)) return -1 prec = Precedence.get(precname,None) if not prec: sys.stderr.write("%s:%d: Nothing known about the precedence of '%s'\n" % (p.file,p.line,precname)) return -1 else: p.prec = prec del p.prod[i] del p.prod[i] continue if Terminals.has_key(t): Terminals[t].append(p.number) # Is a terminal. We'll assign a precedence to p based on this if not hasattr(p,"prec"): p.prec = Precedence.get(t,('right',0)) else: if not Nonterminals.has_key(t): Nonterminals[t] = [ ] Nonterminals[t].append(p.number) i += 1 if not hasattr(p,"prec"): p.prec = ('right',0) # Set final length of productions p.len = len(p.prod) p.prod = tuple(p.prod) # Calculate unique syms in the production p.usyms = [ ] for s in p.prod: if s not in p.usyms: p.usyms.append(s) # Add to the global productions list try: Prodnames[p.name].append(p) except KeyError: Prodnames[p.name] = [ p ] return 0 # Given a raw rule function, this function rips out its doc string # and adds rules to the grammar def add_function(f): line = f.func_code.co_firstlineno file = f.func_code.co_filename error = 0 if isinstance(f,types.MethodType): reqdargs = 2 else: reqdargs = 1 if f.func_code.co_argcount > reqdargs: sys.stderr.write("%s:%d: Rule '%s' has too many arguments.\n" % (file,line,f.__name__)) return -1 if f.func_code.co_argcount < reqdargs: sys.stderr.write("%s:%d: Rule '%s' requires an argument.\n" % (file,line,f.__name__)) return -1 if f.__doc__: # Split the doc string into lines pstrings = f.__doc__.splitlines() lastp = None dline = line for ps in pstrings: dline += 1 p = ps.split() if not p: continue try: if p[0] == '|': # This is a continuation of a previous rule if not lastp: sys.stderr.write("%s:%d: Misplaced '|'.\n" % (file,dline)) return -1 prodname = lastp if len(p) > 1: syms = p[1:] else: syms = [ ] else: prodname = p[0] lastp = prodname assign = p[1] if len(p) > 2: syms = p[2:] else: syms = [ ] if assign != ':' and assign != '::=': sys.stderr.write("%s:%d: Syntax error. 
Expected ':'\n" % (file,dline)) return -1 e = add_production(f,file,dline,prodname,syms) error += e except StandardError: sys.stderr.write("%s:%d: Syntax error in rule '%s'\n" % (file,dline,ps)) error -= 1 else: sys.stderr.write("%s:%d: No documentation string specified in function '%s'\n" % (file,line,f.__name__)) return error # Cycle checking code (Michael Dyck) def compute_reachable(): ''' Find each symbol that can be reached from the start symbol. Print a warning for any nonterminals that can't be reached. (Unused terminals have already had their warning.) ''' Reachable = { } for s in Terminals.keys() + Nonterminals.keys(): Reachable[s] = 0 mark_reachable_from( Productions[0].prod[0], Reachable ) for s in Nonterminals.keys(): if not Reachable[s]: sys.stderr.write("yacc: Symbol '%s' is unreachable.\n" % s) def mark_reachable_from(s, Reachable): ''' Mark all symbols that are reachable from symbol s. ''' if Reachable[s]: # We've already reached symbol s. return Reachable[s] = 1 for p in Prodnames.get(s,[]): for r in p.prod: mark_reachable_from(r, Reachable) # ----------------------------------------------------------------------------- # compute_terminates() # # This function looks at the various parsing rules and tries to detect # infinite recursion cycles (grammar rules where there is no possible way # to derive a string of only terminals). # ----------------------------------------------------------------------------- def compute_terminates(): ''' Raise an error for any symbols that don't terminate. ''' Terminates = {} # Terminals: for t in Terminals.keys(): Terminates[t] = 1 Terminates['$'] = 1 # Nonterminals: # Initialize to false: for n in Nonterminals.keys(): Terminates[n] = 0 # Then propagate termination until no change: while 1: some_change = 0 for (n,pl) in Prodnames.items(): # Nonterminal n terminates iff any of its productions terminates. for p in pl: # Production p terminates iff all of its rhs symbols terminate. for s in p.prod: if not Terminates[s]: # The symbol s does not terminate, # so production p does not terminate. p_terminates = 0 break else: # didn't break from the loop, # so every symbol s terminates # so production p terminates. p_terminates = 1 if p_terminates: # symbol n terminates! if not Terminates[n]: Terminates[n] = 1 some_change = 1 # Don't need to consider any more productions for this n. break if not some_change: break some_error = 0 for (s,terminates) in Terminates.items(): if not terminates: if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': # s is used-but-not-defined, and we've already warned of that, # so it would be overkill to say that it's also non-terminating. pass else: sys.stderr.write("yacc: Infinite recursion detected for symbol '%s'.\n" % s) some_error = 1 return some_error # ----------------------------------------------------------------------------- # verify_productions() # # This function examines all of the supplied rules to see if they seem valid. # ----------------------------------------------------------------------------- def verify_productions(cycle_check=1): error = 0 for p in Productions: if not p: continue for s in p.prod: if not Prodnames.has_key(s) and not Terminals.has_key(s) and s != 'error': sys.stderr.write("%s:%d: Symbol '%s' used, but not defined as a token or a rule.\n" % (p.file,p.line,s)) error = 1 continue unused_tok = 0 # Now verify all of the tokens if yaccdebug: _vf.write("Unused terminals:\n\n") for s,v in Terminals.items(): if s != 'error' and not v: sys.stderr.write("yacc: Warning. 
Token '%s' defined, but not used.\n" % s) if yaccdebug: _vf.write(" %s\n"% s) unused_tok += 1 # Print out all of the productions if yaccdebug: _vf.write("\nGrammar\n\n") for i in range(1,len(Productions)): _vf.write("Rule %-5d %s\n" % (i, Productions[i])) unused_prod = 0 # Verify the use of all productions for s,v in Nonterminals.items(): if not v: p = Prodnames[s][0] sys.stderr.write("%s:%d: Warning. Rule '%s' defined, but not used.\n" % (p.file,p.line, s)) unused_prod += 1 if unused_tok == 1: sys.stderr.write("yacc: Warning. There is 1 unused token.\n") if unused_tok > 1: sys.stderr.write("yacc: Warning. There are %d unused tokens.\n" % unused_tok) if unused_prod == 1: sys.stderr.write("yacc: Warning. There is 1 unused rule.\n") if unused_prod > 1: sys.stderr.write("yacc: Warning. There are %d unused rules.\n" % unused_prod) if yaccdebug: _vf.write("\nTerminals, with rules where they appear\n\n") ks = Terminals.keys() ks.sort() for k in ks: _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Terminals[k]]))) _vf.write("\nNonterminals, with rules where they appear\n\n") ks = Nonterminals.keys() ks.sort() for k in ks: _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Nonterminals[k]]))) if (cycle_check): compute_reachable() error += compute_terminates() # error += check_cycles() return error # ----------------------------------------------------------------------------- # build_lritems() # # This function walks the list of productions and builds a complete set of the # LR items. The LR items are stored in two ways: First, they are uniquely # numbered and placed in the list _lritems. Second, a linked list of LR items # is built for each production. For example: # # E -> E PLUS E # # Creates the list # # [E -> . E PLUS E, E -> E . PLUS E, E -> E PLUS . E, E -> E PLUS E . ] # ----------------------------------------------------------------------------- def build_lritems(): for p in Productions: lastlri = p lri = p.lr_item(0) i = 0 while 1: lri = p.lr_item(i) lastlri.lr_next = lri if not lri: break lri.lr_num = len(LRitems) LRitems.append(lri) lastlri = lri i += 1 # In order for the rest of the parser generator to work, we need to # guarantee that no more lritems are generated. Therefore, we nuke # the p.lr_item method. (Only used in debugging) # Production.lr_item = None # ----------------------------------------------------------------------------- # add_precedence() # # Given a list of precedence rules, add to the precedence table. # ----------------------------------------------------------------------------- def add_precedence(plist): plevel = 0 error = 0 for p in plist: plevel += 1 try: prec = p[0] terms = p[1:] if prec != 'left' and prec != 'right' and prec != 'nonassoc': sys.stderr.write("yacc: Invalid precedence '%s'\n" % prec) return -1 for t in terms: if Precedence.has_key(t): sys.stderr.write("yacc: Precedence already specified for terminal '%s'\n" % t) error += 1 continue Precedence[t] = (prec,plevel) except: sys.stderr.write("yacc: Invalid precedence table.\n") error += 1 return error # ----------------------------------------------------------------------------- # augment_grammar() # # Compute the augmented grammar. This is just a rule S' -> start where start # is the starting symbol. 
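#
# For example, if the user grammar's start symbol is 'expression', augmentation simply
# prepends rule 0, S' -> expression, so the table constructors below can detect
# acceptance when the parser reduces by this distinguished rule.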
# ----------------------------------------------------------------------------- def augment_grammar(start=None): if not start: start = Productions[1].name Productions[0] = Production(name="S'",prod=[start],number=0,len=1,prec=('right',0),func=None) Productions[0].usyms = [ start ] Nonterminals[start].append(0) # ------------------------------------------------------------------------- # first() # # Compute the value of FIRST1(beta) where beta is a tuple of symbols. # # During execution of compute_first1, the result may be incomplete. # Afterward (e.g., when called from compute_follow()), it will be complete. # ------------------------------------------------------------------------- def first(beta): # We are computing First(x1,x2,x3,...,xn) result = [ ] for x in beta: x_produces_empty = 0 # Add all the non- symbols of First[x] to the result. for f in First[x]: if f == '': x_produces_empty = 1 else: if f not in result: result.append(f) if x_produces_empty: # We have to consider the next x in beta, # i.e. stay in the loop. pass else: # We don't have to consider any further symbols in beta. break else: # There was no 'break' from the loop, # so x_produces_empty was true for all x in beta, # so beta produces empty as well. result.append('') return result # FOLLOW(x) # Given a non-terminal. This function computes the set of all symbols # that might follow it. Dragon book, p. 189. def compute_follow(start=None): # Add '$' to the follow list of the start symbol for k in Nonterminals.keys(): Follow[k] = [ ] if not start: start = Productions[1].name Follow[start] = [ '$' ] while 1: didadd = 0 for p in Productions[1:]: # Here is the production set for i in range(len(p.prod)): B = p.prod[i] if Nonterminals.has_key(B): # Okay. We got a non-terminal in a production fst = first(p.prod[i+1:]) hasempty = 0 for f in fst: if f != '' and f not in Follow[B]: Follow[B].append(f) didadd = 1 if f == '': hasempty = 1 if hasempty or i == (len(p.prod)-1): # Add elements of follow(a) to follow(b) for f in Follow[p.name]: if f not in Follow[B]: Follow[B].append(f) didadd = 1 if not didadd: break if 0 and yaccdebug: _vf.write('\nFollow:\n') for k in Nonterminals.keys(): _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in Follow[k]]))) # ------------------------------------------------------------------------- # compute_first1() # # Compute the value of FIRST1(X) for all symbols # ------------------------------------------------------------------------- def compute_first1(): # Terminals: for t in Terminals.keys(): First[t] = [t] First['$'] = ['$'] First['#'] = ['#'] # what's this for? # Nonterminals: # Initialize to the empty set: for n in Nonterminals.keys(): First[n] = [] # Then propagate symbols until no change: while 1: some_change = 0 for n in Nonterminals.keys(): for p in Prodnames[n]: for f in first(p.prod): if f not in First[n]: First[n].append( f ) some_change = 1 if not some_change: break if 0 and yaccdebug: _vf.write('\nFirst:\n') for k in Nonterminals.keys(): _vf.write("%-20s : %s\n" % (k, " ".join([str(s) for s in First[k]]))) # ----------------------------------------------------------------------------- # === SLR Generation === # # The following functions are used to construct SLR (Simple LR) parsing tables # as described on p.221-229 of the dragon book. 
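#
# A small worked illustration (not from the original comments) of the FIRST/FOLLOW
# machinery defined above, as it feeds into the SLR construction: for the toy grammar
#
#     E : E PLUS T
#       | T
#     T : NUMBER
#
# compute_first1() gives FIRST(E) = FIRST(T) = {NUMBER}, and compute_follow() gives
# FOLLOW(E) = FOLLOW(T) = {PLUS, '$'}. slr_parse_table() below places a reduce action
# for a completed item only on the terminals in FOLLOW of its left-hand side.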
# ----------------------------------------------------------------------------- # Global variables for the LR parsing engine def lr_init_vars(): global _lr_action, _lr_goto, _lr_method global _lr_goto_cache _lr_action = { } # Action table _lr_goto = { } # Goto table _lr_method = "Unknown" # LR method used _lr_goto_cache = { } # Compute the LR(0) closure operation on I, where I is a set of LR(0) items. # prodlist is a list of productions. _add_count = 0 # Counter used to detect cycles def lr0_closure(I): global _add_count _add_count += 1 prodlist = Productions # Add everything in I to J J = I[:] didadd = 1 while didadd: didadd = 0 for j in J: for x in j.lrafter: if x.lr0_added == _add_count: continue # Add B --> .G to J J.append(x.lr_next) x.lr0_added = _add_count didadd = 1 return J # Compute the LR(0) goto function goto(I,X) where I is a set # of LR(0) items and X is a grammar symbol. This function is written # in a way that guarantees uniqueness of the generated goto sets # (i.e. the same goto set will never be returned as two different Python # objects). With uniqueness, we can later do fast set comparisons using # id(obj) instead of element-wise comparison. def lr0_goto(I,x): # First we look for a previously cached entry g = _lr_goto_cache.get((id(I),x),None) if g: return g # Now we generate the goto set in a way that guarantees uniqueness # of the result s = _lr_goto_cache.get(x,None) if not s: s = { } _lr_goto_cache[x] = s gs = [ ] for p in I: n = p.lr_next if n and n.lrbefore == x: s1 = s.get(id(n),None) if not s1: s1 = { } s[id(n)] = s1 gs.append(n) s = s1 g = s.get('$',None) if not g: if gs: g = lr0_closure(gs) s['$'] = g else: s['$'] = gs _lr_goto_cache[(id(I),x)] = g return g # Added for LALR(1) # Given a setnumber of an lr0 state and a symbol return the setnumber of the goto state def lr0_goto_setnumber(I_setnumber, x): global Canonical global GotoSetNum if GotoSetNum.has_key((I_setnumber, x)): setnumber = GotoSetNum[(I_setnumber, x)] else: gset = lr0_goto(Canonical[I_setnumber], x) if not gset: return -1 else: gsetlen = len(gset) for i in xrange(len(gset[0].setnumbers)): inall = 1 for item in gset: if not item.setnumbers[i]: inall = 0 break if inall and len(Canonical[i]) == gsetlen: setnumber = i break # Note: DB. I added this to improve performance. # Not sure if this breaks the algorithm (it doesn't appear to). GotoSetNum[(I_setnumber, x)] = setnumber return setnumber # Compute the kernel of a set of LR(0) items def lr0_kernel(I): KI = [ ] for p in I: if p.name == "S'" or p.lr_index > 0 or p.len == 0: KI.append(p) return KI _lr0_cidhash = { } # Compute the LR(0) sets of item function def lr0_items(): C = [ lr0_closure([Productions[0].lr_next]) ] i = 0 for I in C: _lr0_cidhash[id(I)] = i i += 1 # Loop over the items in C and each grammar symbols i = 0 while i < len(C): I = C[i] i += 1 # Collect all of the symbols that could possibly be in the goto(I,X) sets asyms = { } for ii in I: for s in ii.usyms: asyms[s] = None for x in asyms.keys(): g = lr0_goto(I,x) if not g: continue if _lr0_cidhash.has_key(id(g)): continue _lr0_cidhash[id(g)] = len(C) C.append(g) return C # ----------------------------------------------------------------------------- # slr_parse_table() # # This function constructs an SLR table. 
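#
# The table encoding matches what Parser.parse() expects: action[state, terminal] > 0
# means shift and go to that state, < 0 means reduce by production number -value,
# 0 means accept, and a missing entry is a syntax error; goto[state, nonterminal]
# names the state to enter after such a reduction.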
# ----------------------------------------------------------------------------- def slr_parse_table(): global _lr_method goto = _lr_goto # Goto array action = _lr_action # Action array actionp = { } # Action production array (temporary) _lr_method = "SLR" n_srconflict = 0 n_rrconflict = 0 if yaccdebug: sys.stderr.write("yacc: Generating SLR parsing table...\n") _vf.write("\n\nParsing method: SLR\n\n") # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items # This determines the number of states C = lr0_items() # Build the parser table, state by state st = 0 for I in C: # Loop over each production in I actlist = [ ] # List of actions if yaccdebug: _vf.write("\nstate %d\n\n" % st) for p in I: _vf.write(" (%d) %s\n" % (p.number, str(p))) _vf.write("\n") for p in I: try: if p.prod[-1] == ".": if p.name == "S'": # Start symbol. Accept! action[st,"$"] = 0 actionp[st,"$"] = p else: # We are at the end of a production. Reduce! for a in Follow[p.name]: actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) r = action.get((st,a),None) if r is not None: # Whoa. Have a shift/reduce or reduce/reduce conflict if r > 0: # Need to decide on shift or reduce here # By default we favor shifting. Need to add # some precedence rules here. sprec,slevel = Productions[actionp[st,a].number].prec rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): # We really need to reduce here. action[st,a] = -p.number actionp[st,a] = p if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) n_srconflict += 1 elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the shift if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) n_srconflict +=1 elif r < 0: # Reduce/reduce conflict. In this case, we favor the rule # that was defined first in the grammar file oldp = Productions[-r] pp = Productions[p.number] if oldp.line > pp.line: action[st,a] = -p.number actionp[st,a] = p # sys.stderr.write("Reduce/reduce conflict in state %d\n" % st) n_rrconflict += 1 _vfc.write("reduce/reduce conflict in state %d resolved using rule %d (%s).\n" % (st, actionp[st,a].number, actionp[st,a])) _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d (%s).\n" % (a,actionp[st,a].number, actionp[st,a])) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = -p.number actionp[st,a] = p else: i = p.lr_index a = p.prod[i+1] # Get symbol right after the "." if Terminals.has_key(a): g = lr0_goto(I,a) j = _lr0_cidhash.get(id(g),-1) if j >= 0: # We are in a shift state actlist.append((a,p,"shift and go to state %d" % j)) r = action.get((st,a),None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict if r > 0: if r != j: sys.stderr.write("Shift/shift conflict in state %d\n" % st) elif r < 0: # Do a precedence check. # - if precedence of reduce rule is higher, we reduce. # - if precedence of reduce is same and left assoc, we reduce. # - otherwise we shift rprec,rlevel = Productions[actionp[st,a].number].prec sprec,slevel = Precedence.get(a,('right',0)) if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): # We decide to shift here... 
highest precedence to shift action[st,a] = j actionp[st,a] = p if not slevel and not rlevel: n_srconflict += 1 _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the reduce if not slevel and not rlevel: n_srconflict +=1 _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = j actionp[st,a] = p except StandardError,e: raise YaccError, "Hosed in slr_parse_table", e # Print the actions associated with each terminal if yaccdebug: _actprint = { } for a,p,m in actlist: if action.has_key((st,a)): if p is actionp[st,a]: _vf.write(" %-15s %s\n" % (a,m)) _actprint[(a,m)] = 1 _vf.write("\n") for a,p,m in actlist: if action.has_key((st,a)): if p is not actionp[st,a]: if not _actprint.has_key((a,m)): _vf.write(" ! %-15s [ %s ]\n" % (a,m)) _actprint[(a,m)] = 1 # Construct the goto table for this state if yaccdebug: _vf.write("\n") nkeys = { } for ii in I: for s in ii.usyms: if Nonterminals.has_key(s): nkeys[s] = None for n in nkeys.keys(): g = lr0_goto(I,n) j = _lr0_cidhash.get(id(g),-1) if j >= 0: goto[st,n] = j if yaccdebug: _vf.write(" %-30s shift and go to state %d\n" % (n,j)) st += 1 if yaccdebug: if n_srconflict == 1: sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) if n_srconflict > 1: sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) if n_rrconflict == 1: sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) if n_rrconflict > 1: sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) # ----------------------------------------------------------------------------- # ==== LALR(1) Parsing ==== # FINISHED! 5/20/2003 by Elias Ioup # ----------------------------------------------------------------------------- # Compute the lr1_closure of a set I. I is a list of productions and setnumber # is the state that you want the lr items that are made from the to come from. 
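# (In other words: lr1_closure(I, setnumber) returns the closure of the item set I,
# recording in each item's lookaheads[setnumber] list the lookahead terminals obtained
# via first() relative to LR(0) state number setnumber; add_lookaheads() further below
# propagates these lookaheads between kernel items.)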
_lr1_add_count = 0 def lr1_closure(I, setnumber = 0): global _add_count global Nonterminals _add_count += 1 prodlist = Productions # Add everything in I to J J = I[:] Jhash = { } for j in J: Jhash[id(j)] = 1 didadd = 1 while didadd: didadd = 0 for j in J: jprod = j.prod jlr_index = j.lr_index jprodslice = jprod[jlr_index+2:] if jlr_index < len(jprod) - 1 and Nonterminals.has_key(jprod[jlr_index+1]): first_syms = [] if j.lk_added.setdefault(setnumber, 0) < len(j.lookaheads[setnumber]): for a in j.lookaheads[setnumber][j.lk_added[setnumber]:]: # find b in FIRST(Xa) if j = [A->a.BX,a] temp_first_syms = first(jprodslice + (a,)) for x in temp_first_syms: if x not in first_syms: first_syms.append(x) j.lk_added[setnumber] = len(j.lookaheads[setnumber]) for x in j.lrafter: # Add B --> .G to J if x.lr_next.lookaheads.has_key(setnumber): _xlook = x.lr_next.lookaheads[setnumber] for s in first_syms: if s not in _xlook: _xlook.append(s) didadd = 1 else: x.lr_next.lookaheads[setnumber] = first_syms didadd = 1 nid = id(x.lr_next) if not Jhash.has_key(nid): J.append(x.lr_next) Jhash[nid] = 1 return J def add_lookaheads(K): spontaneous = [] propogate = [] for setnumber in range(len(K)): for kitem in K[setnumber]: kitem.lookaheads[setnumber] = ['#'] J = lr1_closure([kitem], setnumber) # find the lookaheads that are spontaneously created from closures # and the propogations of lookaheads between lr items for item in J: if item.lr_index < len(item.prod)-1: for lookahead in item.lookaheads[setnumber]: goto_setnumber = lr0_goto_setnumber(setnumber, item.prod[item.lr_index+1]) next = None if lookahead != '#': if item.lr_next in K[goto_setnumber]: next = item.lr_next if next: spontaneous.append((next, (lookahead, goto_setnumber))) else: if goto_setnumber > -1: if item.lr_next in K[goto_setnumber]: next = item.lr_next if next: propogate.append(((kitem, setnumber), (next, goto_setnumber))) for x in K[setnumber]: x.lookaheads[setnumber] = [] for x in spontaneous: if x[1][0] not in x[0].lookaheads[x[1][1]]: x[0].lookaheads[x[1][1]].append(x[1][0]) K[0][0].lookaheads[0] = ['$'] pitems = {} for x in propogate: if pitems.has_key(x[0]): pitems[x[0]].append(x[1]) else: pitems[x[0]] = [] pitems[x[0]].append(x[1]) # propogate the lookaheads that were spontaneously generated # based on the propogations produced above stop = 0 while not stop: stop = 1 kindex = 0 for set in K: for item in set: pkey = (item, kindex) if pitems.has_key(pkey): for propogation in pitems[pkey]: gitem = propogation[0] gsetnumber = propogation[1] glookaheads = gitem.lookaheads[gsetnumber] for lookahead in item.lookaheads[kindex]: if lookahead not in glookaheads: glookaheads.append(lookahead) stop = 0 kindex += 1 def ReduceNonterminals(): global Nonterminals global TReductions global NTReductions for nt in Nonterminals.keys(): TReductions[nt] = [] NTReductions[nt] = [] for nt in Nonterminals.keys(): terms = ReduceToTerminals(nt) TReductions[nt].extend(terms) if not NTReductions.has_key(nt): ReduceToNonterminals(nt) def ReduceToTerminals(nt): global Prodnames global Terminals reducedterminals = [] for p in Prodnames[nt]: if len(p.prod) > 0: if Terminals.has_key(p.prod[0]): if p.prod[0] not in reducedterminals: reducedterminals.append(p.prod[0]) else: if p.prod[0] != nt: terms = ReduceToTerminals(p.prod[0]) for t in terms: if t not in reducedterminals: reducedterminals.append(t) return reducedterminals def ReduceToNonterminals(nt): global Prodnames global Nonterminals global NTReductions reducednonterminals = [] for p in Prodnames[nt]: if len(p.prod) > 
0: if Nonterminals.has_key(p.prod[0]): if p.prod[0] not in reducednonterminals: reducednonterminals.append(p.prod[0]) if p.prod[0] != nt: if not NTReductions.has_key(p.prod[0]): ReduceToNonterminals(p.prod[0]) nterms = NTReductions[p.prod[0]] for nt in nterms: if nt not in reducednonterminals: reducednonterminals.append(nt) NTReductions[nt] = reducednonterminals # ----------------------------------------------------------------------------- # lalr_parse_table() # # This function constructs an LALR table. # ----------------------------------------------------------------------------- def lalr_parse_table(): global _lr_method goto = _lr_goto # Goto array action = _lr_action # Action array actionp = { } # Action production array (temporary) goto_cache = _lr_goto_cache cid_hash = _lr0_cidhash _lr_method = "LALR" n_srconflict = 0 n_rrconflict = 0 if yaccdebug: sys.stderr.write("yacc: Generating LALR(1) parsing table...\n") _vf.write("\n\nParsing method: LALR(1)\n\n") # Step 1: Construct C = { I0, I1, ... IN}, collection of LR(0) items # This determines the number of states C = lr0_items() global Canonical Canonical = C ### # Create the kernel states. ### K = [] setC = [0]*len(C) for x in C: K.append(lr0_kernel(x)) for y in x: y.setnumbers = setC[:] _cindex = 0 for x in C: for y in x: y.lookaheads[_cindex] = [] y.setnumbers[_cindex] = 1 _cindex = _cindex + 1 ### # Add lookaheads to the lr items ### add_lookaheads(K) ### # Do the reductions for parsing first and keep them in globals ### ReduceNonterminals() global TReductions global NTReductions global Prodempty EmptyAncestors = {} for y in Prodempty.keys(): EmptyAncestors[y] = [] for x in NTReductions.items(): for y in x[1]: if Prodempty.has_key(y): EmptyAncestors[y].append(x[0]) # Build the parser table, state by state st = 0 for I in C: # Loop over each production in I actlist = [ ] # List of actions acthash = { } idI = id(I) if yaccdebug: _vf.write("\nstate %d\n\n" % st) for p in I: _vf.write(" (%d) %s\n" % (p.number, str(p))) _vf.write("\n") global First for p in I: try: if p.prod[-1] == ".": if p.name == "S'": # Start symbol. Accept! action[st,"$"] = 0 actionp[st,"$"] = p elif len(p.prod) == 0: ancestors = EmptyAncestors[p.name] for i in ancestors: for s in K: if i in s: input_list = [] plist = Productions[i.name] for x in plist: if len(x.prod) > 0 and x.prod[0] == p.name: n = p.prod[1:] d = x.prod[lr_index+2:] for l in x.lookaheads.items(): flist = First[tuple(n+d+[l])] for f in flist: if f not in input_list and f in p.lookaheads[st]: input_list.append(f) # We are at the end of a production. Reduce! #print "input_list: %s" % input_list #print "Follow[p.name]: %s" % Follow[p.name] for a in input_list: actlist.append((a,p,"reduce using rule %d (%s) " % (p.number,p))) r = action.get((st,a),None) if r is not None: # Whoa. Have a shift/reduce or reduce/reduce conflict if r > 0: # Need to decide on shift or reduce here # By default we favor shifting. Need to add # some precedence rules here. sprec,slevel = Productions[actionp[st,a].number].prec rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): # We really need to reduce here. action[st,a] = -p.number actionp[st,a] = p if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) n_srconflict += 1 elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. 
Guess we'll keep the shift if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) n_srconflict +=1 elif r < 0: # Reduce/reduce conflict. In this case, we favor the rule # that was defined first in the grammar file oldp = Productions[-r] pp = Productions[p.number] if oldp.line > pp.line: action[st,a] = -p.number actionp[st,a] = p # print "Reduce/reduce conflict in state %d" % st n_rrconflict += 1 _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = -p.number actionp[st,a] = p break # break out of the for s in K loop because we only want to make # sure that a production is in the Kernel else: # We are at the end of a production. Reduce! for a in p.lookaheads[st]: actlist.append((a,p,"reduce using rule %d (%s)" % (p.number,p))) r = action.get((st,a),None) if r is not None: # Whoa. Have a shift/reduce or reduce/reduce conflict if r > 0: # Need to decide on shift or reduce here # By default we favor shifting. Need to add # some precedence rules here. sprec,slevel = Productions[actionp[st,a].number].prec rprec,rlevel = Precedence.get(a,('right',0)) if (slevel < rlevel) or ((slevel == rlevel) and (rprec == 'left')): # We really need to reduce here. action[st,a] = -p.number actionp[st,a] = p if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) n_srconflict += 1 elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the shift if not slevel and not rlevel: _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) n_srconflict +=1 elif r < 0: # Reduce/reduce conflict. In this case, we favor the rule # that was defined first in the grammar file oldp = Productions[-r] pp = Productions[p.number] if oldp.line > pp.line: action[st,a] = -p.number actionp[st,a] = p # print "Reduce/reduce conflict in state %d" % st n_rrconflict += 1 _vfc.write("reduce/reduce conflict in state %d resolved using rule %d.\n" % (st, actionp[st,a].number)) _vf.write(" ! reduce/reduce conflict for %s resolved using rule %d.\n" % (a,actionp[st,a].number)) else: print "Unknown conflict in state %d" % st else: action[st,a] = -p.number actionp[st,a] = p else: i = p.lr_index a = p.prod[i+1] # Get symbol right after the "." if Terminals.has_key(a): g = goto_cache[(idI,a)] j = cid_hash.get(id(g),-1) if j >= 0: # We are in a shift state _k = (a,j) if not acthash.has_key(_k): actlist.append((a,p,"shift and go to state %d" % j)) acthash[_k] = 1 r = action.get((st,a),None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict if r > 0: if r != j: sys.stderr.write("Shift/shift conflict in state %d\n" % st) elif r < 0: # Do a precedence check. # - if precedence of reduce rule is higher, we reduce. # - if precedence of reduce is same and left assoc, we reduce. # - otherwise we shift rprec,rlevel = Productions[actionp[st,a].number].prec sprec,slevel = Precedence.get(a,('right',0)) if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): # We decide to shift here... 
highest precedence to shift action[st,a] = j actionp[st,a] = p if not slevel and not rlevel: n_srconflict += 1 _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the reduce if not slevel and not rlevel: n_srconflict +=1 _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = j actionp[st,a] = p else: nonterminal = a term_list = TReductions[nonterminal] # DB: This loop gets executed a lot. Try to optimize for a in term_list: g = goto_cache[(idI,a)] j = cid_hash[id(g)] if j >= 0: # We are in a shift state # Don't put repeated shift actions on action list (performance hack) _k = (a,j) if not acthash.has_key(_k): actlist.append((a,p,"shift and go to state "+str(j))) acthash[_k] = 1 r = action.get((st,a),None) if r is not None: # Whoa have a shift/reduce or shift/shift conflict if r > 0: if r != j: sys.stderr.write("Shift/shift conflict in state %d\n" % st) continue elif r < 0: # Do a precedence check. # - if precedence of reduce rule is higher, we reduce. # - if precedence of reduce is same and left assoc, we reduce. # - otherwise we shift rprec,rlevel = Productions[actionp[st,a].number].prec sprec,slevel = Precedence.get(a,('right',0)) if (slevel > rlevel) or ((slevel == rlevel) and (rprec != 'left')): # We decide to shift here... highest precedence to shift action[st,a] = j actionp[st,a] = p if not slevel and not rlevel: n_srconflict += 1 _vfc.write("shift/reduce conflict in state %d resolved as shift.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as shift.\n" % a) elif (slevel == rlevel) and (rprec == 'nonassoc'): action[st,a] = None else: # Hmmm. Guess we'll keep the reduce if not slevel and not rlevel: n_srconflict +=1 _vfc.write("shift/reduce conflict in state %d resolved as reduce.\n" % st) _vf.write(" ! shift/reduce conflict for %s resolved as reduce.\n" % a) else: sys.stderr.write("Unknown conflict in state %d\n" % st) else: action[st,a] = j actionp[st,a] = p except StandardError,e: raise YaccError, "Hosed in lalr_parse_table", e # Print the actions associated with each terminal if yaccdebug: for a,p,m in actlist: if action.has_key((st,a)): if p is actionp[st,a]: _vf.write(" %-15s %s\n" % (a,m)) _vf.write("\n") for a,p,m in actlist: if action.has_key((st,a)): if p is not actionp[st,a]: _vf.write(" ! 
%-15s [ %s ]\n" % (a,m)) # Construct the goto table for this state nkeys = { } for ii in I: for s in ii.usyms: if Nonterminals.has_key(s): nkeys[s] = None # Construct the goto table for this state for n in nkeys.keys(): g = lr0_goto(I,n) j = cid_hash.get(id(g),-1) if j >= 0: goto[st,n] = j if yaccdebug: _vf.write(" %-30s shift and go to state %d\n" % (n,j)) st += 1 if yaccdebug: if n_srconflict == 1: sys.stderr.write("yacc: %d shift/reduce conflict\n" % n_srconflict) if n_srconflict > 1: sys.stderr.write("yacc: %d shift/reduce conflicts\n" % n_srconflict) if n_rrconflict == 1: sys.stderr.write("yacc: %d reduce/reduce conflict\n" % n_rrconflict) if n_rrconflict > 1: sys.stderr.write("yacc: %d reduce/reduce conflicts\n" % n_rrconflict) # ----------------------------------------------------------------------------- # ==== LR Utility functions ==== # ----------------------------------------------------------------------------- # ----------------------------------------------------------------------------- # _lr_write_tables() # # This function writes the LR parsing tables to a file # ----------------------------------------------------------------------------- def lr_write_tables(modulename=tab_module,outputdir=''): filename = os.path.join(outputdir,modulename) + ".py" try: f = open(filename,"w") f.write(""" # %s # This file is automatically generated. Do not edit. _lr_method = %s _lr_signature = %s """ % (filename, repr(_lr_method), repr(Signature.digest()))) # Change smaller to 0 to go back to original tables smaller = 1 # Factor out names to try and make smaller if smaller: items = { } for k,v in _lr_action.items(): i = items.get(k[1]) if not i: i = ([],[]) items[k[1]] = i i[0].append(k[0]) i[1].append(v) f.write("\n_lr_action_items = {") for k,v in items.items(): f.write("%r:([" % k) for i in v[0]: f.write("%r," % i) f.write("],[") for i in v[1]: f.write("%r," % i) f.write("]),") f.write("}\n") f.write(""" _lr_action = { } for _k, _v in _lr_action_items.items(): for _x,_y in zip(_v[0],_v[1]): _lr_action[(_x,_k)] = _y del _lr_action_items """) else: f.write("\n_lr_action = { "); for k,v in _lr_action.items(): f.write("(%r,%r):%r," % (k[0],k[1],v)) f.write("}\n"); if smaller: # Factor out names to try and make smaller items = { } for k,v in _lr_goto.items(): i = items.get(k[1]) if not i: i = ([],[]) items[k[1]] = i i[0].append(k[0]) i[1].append(v) f.write("\n_lr_goto_items = {") for k,v in items.items(): f.write("%r:([" % k) for i in v[0]: f.write("%r," % i) f.write("],[") for i in v[1]: f.write("%r," % i) f.write("]),") f.write("}\n") f.write(""" _lr_goto = { } for _k, _v in _lr_goto_items.items(): for _x,_y in zip(_v[0],_v[1]): _lr_goto[(_x,_k)] = _y del _lr_goto_items """) else: f.write("\n_lr_goto = { "); for k,v in _lr_goto.items(): f.write("(%r,%r):%r," % (k[0],k[1],v)) f.write("}\n"); # Write production table f.write("_lr_productions = [\n") for p in Productions: if p: if (p.func): f.write(" (%r,%d,%r,%r,%d),\n" % (p.name, p.len, p.func.__name__,p.file,p.line)) else: f.write(" (%r,%d,None,None,None),\n" % (p.name, p.len)) else: f.write(" None,\n") f.write("]\n") f.close() except IOError,e: print "Unable to create '%s'" % filename print e return def lr_read_tables(module=tab_module,optimize=0): global _lr_action, _lr_goto, _lr_productions, _lr_method try: exec "import %s as parsetab" % module if (optimize) or (Signature.digest() == parsetab._lr_signature): _lr_action = parsetab._lr_action _lr_goto = parsetab._lr_goto _lr_productions = parsetab._lr_productions _lr_method = 
parsetab._lr_method return 1 else: return 0 except (ImportError,AttributeError): return 0 # ----------------------------------------------------------------------------- # yacc(module) # # Build the parser module # ----------------------------------------------------------------------------- def yacc(method=default_lr, debug=yaccdebug, module=None, tabmodule=tab_module, start=None, check_recursion=1, optimize=0,write_tables=1,debugfile=debug_file,outputdir=''): global yaccdebug yaccdebug = debug initialize_vars() files = { } error = 0 # Add starting symbol to signature if start: Signature.update(start) # Add parsing method to signature Signature.update(method) # If a "module" parameter was supplied, extract its dictionary. # Note: a module may in fact be an instance as well. if module: # User supplied a module object. if isinstance(module, types.ModuleType): ldict = module.__dict__ elif isinstance(module, types.InstanceType): _items = [(k,getattr(module,k)) for k in dir(module)] ldict = { } for i in _items: ldict[i[0]] = i[1] else: raise ValueError,"Expected a module" else: # No module given. We might be able to get information from the caller. # Throw an exception and unwind the traceback to get the globals try: raise RuntimeError except RuntimeError: e,b,t = sys.exc_info() f = t.tb_frame f = f.f_back # Walk out to our calling function ldict = f.f_globals # Grab its globals dictionary # If running in optimized mode. We're going to if (optimize and lr_read_tables(tabmodule,1)): # Read parse table del Productions[:] for p in _lr_productions: if not p: Productions.append(None) else: m = MiniProduction() m.name = p[0] m.len = p[1] m.file = p[3] m.line = p[4] if p[2]: m.func = ldict[p[2]] Productions.append(m) else: # Get the tokens map if (module and isinstance(module,types.InstanceType)): tokens = getattr(module,"tokens",None) else: tokens = ldict.get("tokens",None) if not tokens: raise YaccError,"module does not define a list 'tokens'" if not (isinstance(tokens,types.ListType) or isinstance(tokens,types.TupleType)): raise YaccError,"tokens must be a list or tuple." # Check to see if a requires dictionary is defined. requires = ldict.get("require",None) if requires: if not (isinstance(requires,types.DictType)): raise YaccError,"require must be a dictionary." for r,v in requires.items(): try: if not (isinstance(v,types.ListType)): raise TypeError v1 = [x.split(".") for x in v] Requires[r] = v1 except StandardError: print "Invalid specification for rule '%s' in require. Expected a list of strings" % r # Build the dictionary of terminals. We a record a 0 in the # dictionary to track whether or not a terminal is actually # used in the grammar if 'error' in tokens: print "yacc: Illegal token 'error'. Is a reserved word." raise YaccError,"Illegal token name" for n in tokens: if Terminals.has_key(n): print "yacc: Warning. Token '%s' multiply defined." % n Terminals[n] = [ ] Terminals['error'] = [ ] # Get the precedence map (if any) prec = ldict.get("precedence",None) if prec: if not (isinstance(prec,types.ListType) or isinstance(prec,types.TupleType)): raise YaccError,"precedence must be a list or tuple." 
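# Added note (not in the original source): the user-level `precedence`
# declaration validated above is what ultimately drives the shift/reduce
# resolution in slr_parse_table() and lalr_parse_table().  add_precedence()
# (defined earlier in this file) turns each ('left'|'right'|'nonassoc', tok,
# ...) tuple into Precedence[tok] = (assoc, level), with entries later in the
# list binding more tightly.  A hypothetical declaration and the resulting
# entries (shape only):
#
#     precedence = (
#         ('left',  'PLUS', 'MINUS'),      # level 1
#         ('left',  'TIMES', 'DIVIDE'),    # level 2
#         ('right', 'UMINUS'),             # level 3
#     )
#     # => Precedence['TIMES'] == ('left', 2); tokens not listed default to
#     #    ('right', 0), as set in the loop that follows.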
add_precedence(prec) Signature.update(repr(prec)) for n in tokens: if not Precedence.has_key(n): Precedence[n] = ('right',0) # Default, right associative, 0 precedence # Look for error handler ef = ldict.get('p_error',None) if ef: if isinstance(ef,types.FunctionType): ismethod = 0 elif isinstance(ef, types.MethodType): ismethod = 1 else: raise YaccError,"'p_error' defined, but is not a function or method." eline = ef.func_code.co_firstlineno efile = ef.func_code.co_filename files[efile] = None if (ef.func_code.co_argcount != 1+ismethod): raise YaccError,"%s:%d: p_error() requires 1 argument." % (efile,eline) global Errorfunc Errorfunc = ef else: print "yacc: Warning. no p_error() function is defined." # Get the list of built-in functions with p_ prefix symbols = [ldict[f] for f in ldict.keys() if (type(ldict[f]) in (types.FunctionType, types.MethodType) and ldict[f].__name__[:2] == 'p_' and ldict[f].__name__ != 'p_error')] # Check for non-empty symbols if len(symbols) == 0: raise YaccError,"no rules of the form p_rulename are defined." # Sort the symbols by line number symbols.sort(lambda x,y: cmp(x.func_code.co_firstlineno,y.func_code.co_firstlineno)) # Add all of the symbols to the grammar for f in symbols: if (add_function(f)) < 0: error += 1 else: files[f.func_code.co_filename] = None # Make a signature of the docstrings for f in symbols: if f.__doc__: Signature.update(f.__doc__) lr_init_vars() if error: raise YaccError,"Unable to construct parser." if not lr_read_tables(tabmodule): # Validate files for filename in files.keys(): if not validate_file(filename): error = 1 # Validate dictionary validate_dict(ldict) if start and not Prodnames.has_key(start): raise YaccError,"Bad starting symbol '%s'" % start augment_grammar(start) error = verify_productions(cycle_check=check_recursion) otherfunc = [ldict[f] for f in ldict.keys() if (type(f) in (types.FunctionType,types.MethodType) and ldict[f].__name__[:2] != 'p_')] if error: raise YaccError,"Unable to construct parser." build_lritems() compute_first1() compute_follow(start) if method == 'SLR': slr_parse_table() elif method == 'LALR': lalr_parse_table() else: raise YaccError, "Unknown parsing method '%s'" % method if write_tables: lr_write_tables(tabmodule,outputdir) if yaccdebug: try: f = open(os.path.join(outputdir,debugfile),"w") f.write(_vfc.getvalue()) f.write("\n\n") f.write(_vf.getvalue()) f.close() except IOError,e: print "yacc: can't create '%s'" % debugfile,e # Made it here. Create a parser object and set up its internal state. # Set global parse() method to bound method of parser object. p = Parser("xyzzy") p.productions = Productions p.errorfunc = Errorfunc p.action = _lr_action p.goto = _lr_goto p.method = _lr_method p.require = Requires global parse parse = p.parse # Clean up all of the globals we created if (not optimize): yacc_cleanup() return p # yacc_cleanup function. 
Delete all of the global variables # used during table construction def yacc_cleanup(): global _lr_action, _lr_goto, _lr_method, _lr_goto_cache del _lr_action, _lr_goto, _lr_method, _lr_goto_cache global Productions, Prodnames, Prodmap, Terminals global Nonterminals, First, Follow, Precedence, LRitems global Errorfunc, Signature, Requires global Prodempty, TReductions, NTReductions, GotoSetNum, Canonical del Productions, Prodnames, Prodmap, Terminals del Nonterminals, First, Follow, Precedence, LRitems del Errorfunc, Signature, Requires del Prodempty, TReductions, NTReductions, GotoSetNum, Canonical global _vf, _vfc del _vf, _vfc # Stub that raises an error if parsing is attempted without first calling yacc() def parse(*args,**kwargs): raise YaccError, "yacc: No parser built with yacc()" ================================================ FILE: src/kenlm/COPYING ================================================ GNU GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU General Public License is a free, copyleft license for software and other kinds of works. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, the GNU General Public License is intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. We, the Free Software Foundation, use the GNU General Public License for most of our software; it applies also to any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. To protect your rights, we need to prevent others from denying you these rights or asking you to surrender the rights. Therefore, you have certain responsibilities if you distribute copies of the software, or if you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether gratis or for a fee, you must pass on to the recipients the same freedoms that you received. You must make sure that they, too, receive or can get the source code. And you must show them these terms so they know their rights. Developers that use the GNU GPL protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License giving you legal permission to copy, distribute and/or modify it. For the developers' and authors' protection, the GPL clearly explains that there is no warranty for this free software. For both users' and authors' sake, the GPL requires that modified versions be marked as changed, so that their problems will not be attributed erroneously to authors of previous versions. Some devices are designed to deny users access to install or run modified versions of the software inside them, although the manufacturer can do so. This is fundamentally incompatible with the aim of protecting users' freedom to change the software. 
The systematic pattern of such abuse occurs in the area of products for individuals to use, which is precisely where it is most unacceptable. Therefore, we have designed this version of the GPL to prohibit the practice for those products. If such problems arise substantially in other domains, we stand ready to extend this provision to those domains in future versions of the GPL, as needed to protect the freedom of users. Finally, every program is threatened constantly by software patents. States should not allow patents to restrict development and use of software on general-purpose computers, but in those that do, we wish to avoid the special danger that patents applied to a free program could make it effectively proprietary. To prevent this, the GPL assures that patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. 
When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. 
b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. 
If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. 
Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. 
If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. "Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Use with the GNU Affero General Public License. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU Affero General Public License into a single combined work, and to convey the resulting work. 
The terms of this License will continue to apply to the part which is the covered work, but the special requirements of the GNU Affero General Public License, section 13, concerning interaction through a network will apply to the combination as such. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. 
It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. If the program does terminal interaction, make it output a short notice like this when it starts in an interactive mode: Copyright (C) This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate parts of the General Public License. Of course, your program's commands might be different; for a GUI interface, you would use an "about box". You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU GPL, see . The GNU General Public License does not permit incorporating your program into proprietary programs. If your program is a subroutine library, you may consider it more useful to permit linking proprietary applications with the library. If this is what you want to do, use the GNU Lesser General Public License instead of this License. But first, please read . ================================================ FILE: src/kenlm/COPYING.LESSER ================================================ GNU LESSER GENERAL PUBLIC LICENSE Version 3, 29 June 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. This version of the GNU Lesser General Public License incorporates the terms and conditions of version 3 of the GNU General Public License, supplemented by the additional permissions listed below. 0. Additional Definitions. As used herein, "this License" refers to version 3 of the GNU Lesser General Public License, and the "GNU GPL" refers to version 3 of the GNU General Public License. "The Library" refers to a covered work governed by this License, other than an Application or a Combined Work as defined below. An "Application" is any work that makes use of an interface provided by the Library, but which is not otherwise based on the Library. Defining a subclass of a class defined by the Library is deemed a mode of using an interface provided by the Library. A "Combined Work" is a work produced by combining or linking an Application with the Library. The particular version of the Library with which the Combined Work was made is also called the "Linked Version". 
The "Minimal Corresponding Source" for a Combined Work means the Corresponding Source for the Combined Work, excluding any source code for portions of the Combined Work that, considered in isolation, are based on the Application, and not on the Linked Version. The "Corresponding Application Code" for a Combined Work means the object code and/or source code for the Application, including any data and utility programs needed for reproducing the Combined Work from the Application, but excluding the System Libraries of the Combined Work. 1. Exception to Section 3 of the GNU GPL. You may convey a covered work under sections 3 and 4 of this License without being bound by section 3 of the GNU GPL. 2. Conveying Modified Versions. If you modify a copy of the Library, and, in your modifications, a facility refers to a function or data to be supplied by an Application that uses the facility (other than as an argument passed when the facility is invoked), then you may convey a copy of the modified version: a) under this License, provided that you make a good faith effort to ensure that, in the event an Application does not supply the function or data, the facility still operates, and performs whatever part of its purpose remains meaningful, or b) under the GNU GPL, with none of the additional permissions of this License applicable to that copy. 3. Object Code Incorporating Material from Library Header Files. The object code form of an Application may incorporate material from a header file that is part of the Library. You may convey such object code under terms of your choice, provided that, if the incorporated material is not limited to numerical parameters, data structure layouts and accessors, or small macros, inline functions and templates (ten or fewer lines in length), you do both of the following: a) Give prominent notice with each copy of the object code that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the object code with a copy of the GNU GPL and this license document. 4. Combined Works. You may convey a Combined Work under terms of your choice that, taken together, effectively do not restrict modification of the portions of the Library contained in the Combined Work and reverse engineering for debugging such modifications, if you also do each of the following: a) Give prominent notice with each copy of the Combined Work that the Library is used in it and that the Library and its use are covered by this License. b) Accompany the Combined Work with a copy of the GNU GPL and this license document. c) For a Combined Work that displays copyright notices during execution, include the copyright notice for the Library among these notices, as well as a reference directing the user to the copies of the GNU GPL and this license document. d) Do one of the following: 0) Convey the Minimal Corresponding Source under the terms of this License, and the Corresponding Application Code in a form suitable for, and under terms that permit, the user to recombine or relink the Application with a modified version of the Linked Version to produce a modified Combined Work, in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source. 1) Use a suitable shared library mechanism for linking with the Library. 
A suitable mechanism is one that (a) uses at run time a copy of the Library already present on the user's computer system, and (b) will operate properly with a modified version of the Library that is interface-compatible with the Linked Version. e) Provide Installation Information, but only if you would otherwise be required to provide such information under section 6 of the GNU GPL, and only to the extent that such information is necessary to install and execute a modified version of the Combined Work produced by recombining or relinking the Application with a modified version of the Linked Version. (If you use option 4d0, the Installation Information must accompany the Minimal Corresponding Source and Corresponding Application Code. If you use option 4d1, you must provide the Installation Information in the manner specified by section 6 of the GNU GPL for conveying Corresponding Source.) 5. Combined Libraries. You may place library facilities that are a work based on the Library side by side in a single library together with other library facilities that are not Applications and are not covered by this License, and convey such a combined library under terms of your choice, if you do both of the following: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities, conveyed under the terms of this License. b) Give prominent notice with the combined library that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 6. Revised Versions of the GNU Lesser General Public License. The Free Software Foundation may publish revised and/or new versions of the GNU Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library as you received it specifies that a certain numbered version of the GNU Lesser General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that published version or of any later version published by the Free Software Foundation. If the Library as you received it does not specify a version number of the GNU Lesser General Public License, you may choose any version of the GNU Lesser General Public License ever published by the Free Software Foundation. If the Library as you received it specifies that a proxy can decide whether future versions of the GNU Lesser General Public License shall apply, that proxy's public statement of acceptance of any version is permanent authorization for you to choose that version for the Library. ================================================ FILE: src/kenlm/LICENSE ================================================ Most of the code here is licensed under the LGPL. There are exceptions which have their own licenses, listed below. See comments in those files for more details. util/murmur_hash.cc is under the MIT license. util/string_piece.hh and util/string_piece.cc are Google code and contains its own license. For the rest: Avenue code is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
Avenue code is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with Avenue code. If not, see . ================================================ FILE: src/kenlm/README ================================================ Language model inference code by Kenneth Heafield The official website is http://kheafield.com/code/kenlm/. If you're a decoder developer, please download the latest version from there instead of copying from another decoder. Two data structures are supported: probing and trie. Probing is a probing hash table with keys that are 64-bit hashes of n-grams and floats as values. Trie is a fairly standard trie but with bit-level packing so it uses the minimum number of bits to store word indices and pointers. The trie node entries are sorted by word index. Probing is the fastest and uses the most memory. Trie uses the least memory and is a bit slower. With trie, resident memory is 58% of IRST's smallest version and 21% of SRI's compact version. Simultaneously, the trie's CPU use is 81% of IRST's fastest version and 84% of SRI's fast version. KenLM's probing hash table implementation goes even faster at the expense of using more memory. See http://kheafield.com/code/kenlm/benchmark/. Binary format via mmap is supported. Run ./build_binary to make one, then pass the binary file name to the appropriate Model constructor. Currently, it assumes POSIX APIs for errno, strerror_r, open, close, mmap, munmap, ftruncate, fstat, lseek, and read. This is tested on Linux and the non-UNIX Mac OS X. I welcome submissions porting (via #ifdef) to other systems (e.g. Windows) but proudly have no machine on which to test it. A brief note to Mac OS X users: your gcc is too old to recognize the pack pragma. The warning effectively means that, on 64-bit machines, the model will use 16 bytes instead of 12 bytes per n-gram of maximum order (those of lower order are already 16 bytes) in the probing and sorted models. The trie is not impacted by this. FOR DEVELOPERS Copy the code and distribute with your decoder. - It does not depend on Boost or ICU. If you use ICU, define HAVE_ICU in util/have.hh (uncomment the line) to avoid a name conflict. Defining HAVE_BOOST will let you hash StringPiece. - Most people have zlib. If you don't want to depend on that, comment out #define HAVE_ZLIB in util/have.hh. This will disable loading gzipped ARPA files. - Look at compile.sh and reimplement using your build system. - Use either the interface in lm/model.hh or lm/virtual_interface.hh. Interface documentation is in comments of lm/virtual_interface.hh (including for lm/model.hh). - See lm/config.hh for tuning options. - I recommend copying the code and distributing it with your decoder. However, please send improvements to me so that they can be integrated into the package. Also included: A wrapper to SRI with the same interface. The name was Hieu Hoang's idea, not mine. ================================================ FILE: src/kenlm/build_jnilib.sh ================================================ #!/bin/bash # based on compile_query_only.sh echo Compiling source files rm {lm,util}/*.o 2>/dev/null set -e CXXFLAGS="-I.
-O3 -DNDEBUG -DKENLM_MAX_ORDER=6 -fPIC $CXXFLAGS" #Grab all cc files in these directories except those ending in test.cc or main.cc objects="" for i in util/double-conversion/*.cc util/*.cc lm/*.cc; do if [ "${i%test.cc}" == "$i" ] && [ "${i%main.cc}" == "$i" ]; then g++ $CXXFLAGS -c $i -o ${i%.cc}.o objects="$objects ${i%.cc}.o" fi done echo Compiling binaries into bin mkdir -p bin g++ $CXXFLAGS lm/build_binary_main.cc $objects -o bin/build_binary g++ $CXXFLAGS lm/query_main.cc $objects -o bin/query g++ $CXXFLAGS lm/kenlm_max_order_main.cc -o bin/kenlm_max_order echo Compiling JNI library and moving it to openccg/lib g++ $CXXFLAGS jni/wrap.cc -I $JAVA_HOME/include -I $JAVA_HOME/include/linux $objects -shared -Wl,-soname,libken.so -o libken.so -lz -Wno-deprecated -pthread mv libken.so ../../lib/. ================================================ FILE: src/kenlm/clean_query_only.sh ================================================ #!/bin/bash rm -rf {lm,util,util/double-conversion}/*.o bin/{query,kenlm_max_order,build_binary} ================================================ FILE: src/kenlm/jni/wrap.cc ================================================ #include "lm/enumerate_vocab.hh" #include "lm/model.hh" #include "util/murmur_hash.hh" #include #include #include #include #include // Grr. Everybody's compiler is slightly different and I'm trying to not depend on boost. #include // This is needed to compile on OS X Lion / gcc 4.2.1 namespace __gnu_cxx { template<> struct hash { size_t operator()(unsigned long long int __x) const { return __x; } }; } // Verify that jint and lm::ngram::WordIndex are the same size. If this breaks // for you, there's a need to revise probString. namespace { template struct StaticCheck {}; template<> struct StaticCheck { typedef bool StaticAssertionPassed; }; typedef StaticCheck::StaticAssertionPassed FloatSize; // Vocab ids above what the vocabulary knows about are unknown and should // be mapped to that. void MapArray(const std::vector& map, jint *begin, jint *end) { for (jint *i = begin; i < end; ++i) { *i = map[*i]; } } char *PieceCopy(const StringPiece &str) { char *ret = (char*) malloc(str.size() + 1); memcpy(ret, str.data(), str.size()); ret[str.size()] = 0; return ret; } // Rather than handle several different instantiations over JNI, we'll just // do virtual calls C++-side. class VirtualBase { public: virtual ~VirtualBase() { } virtual float Prob(jint *begin, jint *end) const = 0; virtual float ProbString(jint * const begin, jint * const end, jint start) const = 0; virtual uint8_t Order() const = 0; virtual bool RegisterWord(const StringPiece& word, const int wd_id) = 0; protected: VirtualBase() { } private: }; template class VirtualImpl: public VirtualBase { public: VirtualImpl(const char *name, float fake_oov_cost) : m_(name), fake_oov_cost_(fake_oov_cost) { // Insert unknown id mapping. map_.push_back(0); } ~VirtualImpl() { } float Prob(jint * const begin, jint * const end) const { MapArray(map_, begin, end); std::reverse(begin, end - 1); lm::ngram::State ignored; return *(end - 1) ? 
m_.FullScoreForgotState( reinterpret_cast(begin), reinterpret_cast(end - 1), *(end - 1), ignored).prob : fake_oov_cost_; } float ProbString(jint * const begin, jint * const end, jint start) const { MapArray(map_, begin, end); float prob; lm::ngram::State state; if (start == 0) { prob = 0; state = m_.NullContextState(); } else { std::reverse(begin, begin + start); prob = m_.FullScoreForgotState( reinterpret_cast(begin), reinterpret_cast(begin + start), begin[start], state).prob; if (begin[start] == 0) prob = fake_oov_cost_; ++start; } lm::ngram::State state2; for (const jint *i = begin + start;;) { if (i >= end) break; float got = m_.Score(state, *i, state2); prob += *(i++) ? got : fake_oov_cost_; if (i >= end) break; got = m_.Score(state2, *i, state); prob += *(i++) ? got : fake_oov_cost_; } return prob; } uint8_t Order() const { return m_.Order(); } bool RegisterWord(const StringPiece& word, const int wd_id) { if (map_.size() <= wd_id) { map_.resize(wd_id + 1, 0); } bool already_present = false; if (map_[wd_id] != 0) already_present = true; map_[wd_id] = m_.GetVocabulary().Index(word); return already_present; } private: Model m_; float fake_oov_cost_; std::vector map_; }; VirtualBase *ConstructModel(const char *file_name, float fake_oov_cost) { using namespace lm::ngram; ModelType model_type; if (!RecognizeBinary(file_name, model_type)) model_type = HASH_PROBING; switch (model_type) { case HASH_PROBING: return new VirtualImpl(file_name, fake_oov_cost); case TRIE_SORTED: return new VirtualImpl(file_name, fake_oov_cost); case ARRAY_TRIE_SORTED: return new VirtualImpl(file_name, fake_oov_cost); case QUANT_TRIE_SORTED: return new VirtualImpl(file_name, fake_oov_cost); case QUANT_ARRAY_TRIE_SORTED: return new VirtualImpl(file_name, fake_oov_cost); default: UTIL_THROW( lm::FormatLoadException, "Unrecognized file format " << (unsigned) model_type << " in file " << file_name); } } } // namespace extern "C" { JNIEXPORT jlong JNICALL Java_opennlp_ccg_ngrams_kenlm_jni_KenLM_construct( JNIEnv *env, jclass, jstring file_name, jfloat fake_oov_cost) { const char *str = env->GetStringUTFChars(file_name, 0); if (!str) return 0; jlong ret; try { ret = reinterpret_cast(ConstructModel(str, fake_oov_cost)); } catch (std::exception &e) { std::cerr << e.what() << std::endl; abort(); } env->ReleaseStringUTFChars(file_name, str); return ret; } JNIEXPORT void JNICALL Java_opennlp_ccg_ngrams_kenlm_jni_KenLM_destroy( JNIEnv *env, jclass, jlong pointer) { delete reinterpret_cast(pointer); } JNIEXPORT jint JNICALL Java_opennlp_ccg_ngrams_kenlm_jni_KenLM_order( JNIEnv *env, jclass, jlong pointer) { return reinterpret_cast(pointer)->Order(); } JNIEXPORT jboolean JNICALL Java_opennlp_ccg_ngrams_kenlm_jni_KenLM_registerWord( JNIEnv *env, jclass, jlong pointer, jstring word, jint id) { const char *str = env->GetStringUTFChars(word, 0); if (!str) return false; jint ret; try { ret = reinterpret_cast(pointer)->RegisterWord(str, id); } catch (std::exception &e) { std::cerr << e.what() << std::endl; abort(); } env->ReleaseStringUTFChars(word, str); return ret; } JNIEXPORT jfloat JNICALL Java_opennlp_ccg_ngrams_kenlm_jni_KenLM_prob( JNIEnv *env, jclass, jlong pointer, jintArray arr) { jint length = env->GetArrayLength(arr); if (length <= 0) return 0.0; // GCC only. 
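// The stack array "jint values[length]" just below is a C99-style variable-
// length array, a GCC extension in C++ (hence the "GCC only" remark above).
// A portable sketch of the same step, using only the standard library
// (std::vector from <vector>), would heap-allocate instead:
//
//   std::vector<jint> values(length);
//   env->GetIntArrayRegion(arr, 0, length, &values[0]);
//   return reinterpret_cast<const VirtualBase*>(pointer)->Prob(&values[0], &values[0] + length);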
jint values[length]; env->GetIntArrayRegion(arr, 0, length, values); return reinterpret_cast(pointer)->Prob(values, values + length); } JNIEXPORT jfloat JNICALL Java_opennlp_ccg_ngrams_kenlm_jni_KenLM_probString( JNIEnv *env, jclass, jlong pointer, jintArray arr, jint start) { jint length = env->GetArrayLength(arr); if (length <= start) return 0.0; // GCC only. jint values[length]; env->GetIntArrayRegion(arr, 0, length, values); return reinterpret_cast(pointer)->ProbString(values, values + length, start); } } // extern ================================================ FILE: src/kenlm/lm/Jamfile ================================================ # If you need higher order, change this option # Having this limit means that State can be # (KENLM_MAX_ORDER - 1) * sizeof(float) bytes instead of # sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ; if ( $(max-order) != 6 ) { echo "Setting KenLM maximum n-gram order to $(max-order)" ; } max-order = KENLM_MAX_ORDER=$(max-order) ; path-constant ORDER-LOG : bin/order.log ; update-if-changed $(ORDER-LOG) $(max-order) ; max-order += $(ORDER-LOG) ; fakelib kenlm : [ glob *.cc : *main.cc *test.cc ] ../util//kenutil : .. $(max-order) : : .. $(max-order) ; import testing ; run left_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ; run model_test.cc kenlm /top//boost_unit_test_framework : : test.arpa test_nounk.arpa ; run partial_test.cc kenlm /top//boost_unit_test_framework : : test.arpa ; exe query : query_main.cc kenlm ../util//kenutil ; exe build_binary : build_binary_main.cc kenlm ../util//kenutil ; exe fragment : fragment_main.cc kenlm ; alias programs : query build_binary fragment filter//filter : multi:builder//lmplz ; ================================================ FILE: src/kenlm/lm/bhiksha.cc ================================================ #include "lm/bhiksha.hh" #include "lm/config.hh" #include "util/file.hh" #include "util/exception.hh" #include namespace lm { namespace ngram { namespace trie { DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) : next_(util::BitsMask::ByMax(max_next)) {} const uint8_t kArrayBhikshaVersion = 0; // TODO: put this in binary file header instead when I change the binary file format again. void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) { uint8_t version; uint8_t configured_bits; util::ReadOrThrow(fd, &version, 1); util::ReadOrThrow(fd, &configured_bits, 1); if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion); config.pointer_bhiksha_bits = configured_bits; } namespace { // Find argmin_{chopped \in [0, RequiredBits(max_next)]} ChoppedDelta(max_offset) uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) { uint8_t required = util::RequiredBits(max_next); uint8_t best_chop = 0; int64_t lowest_change = std::numeric_limits::max(); // There are probably faster ways but I don't care because this is only done once per order at construction time. 
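// A worked instance of the delta minimized by the loop below, with purely
// illustrative numbers: suppose max_next = 1,000,000 (so required = 20 bits)
// and max_offset = 10,000,000.  Chopping 8 bits costs a lookup table of
// (1,000,000 >> 12) * 64 = 244 * 64 = 15,616 bits while saving
// 10,000,000 * 8 = 80,000,000 inline bits, so the delta is strongly negative
// and chopping is a large net win here; the loop simply evaluates this delta
// for every candidate chop and keeps the smallest.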
for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) { int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */ - max_offset * static_cast(chop); /* savings in bits*/ if (change < lowest_change) { lowest_change = change; best_chop = chop; } } return best_chop; } std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &config) { uint8_t required = util::RequiredBits(max_next); uint8_t chopping = ChopBits(max_offset, max_next, config); return (max_next >> (required - chopping)) + 1 /* we store 0 too */; } } // namespace uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) { return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */; } uint8_t ArrayBhiksha::InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config) { return util::RequiredBits(max_next) - ChopBits(max_offset, max_next, config); } namespace { void *AlignTo8(void *from) { uint8_t *val = reinterpret_cast(from); std::size_t remainder = reinterpret_cast(val) & 7; if (!remainder) return val; return val + 8 - remainder; } } // namespace ArrayBhiksha::ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_next, const Config &config) : next_inline_(util::BitsMask::ByBits(InlineBits(max_offset, max_next, config))), offset_begin_(reinterpret_cast(AlignTo8(base)) + 1 /* 8-byte header */), offset_end_(offset_begin_ + ArrayCount(max_offset, max_next, config)), write_to_(reinterpret_cast(AlignTo8(base)) + 1 /* 8-byte header */ + 1 /* first entry is 0 */), original_base_(base) {} void ArrayBhiksha::FinishedLoading(const Config &config) { // *offset_begin_ = 0 but without a const_cast. *(write_to_ - (write_to_ - offset_begin_)) = 0; if (write_to_ != offset_end_) UTIL_THROW(util::Exception, "Did not get all the array entries that were expected."); uint8_t *head_write = reinterpret_cast(original_base_); *(head_write++) = kArrayBhikshaVersion; *(head_write++) = config.pointer_bhiksha_bits; } void ArrayBhiksha::LoadedBinary() { } } // namespace trie } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/bhiksha.hh ================================================ /* Simple implementation of * @inproceedings{bhikshacompression, * author={Bhiksha Raj and Ed Whittaker}, * year={2003}, * title={Lossless Compression of Language Model Structure and Word Identifiers}, * booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing}, * pages={388--391}, * } * * Currently only used for next pointers. 
*/ #ifndef LM_BHIKSHA__ #define LM_BHIKSHA__ #include #include #include "lm/model_type.hh" #include "lm/trie.hh" #include "util/bit_packing.hh" #include "util/sorted_uniform.hh" namespace lm { namespace ngram { struct Config; namespace trie { class DontBhiksha { public: static const ModelType kModelTypeAdd = static_cast(0); static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {} static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) { return util::RequiredBits(max_next); } DontBhiksha(const void *base, uint64_t max_offset, uint64_t max_next, const Config &config); void ReadNext(const void *base, uint64_t bit_offset, uint64_t /*index*/, uint8_t total_bits, NodeRange &out) const { out.begin = util::ReadInt57(base, bit_offset, next_.bits, next_.mask); out.end = util::ReadInt57(base, bit_offset + total_bits, next_.bits, next_.mask); //assert(out.end >= out.begin); } void WriteNext(void *base, uint64_t bit_offset, uint64_t /*index*/, uint64_t value) { util::WriteInt57(base, bit_offset, next_.bits, value); } void FinishedLoading(const Config &/*config*/) {} void LoadedBinary() {} uint8_t InlineBits() const { return next_.bits; } private: util::BitsMask next_; }; class ArrayBhiksha { public: static const ModelType kModelTypeAdd = kArrayAdd; static void UpdateConfigFromBinary(int fd, Config &config); static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config); ArrayBhiksha(void *base, uint64_t max_offset, uint64_t max_value, const Config &config); void ReadNext(const void *base, uint64_t bit_offset, uint64_t index, uint8_t total_bits, NodeRange &out) const { const uint64_t *begin_it = util::BinaryBelow(util::IdentityAccessor(), offset_begin_, offset_end_, index); const uint64_t *end_it; for (end_it = begin_it; (end_it < offset_end_) && (*end_it <= index + 1); ++end_it) {} --end_it; out.begin = ((begin_it - offset_begin_) << next_inline_.bits) | util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask); out.end = ((end_it - offset_begin_) << next_inline_.bits) | util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask); //assert(out.end >= out.begin); } void WriteNext(void *base, uint64_t bit_offset, uint64_t index, uint64_t value) { uint64_t encode = value >> next_inline_.bits; for (; write_to_ <= offset_begin_ + encode; ++write_to_) *write_to_ = index; util::WriteInt57(base, bit_offset, next_inline_.bits, value & next_inline_.mask); } void FinishedLoading(const Config &config); void LoadedBinary(); uint8_t InlineBits() const { return next_inline_.bits; } private: const util::BitsMask next_inline_; const uint64_t *const offset_begin_; const uint64_t *const offset_end_; uint64_t *write_to_; void *original_base_; }; } // namespace trie } // namespace ngram } // namespace lm #endif // LM_BHIKSHA__ ================================================ FILE: src/kenlm/lm/binary_format.cc ================================================ #include "lm/binary_format.hh" #include "lm/lm_exception.hh" #include "util/file.hh" #include "util/file_piece.hh" #include #include #include #include #include namespace lm { namespace ngram { namespace { const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version"; const char kMagicBytes[] = "mmap lm http://kheafield.com/code format 
version 5\n\0"; // This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed). const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n"; const long int kMagicVersion = 5; // Old binary files built on 32-bit machines have this header. // TODO: eliminate with next binary release. struct OldSanity { char magic[sizeof(kMagicBytes)]; float zero_f, one_f, minus_half_f; WordIndex one_word_index, max_word_index; uint64_t one_uint64; void SetToReference() { std::memset(this, 0, sizeof(OldSanity)); std::memcpy(magic, kMagicBytes, sizeof(magic)); zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; one_word_index = 1; max_word_index = std::numeric_limits::max(); one_uint64 = 1; } }; // Test values aligned to 8 bytes. struct Sanity { char magic[ALIGN8(sizeof(kMagicBytes))]; float zero_f, one_f, minus_half_f; WordIndex one_word_index, max_word_index, padding_to_8; uint64_t one_uint64; void SetToReference() { std::memset(this, 0, sizeof(Sanity)); std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes)); zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; one_word_index = 1; max_word_index = std::numeric_limits::max(); padding_to_8 = 0; one_uint64 = 1; } }; const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; std::size_t TotalHeaderSize(unsigned char order) { return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); } void WriteHeader(void *to, const Parameters ¶ms) { Sanity header = Sanity(); header.SetToReference(); std::memcpy(to, &header, sizeof(Sanity)); char *out = reinterpret_cast(to) + sizeof(Sanity); *reinterpret_cast(out) = params.fixed; out += sizeof(FixedWidthParameters); uint64_t *counts = reinterpret_cast(out); for (std::size_t i = 0; i < params.counts.size(); ++i) { counts[i] = params.counts[i]; } } } // namespace uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) { if (config.write_mmap) { std::size_t total = TotalHeaderSize(order) + memory_size; backing.file.reset(util::CreateOrThrow(config.write_mmap)); if (config.write_method == Config::WRITE_MMAP) { backing.vocab.reset(util::MapZeroedWrite(backing.file.get(), total), total, util::scoped_memory::MMAP_ALLOCATED); } else { util::ResizeOrThrow(backing.file.get(), 0); util::MapAnonymous(total, backing.vocab); } strncpy(reinterpret_cast(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order)); return reinterpret_cast(backing.vocab.get()) + TotalHeaderSize(order); } else { util::MapAnonymous(memory_size, backing.vocab); return reinterpret_cast(backing.vocab.get()); } } uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing) { std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad; if (config.write_mmap) { // Grow the file to accomodate the search, using zeros. try { util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size); } catch (util::ErrnoException &e) { e << " for file " << config.write_mmap; throw e; } if (config.write_method == Config::WRITE_AFTER) { util::MapAnonymous(memory_size, backing.search); return reinterpret_cast(backing.search.get()); } // mmap it now. // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down. 
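// Illustration of the rounding below, with made-up numbers: given a 4096-byte
// page size and adjusted_vocab = 10,000, the cruft is 10,000 % 4096 = 1,808,
// so the mapping starts at file offset 8,192 with length 1,808 + memory_size,
// and the returned pointer is 1,808 bytes into that mapping -- i.e. exactly at
// file offset 10,000, as intended.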
std::size_t page_size = util::SizePage(); std::size_t alignment_cruft = adjusted_vocab % page_size; backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED); return reinterpret_cast(backing.search.get()) + alignment_cruft; } else { util::MapAnonymous(memory_size, backing.search); return reinterpret_cast(backing.search.get()); } } void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, std::size_t vocab_pad, Backing &backing) { if (!config.write_mmap) return; switch (config.write_method) { case Config::WRITE_MMAP: util::SyncOrThrow(backing.vocab.get(), backing.vocab.size()); util::SyncOrThrow(backing.search.get(), backing.search.size()); break; case Config::WRITE_AFTER: util::SeekOrThrow(backing.file.get(), 0); util::WriteOrThrow(backing.file.get(), backing.vocab.get(), backing.vocab.size()); util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad); util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size()); util::FSyncOrThrow(backing.file.get()); break; } // header and vocab share the same mmap. The header is written here because we know the counts. Parameters params = Parameters(); params.counts = counts; params.fixed.order = counts.size(); params.fixed.probing_multiplier = config.probing_multiplier; params.fixed.model_type = model_type; params.fixed.has_vocabulary = config.include_vocab; params.fixed.search_version = search_version; WriteHeader(backing.vocab.get(), params); if (config.write_method == Config::WRITE_AFTER) { util::SeekOrThrow(backing.file.get(), 0); util::WriteOrThrow(backing.file.get(), backing.vocab.get(), TotalHeaderSize(counts.size())); } } namespace detail { bool IsBinaryFormat(int fd) { const uint64_t size = util::SizeFile(fd); if (size == util::kBadSize || (size <= static_cast(sizeof(Sanity)))) return false; // Try reading the header. util::scoped_memory memory; try { util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory); } catch (const util::Exception &e) { return false; } Sanity reference_header = Sanity(); reference_header.SetToReference(); if (!memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true; if (!memcmp(memory.get(), kMagicIncomplete, strlen(kMagicIncomplete))) { UTIL_THROW(FormatLoadException, "This binary file did not finish building"); } if (!memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) { char *end_ptr; const char *begin_version = static_cast(memory.get()) + strlen(kMagicBeforeVersion); long int version = strtol(begin_version, &end_ptr, 10); if ((end_ptr != begin_version) && version != kMagicVersion) { UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary"); } OldSanity old_sanity = OldSanity(); old_sanity.SetToReference(); UTIL_THROW_IF(!memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable."); UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. 
Try rebuilding the binary format LM using the same code revision, compiler, and architecture"); } return false; } void ReadHeader(int fd, Parameters &out) { util::SeekOrThrow(fd, sizeof(Sanity)); util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed)); if (out.fixed.probing_multiplier < 1.0) UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0."); out.counts.resize(static_cast(out.fixed.order)); if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); } void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters ¶ms) { if (params.fixed.model_type != model_type) { if (static_cast(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *))) UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast(params.fixed.model_type) << " but this is not implemented for in this inference code."); UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]); } UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version); } void SeekPastHeader(int fd, const Parameters ¶ms) { util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); } uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) { const uint64_t file_size = util::SizeFile(backing.file.get()); // The header is smaller than a page, so we have to map the whole header as well. std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size); if (file_size != util::kBadSize && static_cast(file_size) < total_map) UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); util::MapRead(config.load_method, backing.file.get(), 0, total_map, backing.search); if (config.enumerate_vocab && !params.fixed.has_vocabulary) UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary."); // Seek to vocabulary words util::SeekOrThrow(backing.file.get(), total_map); return reinterpret_cast(backing.search.get()) + TotalHeaderSize(params.counts.size()); } void ComplainAboutARPA(const Config &config, ModelType model_type) { if (config.write_mmap || !config.messages) return; if (config.arpa_complain == Config::ALL) { *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl; } else if (config.arpa_complain == Config::EXPENSIVE && (model_type == TRIE || model_type == QUANT_TRIE || model_type == ARRAY_TRIE || model_type == QUANT_ARRAY_TRIE)) { *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive. Save time by building a binary format." 
<< std::endl; } } } // namespace detail bool RecognizeBinary(const char *file, ModelType &recognized) { util::scoped_fd fd(util::OpenReadOrThrow(file)); if (!detail::IsBinaryFormat(fd.get())) return false; Parameters params; detail::ReadHeader(fd.get(), params); recognized = params.fixed.model_type; return true; } } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/binary_format.hh ================================================ #ifndef LM_BINARY_FORMAT__ #define LM_BINARY_FORMAT__ #include "lm/config.hh" #include "lm/model_type.hh" #include "lm/read_arpa.hh" #include "util/file_piece.hh" #include "util/mmap.hh" #include "util/scoped.hh" #include #include #include namespace lm { namespace ngram { /*Inspect a file to determine if it is a binary lm. If not, return false. * If so, return true and set recognized to the type. This is the only API in * this header designed for use by decoder authors. */ bool RecognizeBinary(const char *file, ModelType &recognized); struct FixedWidthParameters { unsigned char order; float probing_multiplier; // What type of model is this? ModelType model_type; // Does the end of the file have the actual strings in the vocabulary? bool has_vocabulary; unsigned int search_version; }; // This is a macro instead of an inline function so constants can be assigned using it. #define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) // Parameters stored in the header of a binary file. struct Parameters { FixedWidthParameters fixed; std::vector counts; }; struct Backing { // File behind memory, if any. util::scoped_fd file; // Vocabulary lookup table. Not to be confused with the vocab words themselves. util::scoped_memory vocab; // Raw block of memory backing the language model data structures util::scoped_memory search; }; // Create just enough of a binary file to write vocabulary to it. uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing); // Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin. uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t memory_size, Backing &backing); // Write header to binary file. This is done last to prevent incomplete files // from loading. void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, std::size_t vocab_pad, Backing &backing); namespace detail { bool IsBinaryFormat(int fd); void ReadHeader(int fd, Parameters ¶ms); void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters ¶ms); void SeekPastHeader(int fd, const Parameters ¶ms); uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing); void ComplainAboutARPA(const Config &config, ModelType model_type); } // namespace detail template void LoadLM(const char *file, const Config &config, To &to) { Backing &backing = to.MutableBacking(); backing.file.reset(util::OpenReadOrThrow(file)); try { if (detail::IsBinaryFormat(backing.file.get())) { Parameters params; detail::ReadHeader(backing.file.get(), params); detail::MatchCheck(To::kModelType, To::kVersion, params); // Replace the run-time configured probing_multiplier with the one in the file. 
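// How a decoder typically reaches this code path (a sketch only; the file
// name is hypothetical, TrieModel and the single-argument constructor come
// from lm/model.hh):
//
//   lm::ngram::ModelType type;
//   bool is_binary = lm::ngram::RecognizeBinary("wsj.lm", type);
//   lm::ngram::TrieModel model("wsj.lm"); // the constructor calls LoadLM(),
//                                         // which takes either the binary or
//                                         // the ARPA branch seen here.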
Config new_config(config); new_config.probing_multiplier = params.fixed.probing_multiplier; detail::SeekPastHeader(backing.file.get(), params); To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config); uint64_t memory_size = To::Size(params.counts, new_config); uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing); to.InitializeFromBinary(start, params, new_config, backing.file.get()); } else { detail::ComplainAboutARPA(config, To::kModelType); to.InitializeFromARPA(file, config); } } catch (util::Exception &e) { e << " File: " << file; throw; } } } // namespace ngram } // namespace lm #endif // LM_BINARY_FORMAT__ ================================================ FILE: src/kenlm/lm/blank.hh ================================================ #ifndef LM_BLANK__ #define LM_BLANK__ #include #include #include namespace lm { namespace ngram { /* Suppose "foo bar" appears with zero backoff but there is no trigram * beginning with these words. Then, when scoring "foo bar", the model could * return out_state containing "bar" or even null context if "bar" also has no * backoff and is never followed by another word. Then the backoff is set to * kNoExtensionBackoff. If the n-gram might be extended, then out_state must * contain the full n-gram, in which case kExtensionBackoff is set. In any * case, if an n-gram has non-zero backoff, the full state is returned so * backoff can be properly charged. * These differ only in sign bit because the backoff is in fact zero in either * case. */ const float kNoExtensionBackoff = -0.0; const float kExtensionBackoff = 0.0; const uint64_t kNoExtensionQuant = 0; const uint64_t kExtensionQuant = 1; inline void SetExtension(float &backoff) { if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff; } // This compiles down nicely. inline bool HasExtension(const float &backoff) { typedef union { float f; uint32_t i; } UnionValue; UnionValue compare, interpret; compare.f = kNoExtensionBackoff; interpret.f = backoff; return compare.i != interpret.i; } } // namespace ngram } // namespace lm #endif // LM_BLANK__ ================================================ FILE: src/kenlm/lm/build_binary_main.cc ================================================ #include "lm/model.hh" #include "lm/sizes.hh" #include "util/file_piece.hh" #include "util/usage.hh" #include #include #include #include #include #include #include #include #ifdef WIN32 #include "util/getopt.hh" #else #include #endif namespace lm { namespace ngram { namespace { void Usage(const char *name, const char *default_mem) { std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" "-u sets the log10 probability for if the ARPA file does not have one.\n" " Default is -100. The ARPA file will always take precedence.\n" "-s allows models to be built even if they do not have and .\n" "-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" "-w mmap|after determines how writing is done.\n" " mmap maps the binary file and writes to it. Default for trie.\n" " after allocates anonymous memory, builds, and writes. Default for probing.\n" "-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n" " model files. order1.arpa must be an ARPA file. All others may be ARPA or\n" " the same data structure as being built. All files must have the same\n" " vocabulary. 
For probing, the unigrams must be in the same order.\n\n" "type is either probing or trie. Default is probing.\n\n" "probing uses a probing hash table. It is the fastest but uses the most memory.\n" "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" "trie is a straightforward trie with bit-level packing. It uses the least\n" "memory and is still faster than SRI or IRST. Building the trie format uses an\n" "on-disk sort to save memory.\n" "-T is the temporary directory prefix. Default is the output file name.\n" "-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n" " with GNU sort. The number is followed by a unit: \% for percent of physical\n" " memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n" " Default unit is K for Kilobytes.\n" "-q turns quantization on and sets the number of bits (e.g. -q 8).\n" "-b sets backoff quantization bits. Requires -q and defaults to that value.\n" "-a compresses pointers using an array of offsets. The parameter is the\n" " maximum number of bits encoded by the array. Memory is minimized subject\n" " to the maximum, so pick 255 to minimize memory.\n\n" "Get a memory estimate by passing an ARPA file without an output file name.\n"; exit(1); } // I could really use boost::lexical_cast right about now. float ParseFloat(const char *from) { char *end; float ret = strtod(from, &end); if (*end) throw util::ParseNumberException(from); return ret; } unsigned long int ParseUInt(const char *from) { char *end; unsigned long int ret = strtoul(from, &end, 10); if (*end) throw util::ParseNumberException(from); return ret; } uint8_t ParseBitCount(const char *from) { unsigned long val = ParseUInt(from); if (val > 25) { util::ParseNumberException e(from); e << " bit counts are limited to 25."; } return val; } void ParseFileList(const char *from, std::vector &to) { to.clear(); while (true) { const char *i; for (i = from; *i && *i != ' '; ++i) {} to.push_back(std::string(from, i - from)); if (!*i) break; from = i + 1; } } void ProbingQuantizationUnsupported() { std::cerr << "Quantization is only implemented in the trie data structure." << std::endl; exit(1); } } // namespace ngram } // namespace lm } // namespace int main(int argc, char *argv[]) { using namespace lm::ngram; const char *default_mem = util::GuessPhysicalMemory() ? 
"80%" : "1G"; try { bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false; lm::ngram::Config config; config.building_memory = util::ParseSize(default_mem); int opt; while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:")) != -1) { switch(opt) { case 'q': config.prob_bits = ParseBitCount(optarg); if (!set_backoff_bits) config.backoff_bits = config.prob_bits; quantize = true; break; case 'b': config.backoff_bits = ParseBitCount(optarg); set_backoff_bits = true; break; case 'a': config.pointer_bhiksha_bits = ParseBitCount(optarg); bhiksha = true; break; case 'u': config.unknown_missing_logprob = ParseFloat(optarg); break; case 'p': config.probing_multiplier = ParseFloat(optarg); break; case 't': // legacy case 'T': config.temporary_directory_prefix = optarg; break; case 'm': // legacy config.building_memory = ParseUInt(optarg) * 1048576; break; case 'S': config.building_memory = std::min(static_cast(std::numeric_limits::max()), util::ParseSize(optarg)); break; case 'w': set_write_method = true; if (!strcmp(optarg, "mmap")) { config.write_method = Config::WRITE_MMAP; } else if (!strcmp(optarg, "after")) { config.write_method = Config::WRITE_AFTER; } else { Usage(argv[0], default_mem); } break; case 's': config.sentence_marker_missing = lm::SILENT; break; case 'i': config.positive_log_probability = lm::SILENT; break; case 'r': rest = true; ParseFileList(optarg, config.rest_lower_files); config.rest_function = Config::REST_LOWER; break; default: Usage(argv[0], default_mem); } } if (!quantize && set_backoff_bits) { std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl; abort(); } if (optind + 1 == argc) { ShowSizes(argv[optind], config); return 0; } const char *model_type; const char *from_file; if (optind + 2 == argc) { model_type = "probing"; from_file = argv[optind]; config.write_mmap = argv[optind + 1]; } else if (optind + 3 == argc) { model_type = argv[optind]; from_file = argv[optind + 1]; config.write_mmap = argv[optind + 2]; } else { Usage(argv[0], default_mem); } if (!strcmp(model_type, "probing")) { if (!set_write_method) config.write_method = Config::WRITE_AFTER; if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); if (rest) { RestProbingModel(from_file, config); } else { ProbingModel(from_file, config); } } else if (!strcmp(model_type, "trie")) { if (rest) { std::cerr << "Rest + trie is not supported yet." 
<< std::endl; return 1; } if (!set_write_method) config.write_method = Config::WRITE_MMAP; if (quantize) { if (bhiksha) { QuantArrayTrieModel(from_file, config); } else { QuantTrieModel(from_file, config); } } else { if (bhiksha) { ArrayTrieModel(from_file, config); } else { TrieModel(from_file, config); } } } else { Usage(argv[0], default_mem); } } catch (const std::exception &e) { std::cerr << e.what() << std::endl; std::cerr << "ERROR" << std::endl; return 1; } std::cerr << "SUCCESS" << std::endl; return 0; } ================================================ FILE: src/kenlm/lm/config.cc ================================================ #include "lm/config.hh" #include namespace lm { namespace ngram { Config::Config() : show_progress(true), messages(&std::cerr), enumerate_vocab(NULL), unknown_missing(COMPLAIN), sentence_marker_missing(THROW_UP), positive_log_probability(THROW_UP), unknown_missing_logprob(-100.0), probing_multiplier(1.5), building_memory(1073741824ULL), // 1 GB temporary_directory_prefix(NULL), arpa_complain(ALL), write_mmap(NULL), write_method(WRITE_AFTER), include_vocab(true), rest_function(REST_MAX), prob_bits(8), backoff_bits(8), pointer_bhiksha_bits(22), load_method(util::POPULATE_OR_READ) {} } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/config.hh ================================================ #ifndef LM_CONFIG__ #define LM_CONFIG__ #include "lm/lm_exception.hh" #include "util/mmap.hh" #include #include #include /* Configuration for ngram model. Separate header to reduce pollution. */ namespace lm { class EnumerateVocab; namespace ngram { struct Config { // EFFECTIVE FOR BOTH ARPA AND BINARY READS // (default true) print progress bar to messages bool show_progress; // Where to log messages including the progress bar. Set to NULL for // silence. std::ostream *messages; std::ostream *ProgressMessages() const { return show_progress ? messages : 0; } // This will be called with every string in the vocabulary. See // enumerate_vocab.hh for more detail. Config does not take ownership; you // are still responsible for deleting it (or stack allocating). EnumerateVocab *enumerate_vocab; // ONLY EFFECTIVE WHEN READING ARPA // What to do when isn't in the provided model. WarningAction unknown_missing; // What to do when or is missing from the model. // If THROW_UP, the exception will be of type util::SpecialWordMissingException. WarningAction sentence_marker_missing; // What to do with a positive log probability. For COMPLAIN and SILENT, map // to 0. WarningAction positive_log_probability; // The probability to substitute for if it's missing from the model. // No effect if the model has or unknown_missing == THROW_UP. float unknown_missing_logprob; // Size multiplier for probing hash table. Must be > 1. Space is linear in // this. Time is probing_multiplier / (probing_multiplier - 1). No effect // for sorted variant. // If you find yourself setting this to a low number, consider using the // TrieModel which has lower memory consumption. float probing_multiplier; // Amount of memory to use for building. The actual memory usage will be // higher since this just sets sort buffer size. Only applies to trie // models. std::size_t building_memory; // Template for temporary directory appropriate for passing to mkdtemp. // The characters XXXXXX are appended before passing to mkdtemp. Only // applies to trie. If NULL, defaults to write_mmap. If that's NULL, // defaults to input file name. 
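// Illustrative use of the options documented in this struct (file names are
// hypothetical; QuantTrieModel is the typedef from lm/model.hh):
//
//   lm::ngram::Config cfg;
//   cfg.write_mmap = "model.binary";               // also emit a binary file
//   cfg.write_method = lm::ngram::Config::WRITE_MMAP;
//   cfg.prob_bits = cfg.backoff_bits = 8;          // quantization (trie only)
//   lm::ngram::QuantTrieModel model("model.arpa", cfg);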
const char *temporary_directory_prefix; // Level of complaining to do when loading from ARPA instead of binary format. enum ARPALoadComplain {ALL, EXPENSIVE, NONE}; ARPALoadComplain arpa_complain; // While loading an ARPA file, also write out this binary format file. Set // to NULL to disable. const char *write_mmap; enum WriteMethod { WRITE_MMAP, // Map the file directly. WRITE_AFTER // Write after we're done. }; WriteMethod write_method; // Include the vocab in the binary file? Only effective if write_mmap != NULL. bool include_vocab; // Left rest options. Only used when the model includes rest costs. enum RestFunction { REST_MAX, // Maximum of any score to the left REST_LOWER, // Use lower-order files given below. }; RestFunction rest_function; // Only used for REST_LOWER. std::vector rest_lower_files; // Quantization options. Only effective for QuantTrieModel. One value is // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used // to quantize (and one of the remaining backoffs will be 0). uint8_t prob_bits, backoff_bits; // Bhiksha compression (simple form). Only works with trie. uint8_t pointer_bhiksha_bits; // ONLY EFFECTIVE WHEN READING BINARY // How to get the giant array into memory: lazy mmap, populate, read etc. // See util/mmap.hh for details of MapMethod. util::LoadMethod load_method; // Set defaults. Config(); }; } /* namespace ngram */ } /* namespace lm */ #endif // LM_CONFIG__ ================================================ FILE: src/kenlm/lm/enumerate_vocab.hh ================================================ #ifndef LM_ENUMERATE_VOCAB__ #define LM_ENUMERATE_VOCAB__ #include "lm/word_index.hh" #include "util/string_piece.hh" namespace lm { /* If you need the actual strings in the vocabulary, inherit from this class * and implement Add. Then put a pointer in Config.enumerate_vocab; it does * not take ownership. Add is called once per vocab word. index starts at 0 * and increases by 1 each time. This is only used by the Model constructor; * the pointer is not retained by the class. */ class EnumerateVocab { public: virtual ~EnumerateVocab() {} virtual void Add(WordIndex index, const StringPiece &str) = 0; protected: EnumerateVocab() {} }; } // namespace lm #endif // LM_ENUMERATE_VOCAB__ ================================================ FILE: src/kenlm/lm/facade.hh ================================================ #ifndef LM_FACADE__ #define LM_FACADE__ #include "lm/virtual_interface.hh" #include "util/string_piece.hh" #include namespace lm { namespace base { // Common model interface that depends on knowing the specific classes. // Curiously recurring template pattern. template class ModelFacade : public Model { public: typedef StateT State; typedef VocabularyT Vocabulary; // Default Score function calls FullScore. Model can override this. 
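// A minimal query sketch against this facade (the file name and words are
// hypothetical; ProbingModel is a concrete model type from lm/model.hh):
//
//   lm::ngram::ProbingModel model("lm.binary");
//   lm::ngram::State state(model.BeginSentenceState()), out;
//   float total = 0.0;                      // summed log10 probability
//   const char *words[] = {"the", "dog", "barked"};
//   for (int i = 0; i < 3; ++i) {
//     total += model.Score(state, model.GetVocabulary().Index(words[i]), out);
//     state = out;
//   }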
float Score(const State &in_state, const WordIndex new_word, State &out_state) const { return static_cast(this)->FullScore(in_state, new_word, out_state).prob; } /* Translate from void* to State */ FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const { return static_cast(this)->FullScore( *reinterpret_cast(in_state), new_word, *reinterpret_cast(out_state)); } float Score(const void *in_state, const WordIndex new_word, void *out_state) const { return static_cast(this)->Score( *reinterpret_cast(in_state), new_word, *reinterpret_cast(out_state)); } const State &BeginSentenceState() const { return begin_sentence_; } const State &NullContextState() const { return null_context_; } const Vocabulary &GetVocabulary() const { return *static_cast(&BaseVocabulary()); } protected: ModelFacade() : Model(sizeof(State)) {} virtual ~ModelFacade() {} // begin_sentence and null_context can disappear after. vocab should stay. void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) { begin_sentence_ = begin_sentence; null_context_ = null_context; begin_sentence_memory_ = &begin_sentence_; null_context_memory_ = &null_context_; base_vocab_ = &vocab; order_ = order; } private: State begin_sentence_, null_context_; }; } // mamespace base } // namespace lm #endif // LM_FACADE__ ================================================ FILE: src/kenlm/lm/fragment_main.cc ================================================ #include "lm/binary_format.hh" #include "lm/model.hh" #include "lm/left.hh" #include "util/tokenize_piece.hh" template void Query(const char *name) { Model model(name); std::string line; lm::ngram::ChartState ignored; while (getline(std::cin, line)) { lm::ngram::RuleScore scorer(model, ignored); for (util::TokenIter i(line, ' '); i; ++i) { scorer.Terminal(model.GetVocabulary().Index(*i)); } std::cout << scorer.Finish() << '\n'; } } int main(int argc, char *argv[]) { if (argc != 2) { std::cerr << "Expected model file name." << std::endl; return 1; } const char *name = argv[1]; lm::ngram::ModelType model_type = lm::ngram::PROBING; lm::ngram::RecognizeBinary(name, model_type); switch (model_type) { case lm::ngram::PROBING: Query(name); break; case lm::ngram::REST_PROBING: Query(name); break; default: std::cerr << "Model type not supported yet." << std::endl; } } ================================================ FILE: src/kenlm/lm/kenlm_max_order_main.cc ================================================ #include "lm/max_order.hh" #include int main(int argc, char *argv[]) { std::cerr << "KenLM was compiled with a maximum supported n-gram order set to " << KENLM_MAX_ORDER << "." << std::endl; } ================================================ FILE: src/kenlm/lm/left.hh ================================================ /* Efficient left and right language model state for sentence fragments. * Intended usage: * Store ChartState with every chart entry. * To do a rule application: * 1. Make a ChartState object for your new entry. * 2. Construct RuleScore. * 3. Going from left to right, call Terminal or NonTerminal. * For terminals, just pass the vocab id. * For non-terminals, pass that non-terminal's ChartState. * If your decoder expects scores inclusive of subtree scores (i.e. you * label entries with the highest-scoring path), pass the non-terminal's * score as prob. * If your decoder expects relative scores and will walk the chart later, * pass prob = 0.0. 
* In other words, the only effect of prob is that it gets added to the * returned log probability. * 4. Call Finish. It returns the log probability. * * There's a couple more details: * Do not pass to Terminal as it is formally not a word in the sentence, * only context. Instead, call BeginSentence. If called, it should be the * first call after RuleScore is constructed (since is always the * leftmost). * * If the leftmost RHS is a non-terminal, it's faster to call BeginNonTerminal. * * Hashing and sorting comparison operators are provided. All state objects * are POD. If you intend to use memcmp on raw state objects, you must call * ZeroRemaining first, as the value of array entries beyond length is * otherwise undefined. * * Usage is of course not limited to chart decoding. Anything that generates * sentence fragments missing left context could benefit. For example, a * phrase-based decoder could pre-score phrases, storing ChartState with each * phrase, even if hypotheses are generated left-to-right. */ #ifndef LM_LEFT__ #define LM_LEFT__ #include "lm/max_order.hh" #include "lm/state.hh" #include "lm/return.hh" #include "util/murmur_hash.hh" #include namespace lm { namespace ngram { template class RuleScore { public: explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(&out), left_done_(false), prob_(0.0) { out.left.length = 0; out.right.length = 0; } void BeginSentence() { out_->right = model_.BeginSentenceState(); // out_->left is empty. left_done_ = true; } void Terminal(WordIndex word) { State copy(out_->right); FullScoreReturn ret(model_.FullScore(copy, word, out_->right)); if (left_done_) { prob_ += ret.prob; return; } if (ret.independent_left) { prob_ += ret.prob; left_done_ = true; return; } out_->left.pointers[out_->left.length++] = ret.extend_left; prob_ += ret.rest; if (out_->right.length != copy.length + 1) left_done_ = true; } // Faster version of NonTerminal for the case where the rule begins with a non-terminal. void BeginNonTerminal(const ChartState &in, float prob = 0.0) { prob_ = prob; *out_ = in; left_done_ = in.left.full; } void NonTerminal(const ChartState &in, float prob = 0.0) { prob_ += prob; if (!in.left.length) { if (in.left.full) { for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i; left_done_ = true; out_->right = in.right; } return; } if (!out_->right.length) { out_->right = in.right; if (left_done_) { prob_ += model_.UnRest(in.left.pointers, in.left.pointers + in.left.length, 1); return; } if (out_->left.length) { left_done_ = true; } else { out_->left = in.left; left_done_ = in.left.full; } return; } float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1]; float *back = backoffs, *back2 = backoffs2; unsigned char next_use = out_->right.length; // First word if (ExtendLeft(in, next_use, 1, out_->right.backoff, back)) return; // Words after the first, so extending a bigram to begin with for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) { if (ExtendLeft(in, next_use, extend_length, back, back2)) return; std::swap(back, back2); } if (in.left.full) { for (const float *i = back; i != back + next_use; ++i) prob_ += *i; left_done_ = true; out_->right = in.right; return; } // Right state was minimized, so it's already independent of the new words to the left. if (in.right.length < in.left.length) { out_->right = in.right; return; } // Shift exisiting words down. 
for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) { *(i + in.right.length) = *i; } // Add words from in.right. std::copy(in.right.words, in.right.words + in.right.length, out_->right.words); // Assemble backoff composed on the existing state's backoff followed by the new state's backoff. std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff); std::copy(back, back + next_use, out_->right.backoff + in.right.length); out_->right.length = in.right.length + next_use; } float Finish() { // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram. out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1); return prob_; } void Reset() { prob_ = 0.0; left_done_ = false; out_->left.length = 0; out_->right.length = 0; } void Reset(ChartState &replacement) { out_ = &replacement; Reset(); } private: bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) { ProcessRet(model_.ExtendLeft( out_->right.words, out_->right.words + next_use, // Words to extend into back_in, // Backoffs to use in.left.pointers[extend_length - 1], extend_length, // Words to be extended back_out, // Backoffs for the next score next_use)); // Length of n-gram to use in next scoring. if (next_use != out_->right.length) { left_done_ = true; if (!next_use) { // Early exit. out_->right = in.right; prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1); return true; } } // Continue scoring. return false; } void ProcessRet(const FullScoreReturn &ret) { if (left_done_) { prob_ += ret.prob; return; } if (ret.independent_left) { prob_ += ret.prob; left_done_ = true; return; } out_->left.pointers[out_->left.length++] = ret.extend_left; prob_ += ret.rest; } const M &model_; ChartState *out_; bool left_done_; float prob_; }; } // namespace ngram } // namespace lm #endif // LM_LEFT__ ================================================ FILE: src/kenlm/lm/left_test.cc ================================================ #include "lm/left.hh" #include "lm/model.hh" #include "util/tokenize_piece.hh" #include #define BOOST_TEST_MODULE LeftTest #include #include namespace lm { namespace ngram { namespace { #define Term(word) score.Terminal(m.GetVocabulary().Index(word)); #define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value); // Apparently some Boost versions use templates and are pretty strict about types matching. 
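// Illustrative sketch (not part of the KenLM sources): the usage protocol
// described at the top of left.hh, applied to two small fragments.  The
// function name and word choices are hypothetical; the words happen to occur
// in the test data exercised by left_test.cc below.

#include "lm/left.hh"
#include "lm/model.hh"

inline float ScoreLittleMoreLoin(const lm::ngram::ProbingModel &model) {
  // 1. Score the fragment "more loin" with no left context yet.
  lm::ngram::ChartState more_loin;
  lm::ngram::RuleScore<lm::ngram::ProbingModel> inner(model, more_loin);
  inner.Terminal(model.GetVocabulary().Index("more"));
  inner.Terminal(model.GetVocabulary().Index("loin"));
  float inner_score = inner.Finish();

  // 2. Build "little" + [more loin]; passing the subtree score as prob makes
  //    the value returned by Finish inclusive of the inner fragment.
  lm::ngram::ChartState outer;
  lm::ngram::RuleScore<lm::ngram::ProbingModel> rule(model, outer);
  rule.Terminal(model.GetVocabulary().Index("little"));
  rule.NonTerminal(more_loin, inner_score);
  return rule.Finish();
}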
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast(ref), static_cast(value), static_cast(tol)); template void Short(const M &m) { ChartState base; { RuleScore score(m, base); Term("more"); Term("loin"); SLOPPY_CHECK_CLOSE(-1.206319 - 0.3561665, score.Finish(), 0.001); } BOOST_CHECK(base.left.full); BOOST_CHECK_EQUAL(2, base.left.length); BOOST_CHECK_EQUAL(1, base.right.length); VCheck("loin", base.right.words[0]); ChartState more_left; { RuleScore score(m, more_left); Term("little"); score.NonTerminal(base, -1.206319 - 0.3561665); // p(little more loin | null context) SLOPPY_CHECK_CLOSE(-1.56538, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(3, more_left.left.length); BOOST_CHECK_EQUAL(1, more_left.right.length); VCheck("loin", more_left.right.words[0]); BOOST_CHECK(more_left.left.full); ChartState shorter; { RuleScore score(m, shorter); Term("to"); score.NonTerminal(base, -1.206319 - 0.3561665); SLOPPY_CHECK_CLOSE(-0.30103 - 1.687872 - 1.206319 - 0.3561665, score.Finish(), 0.01); } BOOST_CHECK_EQUAL(1, shorter.left.length); BOOST_CHECK_EQUAL(1, shorter.right.length); VCheck("loin", shorter.right.words[0]); BOOST_CHECK(shorter.left.full); } template void Charge(const M &m) { ChartState base; { RuleScore score(m, base); Term("on"); Term("more"); SLOPPY_CHECK_CLOSE(-1.509559 -0.4771212 -1.206319, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(1, base.left.length); BOOST_CHECK_EQUAL(1, base.right.length); VCheck("more", base.right.words[0]); BOOST_CHECK(base.left.full); ChartState extend; { RuleScore score(m, extend); Term("looking"); score.NonTerminal(base, -1.509559 -0.4771212 -1.206319); SLOPPY_CHECK_CLOSE(-3.91039, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(2, extend.left.length); BOOST_CHECK_EQUAL(1, extend.right.length); VCheck("more", extend.right.words[0]); BOOST_CHECK(extend.left.full); ChartState tobos; { RuleScore score(m, tobos); score.BeginSentence(); score.NonTerminal(extend, -3.91039); SLOPPY_CHECK_CLOSE(-3.471169, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(0, tobos.left.length); BOOST_CHECK_EQUAL(1, tobos.right.length); } template float LeftToRight(const M &m, const std::vector &words, bool begin_sentence = false) { float ret = 0.0; State right = begin_sentence ? 
m.BeginSentenceState() : m.NullContextState(); for (std::vector::const_iterator i = words.begin(); i != words.end(); ++i) { State copy(right); ret += m.Score(copy, *i, right); } return ret; } template float RightToLeft(const M &m, const std::vector &words, bool begin_sentence = false) { float ret = 0.0; ChartState state; state.left.length = 0; state.right.length = 0; state.left.full = false; for (std::vector::const_reverse_iterator i = words.rbegin(); i != words.rend(); ++i) { ChartState copy(state); RuleScore score(m, state); score.Terminal(*i); score.NonTerminal(copy, ret); ret = score.Finish(); } if (begin_sentence) { ChartState copy(state); RuleScore score(m, state); score.BeginSentence(); score.NonTerminal(copy, ret); ret = score.Finish(); } return ret; } template float TreeMiddle(const M &m, const std::vector &words, bool begin_sentence = false) { std::vector > states(words.size()); for (unsigned int i = 0; i < words.size(); ++i) { RuleScore score(m, states[i].first); score.Terminal(words[i]); states[i].second = score.Finish(); } while (states.size() > 1) { std::vector > upper((states.size() + 1) / 2); for (unsigned int i = 0; i < states.size() / 2; ++i) { RuleScore score(m, upper[i].first); score.NonTerminal(states[i*2].first, states[i*2].second); score.NonTerminal(states[i*2+1].first, states[i*2+1].second); upper[i].second = score.Finish(); } if (states.size() % 2) { upper.back() = states.back(); } std::swap(states, upper); } if (states.empty()) return 0.0; if (begin_sentence) { ChartState ignored; RuleScore score(m, ignored); score.BeginSentence(); score.NonTerminal(states.front().first, states.front().second); return score.Finish(); } else { return states.front().second; } } template void LookupVocab(const M &m, const StringPiece &str, std::vector &out) { out.clear(); for (util::TokenIter i(str, ' '); i; ++i) { out.push_back(m.GetVocabulary().Index(*i)); } } #define TEXT_TEST(str) \ LookupVocab(m, str, words); \ expect = LeftToRight(m, words, rest); \ SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \ SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \ // Build sentences, or parts thereof, from right to left. template void GrowBig(const M &m, bool rest = false) { std::vector words; float expect; TEXT_TEST("in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown "); TEXT_TEST("on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown "); TEXT_TEST("on a little more loin also would consider higher to look good"); TEXT_TEST("more loin also would consider higher to look good"); TEXT_TEST("more loin also would consider higher to look"); TEXT_TEST("also would consider higher to look"); TEXT_TEST("also would consider higher"); TEXT_TEST("would consider higher to look"); TEXT_TEST("consider higher to look"); TEXT_TEST("consider higher to"); TEXT_TEST("consider higher"); } template void GrowSmall(const M &m, bool rest = false) { std::vector words; float expect; TEXT_TEST("in biarritz watching considering looking . 
"); TEXT_TEST("in biarritz watching considering looking ."); TEXT_TEST("in biarritz"); } template void AlsoWouldConsiderHigher(const M &m) { ChartState also; { RuleScore score(m, also); score.Terminal(m.GetVocabulary().Index("also")); SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); } ChartState would; { RuleScore score(m, would); score.Terminal(m.GetVocabulary().Index("would")); SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); } ChartState combine_also_would; { RuleScore score(m, combine_also_would); score.NonTerminal(also, -1.687872); score.NonTerminal(would, -1.687872); SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(2, combine_also_would.right.length); ChartState also_would; { RuleScore score(m, also_would); score.Terminal(m.GetVocabulary().Index("also")); score.Terminal(m.GetVocabulary().Index("would")); SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(2, also_would.right.length); ChartState consider; { RuleScore score(m, consider); score.Terminal(m.GetVocabulary().Index("consider")); SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(1, consider.left.length); BOOST_CHECK_EQUAL(1, consider.right.length); BOOST_CHECK(!consider.left.full); ChartState higher; float higher_score; { RuleScore score(m, higher); score.Terminal(m.GetVocabulary().Index("higher")); higher_score = score.Finish(); } SLOPPY_CHECK_CLOSE(-1.509559, higher_score, 0.001); BOOST_CHECK_EQUAL(1, higher.left.length); BOOST_CHECK_EQUAL(1, higher.right.length); BOOST_CHECK(!higher.left.full); VCheck("higher", higher.right.words[0]); SLOPPY_CHECK_CLOSE(-0.30103, higher.right.backoff[0], 0.001); ChartState consider_higher; { RuleScore score(m, consider_higher); score.NonTerminal(consider, -1.687872); score.NonTerminal(higher, higher_score); SLOPPY_CHECK_CLOSE(-1.509559 - 1.687872 - 0.30103, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(2, consider_higher.left.length); BOOST_CHECK(!consider_higher.left.full); ChartState full; { RuleScore score(m, full); score.NonTerminal(combine_also_would, -1.687872 - 2.0); score.NonTerminal(consider_higher, -1.509559 - 1.687872 - 0.30103); SLOPPY_CHECK_CLOSE(-10.6879, score.Finish(), 0.001); } BOOST_CHECK_EQUAL(4, full.right.length); } #define CHECK_SCORE(str, val) \ { \ float got = val; \ std::vector indices; \ LookupVocab(m, str, indices); \ SLOPPY_CHECK_CLOSE(LeftToRight(m, indices), got, 0.001); \ } template void FullGrow(const M &m) { std::vector words; LookupVocab(m, "in biarritz watching considering looking . 
", words); ChartState lexical[7]; float lexical_scores[7]; for (unsigned int i = 0; i < 7; ++i) { RuleScore score(m, lexical[i]); score.Terminal(words[i]); lexical_scores[i] = score.Finish(); } CHECK_SCORE("in", lexical_scores[0]); CHECK_SCORE("biarritz", lexical_scores[1]); CHECK_SCORE("watching", lexical_scores[2]); CHECK_SCORE("", lexical_scores[6]); ChartState l1[4]; float l1_scores[4]; { RuleScore score(m, l1[0]); score.NonTerminal(lexical[0], lexical_scores[0]); score.NonTerminal(lexical[1], lexical_scores[1]); CHECK_SCORE("in biarritz", l1_scores[0] = score.Finish()); } { RuleScore score(m, l1[1]); score.NonTerminal(lexical[2], lexical_scores[2]); score.NonTerminal(lexical[3], lexical_scores[3]); CHECK_SCORE("watching considering", l1_scores[1] = score.Finish()); } { RuleScore score(m, l1[2]); score.NonTerminal(lexical[4], lexical_scores[4]); score.NonTerminal(lexical[5], lexical_scores[5]); CHECK_SCORE("looking .", l1_scores[2] = score.Finish()); } BOOST_CHECK_EQUAL(l1[2].left.length, 1); l1[3] = lexical[6]; l1_scores[3] = lexical_scores[6]; ChartState l2[2]; float l2_scores[2]; { RuleScore score(m, l2[0]); score.NonTerminal(l1[0], l1_scores[0]); score.NonTerminal(l1[1], l1_scores[1]); CHECK_SCORE("in biarritz watching considering", l2_scores[0] = score.Finish()); } { RuleScore score(m, l2[1]); score.NonTerminal(l1[2], l1_scores[2]); score.NonTerminal(l1[3], l1_scores[3]); CHECK_SCORE("looking . ", l2_scores[1] = score.Finish()); } BOOST_CHECK_EQUAL(l2[1].left.length, 1); BOOST_CHECK(l2[1].left.full); ChartState top; { RuleScore score(m, top); score.NonTerminal(l2[0], l2_scores[0]); score.NonTerminal(l2[1], l2_scores[1]); CHECK_SCORE("in biarritz watching considering looking . ", score.Finish()); } } const char *FileLocation() { if (boost::unit_test::framework::master_test_suite().argc < 2) { return "test.arpa"; } return boost::unit_test::framework::master_test_suite().argv[1]; } template void Everything() { Config config; config.messages = NULL; M m(FileLocation(), config); Short(m); Charge(m); GrowBig(m); AlsoWouldConsiderHigher(m); GrowSmall(m); FullGrow(m); } BOOST_AUTO_TEST_CASE(ProbingAll) { Everything(); } BOOST_AUTO_TEST_CASE(TrieAll) { Everything(); } BOOST_AUTO_TEST_CASE(QuantTrieAll) { Everything(); } BOOST_AUTO_TEST_CASE(ArrayQuantTrieAll) { Everything(); } BOOST_AUTO_TEST_CASE(ArrayTrieAll) { Everything(); } BOOST_AUTO_TEST_CASE(RestProbing) { Config config; config.messages = NULL; RestProbingModel m(FileLocation(), config); GrowBig(m, true); } } // namespace } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/lm_exception.cc ================================================ #include "lm/lm_exception.hh" #include #include namespace lm { ConfigException::ConfigException() throw() {} ConfigException::~ConfigException() throw() {} LoadException::LoadException() throw() {} LoadException::~LoadException() throw() {} FormatLoadException::FormatLoadException() throw() {} FormatLoadException::~FormatLoadException() throw() {} VocabLoadException::VocabLoadException() throw() {} VocabLoadException::~VocabLoadException() throw() {} SpecialWordMissingException::SpecialWordMissingException() throw() {} SpecialWordMissingException::~SpecialWordMissingException() throw() {} } // namespace lm ================================================ FILE: src/kenlm/lm/lm_exception.hh ================================================ #ifndef LM_LM_EXCEPTION__ #define LM_LM_EXCEPTION__ // Named to avoid conflict with util/exception.hh. 
#include "util/exception.hh" #include "util/string_piece.hh" #include #include namespace lm { typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction; class ConfigException : public util::Exception { public: ConfigException() throw(); ~ConfigException() throw(); }; class LoadException : public util::Exception { public: virtual ~LoadException() throw(); protected: LoadException() throw(); }; class FormatLoadException : public LoadException { public: FormatLoadException() throw(); ~FormatLoadException() throw(); }; class VocabLoadException : public LoadException { public: virtual ~VocabLoadException() throw(); VocabLoadException() throw(); }; class SpecialWordMissingException : public VocabLoadException { public: explicit SpecialWordMissingException() throw(); ~SpecialWordMissingException() throw(); }; } // namespace lm #endif // LM_LM_EXCEPTION ================================================ FILE: src/kenlm/lm/max_order.hh ================================================ /* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. * If not, this is the default maximum order. * Having this limit means that State can be * (kMaxOrder - 1) * sizeof(float) bytes instead of * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead */ #ifndef KENLM_ORDER_MESSAGE #define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh." #endif ================================================ FILE: src/kenlm/lm/model.cc ================================================ #include "lm/model.hh" #include "lm/blank.hh" #include "lm/lm_exception.hh" #include "lm/search_hashed.hh" #include "lm/search_trie.hh" #include "lm/read_arpa.hh" #include "util/have.hh" #include "util/murmur_hash.hh" #include #include #include #include #include namespace lm { namespace ngram { namespace detail { template const ModelType GenericModel::kModelType = Search::kModelType; template uint64_t GenericModel::Size(const std::vector &counts, const Config &config) { return VocabularyT::Size(counts[0], config) + Search::Size(counts, config); } template void GenericModel::SetupMemory(void *base, const std::vector &counts, const Config &config) { size_t goal_size = util::CheckOverflow(Size(counts, config)); uint8_t *start = static_cast(base); size_t allocated = VocabularyT::Size(counts[0], config); vocab_.SetupMemory(start, allocated, counts[0], config); start += allocated; start = search_.SetupMemory(start, counts, config); if (static_cast(start - static_cast(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast(base)) << " but Size says they should take " << goal_size); } template GenericModel::GenericModel(const char *file, const Config &config) { LoadLM(file, config, *this); // g++ prints warnings unless these are fully initialized. 
State begin_sentence = State(); begin_sentence.length = 1; begin_sentence.words[0] = vocab_.BeginSentence(); typename Search::Node ignored_node; bool ignored_independent_left; uint64_t ignored_extend_left; begin_sentence.backoff[0] = search_.LookupUnigram(begin_sentence.words[0], ignored_node, ignored_independent_left, ignored_extend_left).Backoff(); State null_context = State(); null_context.length = 0; P::Init(begin_sentence, null_context, vocab_, search_.Order()); } namespace { void CheckCounts(const std::vector &counts) { UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". " << KENLM_ORDER_MESSAGE); if (sizeof(uint64_t) > sizeof(std::size_t)) { for (std::vector::const_iterator i = counts.begin(); i != counts.end(); ++i) { UTIL_THROW_IF(*i > static_cast(std::numeric_limits::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines."); } } } } // namespace template void GenericModel::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { CheckCounts(params.counts); SetupMemory(start, params.counts, config); vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); search_.LoadedBinary(); } template void GenericModel::InitializeFromARPA(const char *file, const Config &config) { // Backing file is the ARPA. Steal it so we can make the backing file the mmap output if any. util::FilePiece f(backing_.file.release(), file, config.ProgressMessages()); try { std::vector counts; // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. ReadARPACounts(f, counts); CheckCounts(counts); if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config)); // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs. vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config); if (config.write_mmap) { WriteWordsWrapper wrap(config.enumerate_vocab); vocab_.ConfigureEnumerate(&wrap, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config)); } else { vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); } if (!vocab_.SawUnk()) { assert(config.unknown_missing != THROW_UP); // Default probabilities for unknown. 
search_.UnknownUnigram().backoff = 0.0; search_.UnknownUnigram().prob = config.unknown_missing_logprob; } FinishFile(config, kModelType, kVersion, counts, vocab_.UnkCountChangePadding(), backing_); } catch (util::Exception &e) { e << " Byte: " << f.Offset(); throw; } } template void GenericModel::UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config)); Search::UpdateConfigFromBinary(fd, counts, config); } template FullScoreReturn GenericModel::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { ret.prob += *i; } return ret; } template FullScoreReturn GenericModel::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const { context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); FullScoreReturn ret = ScoreExceptBackoff(context_rbegin, context_rend, new_word, out_state); // Add the backoff weights for n-grams of order start to (context_rend - context_rbegin). unsigned char start = ret.ngram_length; if (context_rend - context_rbegin < static_cast(start)) return ret; bool independent_left; uint64_t extend_left; typename Search::Node node; if (start <= 1) { ret.prob += search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff(); start = 2; } else if (!search_.FastMakeNode(context_rbegin, context_rbegin + start - 1, node)) { return ret; } // i is the order of the backoff we're looking for. unsigned char order_minus_2 = start - 2; for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++order_minus_2) { typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left)); if (!p.Found()) break; ret.prob += p.Backoff(); } return ret; } template void GenericModel::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const { // Generate a state from context. context_rend = std::min(context_rend, context_rbegin + P::Order() - 1); if (context_rend == context_rbegin) { out_state.length = 0; return; } typename Search::Node node; bool independent_left; uint64_t extend_left; out_state.backoff[0] = search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff(); out_state.length = HasExtension(out_state.backoff[0]) ? 
1 : 0; float *backoff_out = out_state.backoff + 1; unsigned char order_minus_2 = 0; for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++order_minus_2) { typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left)); if (!p.Found()) { std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words); return; } *backoff_out = p.Backoff(); if (HasExtension(*backoff_out)) out_state.length = i - context_rbegin + 1; } std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words); } template FullScoreReturn GenericModel::ExtendLeft( const WordIndex *add_rbegin, const WordIndex *add_rend, const float *backoff_in, uint64_t extend_pointer, unsigned char extend_length, float *backoff_out, unsigned char &next_use) const { FullScoreReturn ret; typename Search::Node node; if (extend_length == 1) { typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast(extend_pointer), node, ret.independent_left, ret.extend_left)); ret.rest = ptr.Rest(); ret.prob = ptr.Prob(); assert(!ret.independent_left); } else { typename Search::MiddlePointer ptr(search_.Unpack(extend_pointer, extend_length, node)); ret.rest = ptr.Rest(); ret.prob = ptr.Prob(); ret.extend_left = extend_pointer; // If this function is called, then it does depend on left words. ret.independent_left = false; } float subtract_me = ret.rest; ret.ngram_length = extend_length; next_use = extend_length; ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret); next_use -= extend_length; // Charge backoffs. for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b; ret.prob -= subtract_me; ret.rest -= subtract_me; return ret; } namespace { // Do a paraonoid copy of history, assuming new_word has already been copied // (hence the -1). out_state.length could be zero so I avoided using // std::copy. void CopyRemainingHistory(const WordIndex *from, State &out_state) { WordIndex *out = out_state.words + 1; const WordIndex *in_end = from + static_cast(out_state.length) - 1; for (const WordIndex *in = from; in < in_end; ++in, ++out) *out = *in; } } // namespace /* Ugly optimized function. Produce a score excluding backoff. * The search goes in increasing order of ngram length. * Context goes backward, so context_begin is the word immediately preceeding * new_word. */ template FullScoreReturn GenericModel::ScoreExceptBackoff( const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const { assert(new_word < vocab_.Bound()); FullScoreReturn ret; // ret.ngram_length contains the last known non-blank ngram length. ret.ngram_length = 1; typename Search::Node node; typename Search::UnigramPointer uni(search_.LookupUnigram(new_word, node, ret.independent_left, ret.extend_left)); out_state.backoff[0] = uni.Backoff(); ret.prob = uni.Prob(); ret.rest = uni.Rest(); // This is the length of the context that should be used for continuation to the right. out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0; // We'll write the word anyway since it will probably be used and does no harm being there. 
out_state.words[0] = new_word; if (context_rbegin == context_rend) return ret; ResumeScore(context_rbegin, context_rend, 0, node, out_state.backoff + 1, out_state.length, ret); CopyRemainingHistory(context_rbegin, out_state); return ret; } template void GenericModel::ResumeScore(const WordIndex *hist_iter, const WordIndex *const context_rend, unsigned char order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const { for (; ; ++order_minus_2, ++hist_iter, ++backoff_out) { if (hist_iter == context_rend) return; if (ret.independent_left) return; if (order_minus_2 == P::Order() - 2) break; typename Search::MiddlePointer pointer(search_.LookupMiddle(order_minus_2, *hist_iter, node, ret.independent_left, ret.extend_left)); if (!pointer.Found()) return; *backoff_out = pointer.Backoff(); ret.prob = pointer.Prob(); ret.rest = pointer.Rest(); ret.ngram_length = order_minus_2 + 2; if (HasExtension(*backoff_out)) { next_use = ret.ngram_length; } } ret.independent_left = true; typename Search::LongestPointer longest(search_.LookupLongest(*hist_iter, node)); if (longest.Found()) { ret.prob = longest.Prob(); ret.rest = ret.prob; // There is no blank in longest_. ret.ngram_length = P::Order(); } } template float GenericModel::InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { float ret; typename Search::Node node; if (first_length == 1) { if (pointers_begin >= pointers_end) return 0.0; bool independent_left; uint64_t extend_left; typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast(*pointers_begin), node, independent_left, extend_left)); ret = ptr.Prob() - ptr.Rest(); ++first_length; ++pointers_begin; } else { ret = 0.0; } for (const uint64_t *i = pointers_begin; i < pointers_end; ++i, ++first_length) { typename Search::MiddlePointer ptr(search_.Unpack(*i, first_length, node)); ret += ptr.Prob() - ptr.Rest(); } return ret; } template class GenericModel, ProbingVocabulary>; template class GenericModel, ProbingVocabulary>; template class GenericModel, SortedVocabulary>; template class GenericModel, SortedVocabulary>; template class GenericModel, SortedVocabulary>; template class GenericModel, SortedVocabulary>; } // namespace detail } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/model.hh ================================================ #ifndef LM_MODEL__ #define LM_MODEL__ #include "lm/bhiksha.hh" #include "lm/binary_format.hh" #include "lm/config.hh" #include "lm/facade.hh" #include "lm/quantize.hh" #include "lm/search_hashed.hh" #include "lm/search_trie.hh" #include "lm/state.hh" #include "lm/value.hh" #include "lm/vocab.hh" #include "lm/weights.hh" #include "util/murmur_hash.hh" #include #include #include namespace util { class FilePiece; } namespace lm { namespace ngram { namespace detail { // Should return the same results as SRI. // ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts. template class GenericModel : public base::ModelFacade, State, VocabularyT> { private: typedef base::ModelFacade, State, VocabularyT> P; public: // This is the model type returned by RecognizeBinary. static const ModelType kModelType; static const unsigned int kVersion = Search::kVersion; /* Get the size of memory that will be mapped given ngram counts. This * does not include small non-mapped control structures, such as this class * itself. 
*/ static uint64_t Size(const std::vector &counts, const Config &config = Config()); /* Load the model from a file. It may be an ARPA or binary file. Binary * files must have the format expected by this class or you'll get an * exception. So TrieModel can only load ARPA or binary created by * TrieModel. To classify binary files, call RecognizeBinary in * lm/binary_format.hh. */ explicit GenericModel(const char *file, const Config &config = Config()); /* Score p(new_word | in_state) and incorporate new_word into out_state. * Note that in_state and out_state must be different references: * &in_state != &out_state. */ FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const; /* Slower call without in_state. Try to remember state, but sometimes it * would cost too much memory or your decoder isn't setup properly. * To use this function, make an array of WordIndex containing the context * vocabulary ids in reverse order. Then, pass the bounds of the array: * [context_rbegin, context_rend). The new_word is not part of the context * array unless you intend to repeat words. */ FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; /* Get the state for a context. Don't use this if you can avoid it. Use * BeginSentenceState or EmptyContextState and extend from those. If * you're only going to use this state to call FullScore once, use * FullScoreForgotState. * To use this function, make an array of WordIndex containing the context * vocabulary ids in reverse order. Then, pass the bounds of the array: * [context_rbegin, context_rend). */ void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const; /* More efficient version of FullScore where a partial n-gram has already * been scored. * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE. */ FullScoreReturn ExtendLeft( // Additional context in reverse order. This will update add_rend to const WordIndex *add_rbegin, const WordIndex *add_rend, // Backoff weights to use. const float *backoff_in, // extend_left returned by a previous query. uint64_t extend_pointer, // Length of n-gram that the pointer corresponds to. unsigned char extend_length, // Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)] float *backoff_out, // Amount of additional content that should be considered by the next call. unsigned char &next_use) const; /* Return probabilities minus rest costs for an array of pointers. The * first length should be the length of the n-gram to which pointers_begin * points. */ float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const { // Compiler should optimize this if away. return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0; } private: friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel &to); static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config); FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const; // Score bigrams and above. Do not include backoff. 
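// Illustrative sketch (not part of the KenLM sources): the two calling styles
// documented above.  FullScore threads a State through the query, while
// FullScoreForgotState and GetState take the context as an array of vocabulary
// ids in *reverse* order (most recent word first).  Function names are
// hypothetical; the words occur in the test data used by model_test.cc.

#include "lm/model.hh"

// p(looking on a little | <s>) accumulated by threading states left to right.
inline float ScoreWithState(const lm::ngram::Model &model) {
  lm::ngram::State state(model.BeginSentenceState()), out;
  float total = 0.0;
  const char *words[] = {"looking", "on", "a", "little"};
  for (unsigned i = 0; i < sizeof(words) / sizeof(*words); ++i) {
    // in_state and out_state must be different objects.
    total += model.FullScore(state, model.GetVocabulary().Index(words[i]), out).prob;
    state = out;
  }
  return total;
}

// p(little | on a) without keeping a State around: the context array is
// right-to-left, i.e. {a, on}.
inline float ScoreForgettingState(const lm::ngram::Model &model) {
  lm::WordIndex context[2];
  context[0] = model.GetVocabulary().Index("a");
  context[1] = model.GetVocabulary().Index("on");
  lm::ngram::State out;
  return model.FullScoreForgotState(context, context + 2,
                                    model.GetVocabulary().Index("little"), out).prob;
}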
void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const; // Appears after Size in the cc file. void SetupMemory(void *start, const std::vector &counts, const Config &config); void InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd); void InitializeFromARPA(const char *file, const Config &config); float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const; Backing &MutableBacking() { return backing_; } Backing backing_; VocabularyT vocab_; Search search_; }; } // namespace detail // Instead of typedef, inherit. This allows the Model etc to be forward declared. // Oh the joys of C and C++. #define LM_COMMA() , #define LM_NAME_MODEL(name, from)\ class name : public from {\ public:\ name(const char *file, const Config &config = Config()) : from(file, config) {}\ }; LM_NAME_MODEL(ProbingModel, detail::GenericModel LM_COMMA() ProbingVocabulary>); LM_NAME_MODEL(RestProbingModel, detail::GenericModel LM_COMMA() ProbingVocabulary>); LM_NAME_MODEL(TrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); LM_NAME_MODEL(QuantTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel LM_COMMA() SortedVocabulary>); // Default implementation. No real reason for it to be the default. typedef ::lm::ngram::ProbingVocabulary Vocabulary; typedef ProbingModel Model; } // namespace ngram } // namespace lm #endif // LM_MODEL__ ================================================ FILE: src/kenlm/lm/model_test.cc ================================================ #include "lm/model.hh" #include #include #define BOOST_TEST_MODULE ModelTest #include #include // Apparently some Boost versions use templates and are pretty strict about types matching. #define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast(ref), static_cast(value), static_cast(tol)); namespace lm { namespace ngram { std::ostream &operator<<(std::ostream &o, const State &state) { o << "State length " << static_cast(state.length) << ':'; for (const WordIndex *i = state.words; i < state.words + state.length; ++i) { o << ' ' << *i; } return o; } namespace { // Stupid bjam reverses the command line arguments randomly. const char *TestLocation() { if (boost::unit_test::framework::master_test_suite().argc < 3) { return "test.arpa"; } char **argv = boost::unit_test::framework::master_test_suite().argv; return argv[strstr(argv[1], "nounk") ? 2 : 1]; } const char *TestNoUnkLocation() { if (boost::unit_test::framework::master_test_suite().argc < 3) { return "test_nounk.arpa"; } char **argv = boost::unit_test::framework::master_test_suite().argv; return argv[strstr(argv[1], "nounk") ? 
1 : 2]; } template State GetState(const Model &model, const char *word, const State &in) { WordIndex context[in.length + 1]; context[0] = model.GetVocabulary().Index(word); std::copy(in.words, in.words + in.length, context + 1); State ret; model.GetState(context, context + in.length + 1, ret); return ret; } #define StartTest(word, ngram, score, indep_left) \ ret = model.FullScore( \ state, \ model.GetVocabulary().Index(word), \ out);\ SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ BOOST_CHECK_EQUAL(static_cast(ngram), ret.ngram_length); \ BOOST_CHECK_GE(std::min(ngram, 5 - 1), out.length); \ BOOST_CHECK_EQUAL(indep_left, ret.independent_left); \ BOOST_CHECK_EQUAL(out, GetState(model, word, state)); #define AppendTest(word, ngram, score, indep_left) \ StartTest(word, ngram, score, indep_left) \ state = out; template void Starters(const M &model) { FullScoreReturn ret; Model::State state(model.BeginSentenceState()); Model::State out; StartTest("looking", 2, -0.4846522, true); // , probability plus backoff StartTest(",", 1, -1.383514 + -0.4149733, true); // probability plus backoff StartTest("this_is_not_found", 1, -1.995635 + -0.4149733, true); } template void Continuation(const M &model) { FullScoreReturn ret; Model::State state(model.BeginSentenceState()); Model::State out; AppendTest("looking", 2, -0.484652, true); AppendTest("on", 3, -0.348837, true); AppendTest("a", 4, -0.0155266, true); AppendTest("little", 5, -0.00306122, true); State preserve = state; AppendTest("the", 1, -4.04005, true); AppendTest("biarritz", 1, -1.9889, true); AppendTest("not_found", 1, -2.29666, true); AppendTest("more", 1, -1.20632 - 20.0, true); AppendTest(".", 2, -0.51363, true); AppendTest("", 3, -0.0191651, true); BOOST_CHECK_EQUAL(0, state.length); state = preserve; AppendTest("more", 5, -0.00181395, true); BOOST_CHECK_EQUAL(4, state.length); AppendTest("loin", 5, -0.0432557, true); BOOST_CHECK_EQUAL(1, state.length); } template void Blanks(const M &model) { FullScoreReturn ret; State state(model.NullContextState()); State out; AppendTest("also", 1, -1.687872, false); AppendTest("would", 2, -2, true); AppendTest("consider", 3, -3, true); State preserve = state; AppendTest("higher", 4, -4, true); AppendTest("looking", 5, -5, true); BOOST_CHECK_EQUAL(1, state.length); state = preserve; // also would consider not_found AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true); state = model.NullContextState(); // higher looking is a blank. 
AppendTest("higher", 1, -1.509559, false); AppendTest("looking", 2, -1.285941 - 0.30103, false); State higher_looking = state; BOOST_CHECK_EQUAL(1, state.length); AppendTest("not_found", 1, -1.995635 - 0.4771212, true); state = higher_looking; // higher looking consider AppendTest("consider", 1, -1.687872 - 0.4771212, true); state = model.NullContextState(); AppendTest("would", 1, -1.687872, false); BOOST_CHECK_EQUAL(1, state.length); AppendTest("consider", 2, -1.687872 -0.30103, false); BOOST_CHECK_EQUAL(2, state.length); AppendTest("higher", 3, -1.509559 - 0.30103, false); BOOST_CHECK_EQUAL(3, state.length); AppendTest("looking", 4, -1.285941 - 0.30103, false); } template void Unknowns(const M &model) { FullScoreReturn ret; State state(model.NullContextState()); State out; AppendTest("not_found", 1, -1.995635, false); State preserve = state; AppendTest("not_found2", 2, -15.0, true); AppendTest("not_found3", 2, -15.0 - 2.0, true); state = preserve; AppendTest("however", 2, -4, true); AppendTest("not_found3", 3, -6, true); } template void MinimalState(const M &model) { FullScoreReturn ret; State state(model.NullContextState()); State out; AppendTest("baz", 1, -6.535897, true); BOOST_CHECK_EQUAL(0, state.length); state = model.NullContextState(); AppendTest("foo", 1, -3.141592, true); BOOST_CHECK_EQUAL(1, state.length); AppendTest("bar", 2, -6.0, true); // Has to include the backoff weight. BOOST_CHECK_EQUAL(1, state.length); AppendTest("bar", 1, -2.718281 + 3.0, true); BOOST_CHECK_EQUAL(1, state.length); state = model.NullContextState(); AppendTest("to", 1, -1.687872, false); AppendTest("look", 2, -0.2922095, true); BOOST_CHECK_EQUAL(2, state.length); AppendTest("good", 3, -7, true); } template void ExtendLeftTest(const M &model) { State right; FullScoreReturn little(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("little"), right)); const float kLittleProb = -1.285941; SLOPPY_CHECK_CLOSE(kLittleProb, little.prob, 0.001); unsigned char next_use; float backoff_out[4]; FullScoreReturn extend_none(model.ExtendLeft(NULL, NULL, NULL, little.extend_left, 1, NULL, next_use)); BOOST_CHECK_EQUAL(0, next_use); BOOST_CHECK_EQUAL(little.extend_left, extend_none.extend_left); SLOPPY_CHECK_CLOSE(little.prob - little.rest, extend_none.prob, 0.001); BOOST_CHECK_EQUAL(1, extend_none.ngram_length); const WordIndex a = model.GetVocabulary().Index("a"); float backoff_in = 3.14; // a little FullScoreReturn extend_a(model.ExtendLeft(&a, &a + 1, &backoff_in, little.extend_left, 1, backoff_out, next_use)); BOOST_CHECK_EQUAL(1, next_use); SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001); SLOPPY_CHECK_CLOSE(-0.09132547 - little.rest, extend_a.prob, 0.001); BOOST_CHECK_EQUAL(2, extend_a.ngram_length); BOOST_CHECK(!extend_a.independent_left); const WordIndex on = model.GetVocabulary().Index("on"); FullScoreReturn extend_on(model.ExtendLeft(&on, &on + 1, &backoff_in, extend_a.extend_left, 2, backoff_out, next_use)); BOOST_CHECK_EQUAL(1, next_use); SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[0], 0.001); SLOPPY_CHECK_CLOSE(-0.0283603 - (extend_a.rest + little.rest), extend_on.prob, 0.001); BOOST_CHECK_EQUAL(3, extend_on.ngram_length); BOOST_CHECK(!extend_on.independent_left); const WordIndex both[2] = {a, on}; float backoff_in_arr[4]; FullScoreReturn extend_both(model.ExtendLeft(both, both + 2, backoff_in_arr, little.extend_left, 1, backoff_out, next_use)); BOOST_CHECK_EQUAL(2, next_use); SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001); SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[1], 0.001); 
SLOPPY_CHECK_CLOSE(-0.0283603 - little.rest, extend_both.prob, 0.001); BOOST_CHECK_EQUAL(3, extend_both.ngram_length); BOOST_CHECK(!extend_both.independent_left); BOOST_CHECK_EQUAL(extend_on.extend_left, extend_both.extend_left); } #define StatelessTest(word, provide, ngram, score) \ ret = model.FullScoreForgotState(indices + num_words - word, indices + num_words - word + provide, indices[num_words - word - 1], state); \ SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ BOOST_CHECK_EQUAL(static_cast(ngram), ret.ngram_length); \ model.GetState(indices + num_words - word, indices + num_words - word + provide, before); \ ret = model.FullScore(before, indices[num_words - word - 1], out); \ BOOST_CHECK(state == out); \ SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \ BOOST_CHECK_EQUAL(static_cast(ngram), ret.ngram_length); template void Stateless(const M &model) { const char *words[] = {"", "looking", "on", "a", "little", "the", "biarritz", "not_found", "more", ".", ""}; const size_t num_words = sizeof(words) / sizeof(const char*); // Silience "array subscript is above array bounds" when extracting end pointer. WordIndex indices[num_words + 1]; for (unsigned int i = 0; i < num_words; ++i) { indices[num_words - 1 - i] = model.GetVocabulary().Index(words[i]); } FullScoreReturn ret; State state, out, before; ret = model.FullScoreForgotState(indices + num_words - 1, indices + num_words, indices[num_words - 2], state); SLOPPY_CHECK_CLOSE(-0.484652, ret.prob, 0.001); StatelessTest(1, 1, 2, -0.484652); // looking StatelessTest(1, 2, 2, -0.484652); // on AppendTest("on", 3, -0.348837, true); StatelessTest(2, 3, 3, -0.348837); StatelessTest(2, 2, 3, -0.348837); StatelessTest(2, 1, 2, -0.4638903); // a StatelessTest(3, 4, 4, -0.0155266); // little AppendTest("little", 5, -0.00306122, true); StatelessTest(4, 5, 5, -0.00306122); // the AppendTest("the", 1, -4.04005, true); StatelessTest(5, 5, 1, -4.04005); // No context of the. 
StatelessTest(5, 0, 1, -1.687872); // biarritz StatelessTest(6, 1, 1, -1.9889); // not found StatelessTest(7, 1, 1, -2.29666); StatelessTest(7, 0, 1, -1.995635); WordIndex unk[1]; unk[0] = 0; model.GetState(unk, unk + 1, state); BOOST_CHECK_EQUAL(1, state.length); BOOST_CHECK_EQUAL(static_cast(0), state.words[0]); } template void NoUnkCheck(const M &model) { WordIndex unk_index = 0; State state; FullScoreReturn ret = model.FullScoreForgotState(&unk_index, &unk_index + 1, unk_index, state); SLOPPY_CHECK_CLOSE(-100.0, ret.prob, 0.001); } template void Everything(const M &m) { Starters(m); Continuation(m); Blanks(m); Unknowns(m); MinimalState(m); ExtendLeftTest(m); Stateless(m); } class ExpectEnumerateVocab : public EnumerateVocab { public: ExpectEnumerateVocab() {} void Add(WordIndex index, const StringPiece &str) { BOOST_CHECK_EQUAL(seen.size(), index); seen.push_back(std::string(str.data(), str.length())); } void Check(const base::Vocabulary &vocab) { BOOST_CHECK_EQUAL(37ULL, seen.size()); BOOST_REQUIRE(!seen.empty()); BOOST_CHECK_EQUAL("", seen[0]); for (WordIndex i = 0; i < seen.size(); ++i) { BOOST_CHECK_EQUAL(i, vocab.Index(seen[i])); } } void Clear() { seen.clear(); } std::vector seen; }; template void LoadingTest() { Config config; config.arpa_complain = Config::NONE; config.messages = NULL; config.probing_multiplier = 2.0; { ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; ModelT m(TestLocation(), config); enumerate.Check(m.GetVocabulary()); BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); Everything(m); } { ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; ModelT m(TestNoUnkLocation(), config); enumerate.Check(m.GetVocabulary()); BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); NoUnkCheck(m); } } BOOST_AUTO_TEST_CASE(probing) { LoadingTest(); } BOOST_AUTO_TEST_CASE(trie) { LoadingTest(); } BOOST_AUTO_TEST_CASE(quant_trie) { LoadingTest(); } BOOST_AUTO_TEST_CASE(bhiksha_trie) { LoadingTest(); } BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) { LoadingTest(); } template void BinaryTest() { Config config; config.write_mmap = "test.binary"; config.messages = NULL; ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; { ModelT copy_model(TestLocation(), config); enumerate.Check(copy_model.GetVocabulary()); enumerate.Clear(); Everything(copy_model); } config.write_mmap = NULL; ModelType type; BOOST_REQUIRE(RecognizeBinary("test.binary", type)); BOOST_CHECK_EQUAL(ModelT::kModelType, type); { ModelT binary("test.binary", config); enumerate.Check(binary.GetVocabulary()); Everything(binary); } unlink("test.binary"); // Now test without . 
config.write_mmap = "test_nounk.binary"; config.messages = NULL; enumerate.Clear(); { ModelT copy_model(TestNoUnkLocation(), config); enumerate.Check(copy_model.GetVocabulary()); enumerate.Clear(); NoUnkCheck(copy_model); } config.write_mmap = NULL; { ModelT binary(TestNoUnkLocation(), config); enumerate.Check(binary.GetVocabulary()); NoUnkCheck(binary); } unlink("test_nounk.binary"); } BOOST_AUTO_TEST_CASE(write_and_read_probing) { BinaryTest(); } BOOST_AUTO_TEST_CASE(write_and_read_rest_probing) { BinaryTest(); } BOOST_AUTO_TEST_CASE(write_and_read_trie) { BinaryTest(); } BOOST_AUTO_TEST_CASE(write_and_read_quant_trie) { BinaryTest(); } BOOST_AUTO_TEST_CASE(write_and_read_array_trie) { BinaryTest(); } BOOST_AUTO_TEST_CASE(write_and_read_quant_array_trie) { BinaryTest(); } BOOST_AUTO_TEST_CASE(rest_max) { Config config; config.arpa_complain = Config::NONE; config.messages = NULL; RestProbingModel model(TestLocation(), config); State state, out; FullScoreReturn ret(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("."), state)); SLOPPY_CHECK_CLOSE(-0.2705918, ret.rest, 0.001); SLOPPY_CHECK_CLOSE(-0.01916512, model.FullScore(state, model.GetVocabulary().EndSentence(), out).rest, 0.001); } } // namespace } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/model_type.hh ================================================ #ifndef LM_MODEL_TYPE__ #define LM_MODEL_TYPE__ namespace lm { namespace ngram { /* Not the best numbering system, but it grew this way for historical reasons * and I want to preserve existing binary files. */ typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType; // Historical names. const ModelType HASH_PROBING = PROBING; const ModelType TRIE_SORTED = TRIE; const ModelType QUANT_TRIE_SORTED = QUANT_TRIE; const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE; const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE; const static ModelType kQuantAdd = static_cast(QUANT_TRIE - TRIE); const static ModelType kArrayAdd = static_cast(ARRAY_TRIE - TRIE); } // namespace ngram } // namespace lm #endif // LM_MODEL_TYPE__ ================================================ FILE: src/kenlm/lm/ngram_query.hh ================================================ #ifndef LM_NGRAM_QUERY__ #define LM_NGRAM_QUERY__ #include "lm/enumerate_vocab.hh" #include "lm/model.hh" #include "util/usage.hh" #include #include #include #include #include namespace lm { namespace ngram { template void Query(const Model &model, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) { std::cerr << "Loading statistics:\n"; util::PrintUsage(std::cerr); typename Model::State state, out; lm::FullScoreReturn ret; std::string word; while (in_stream) { state = sentence_context ? 
model.BeginSentenceState() : model.NullContextState(); float total = 0.0; bool got = false; unsigned int oov = 0; while (in_stream >> word) { got = true; lm::WordIndex vocab = model.GetVocabulary().Index(word); if (vocab == 0) ++oov; ret = model.FullScore(state, vocab, out); total += ret.prob; out_stream << word << '=' << vocab << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; state = out; char c; while (true) { c = in_stream.get(); if (!in_stream) break; if (c == '\n') break; if (!isspace(c)) { in_stream.unget(); break; } } if (c == '\n') break; } if (!got && !in_stream) break; if (sentence_context) { ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out); total += ret.prob; out_stream << "=" << model.GetVocabulary().EndSentence() << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; } out_stream << "Total: " << total << " OOV: " << oov << '\n'; } std::cerr << "After queries:\n"; util::PrintUsage(std::cerr); } template void Query(const char *file, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) { Config config; M model(file, config); Query(model, sentence_context, in_stream, out_stream); } } // namespace ngram } // namespace lm #endif // LM_NGRAM_QUERY__ ================================================ FILE: src/kenlm/lm/partial.hh ================================================ #ifndef LM_PARTIAL__ #define LM_PARTIAL__ #include "lm/return.hh" #include "lm/state.hh" #include #include namespace lm { namespace ngram { struct ExtendReturn { float adjust; bool make_full; unsigned char next_use; }; template ExtendReturn ExtendLoop( const Model &model, unsigned char seen, const WordIndex *add_rbegin, const WordIndex *add_rend, const float *backoff_start, const uint64_t *pointers, const uint64_t *pointers_end, uint64_t *&pointers_write, float *backoff_write) { unsigned char add_length = add_rend - add_rbegin; float backoff_buf[2][KENLM_MAX_ORDER - 1]; float *backoff_in = backoff_buf[0], *backoff_out = backoff_buf[1]; std::copy(backoff_start, backoff_start + add_length, backoff_in); ExtendReturn value; value.make_full = false; value.adjust = 0.0; value.next_use = add_length; unsigned char i = 0; unsigned char length = pointers_end - pointers; // pointers_write is NULL means that the existing left state is full, so we should use completed probabilities. if (pointers_write) { // Using full context, writing to new left state. for (; i < length; ++i) { FullScoreReturn ret(model.ExtendLeft( add_rbegin, add_rbegin + value.next_use, backoff_in, pointers[i], i + seen + 1, backoff_out, value.next_use)); std::swap(backoff_in, backoff_out); if (ret.independent_left) { value.adjust += ret.prob; value.make_full = true; ++i; break; } value.adjust += ret.rest; *pointers_write++ = ret.extend_left; if (value.next_use != add_length) { value.make_full = true; ++i; break; } } } // Using some of the new context. for (; i < length && value.next_use; ++i) { FullScoreReturn ret(model.ExtendLeft( add_rbegin, add_rbegin + value.next_use, backoff_in, pointers[i], i + seen + 1, backoff_out, value.next_use)); std::swap(backoff_in, backoff_out); value.adjust += ret.prob; } float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1); // Using none of the new context. 
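// Illustrative sketch (not part of the KenLM sources): a minimal driver for
// the Query template defined in ngram_query.hh above.  It scores whitespace-
// tokenized sentences from stdin with sentence context and prints the
// per-word breakdown plus totals.  The probing model type is assumed here for
// brevity; a real tool would dispatch on RecognizeBinary.

#include "lm/ngram_query.hh"

#include <iostream>

int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " model_file" << std::endl;
    return 1;
  }
  lm::ngram::Query<lm::ngram::ProbingModel>(argv[1], true /* sentence_context */,
                                            std::cin, std::cout);
  return 0;
}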
value.adjust += unrest; std::copy(backoff_in, backoff_in + value.next_use, backoff_write); return value; } template float RevealBefore(const Model &model, const Right &reveal, const unsigned char seen, bool reveal_full, Left &left, Right &right) { assert(seen < reveal.length || reveal_full); uint64_t *pointers_write = reveal_full ? NULL : left.pointers; float backoff_buffer[KENLM_MAX_ORDER - 1]; ExtendReturn value(ExtendLoop( model, seen, reveal.words + seen, reveal.words + reveal.length, reveal.backoff + seen, left.pointers, left.pointers + left.length, pointers_write, left.full ? backoff_buffer : (right.backoff + right.length))); if (reveal_full) { left.length = 0; value.make_full = true; } else { left.length = pointers_write - left.pointers; value.make_full |= (left.length == model.Order() - 1); } if (left.full) { for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; } else { // If left wasn't full when it came in, put words into right state. std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length); right.length += value.next_use; left.full = value.make_full || (right.length == model.Order() - 1); } return value.adjust; } template float RevealAfter(const Model &model, Left &left, Right &right, const Left &reveal, unsigned char seen) { assert(seen < reveal.length || reveal.full); uint64_t *pointers_write = left.full ? NULL : (left.pointers + left.length); ExtendReturn value(ExtendLoop( model, seen, right.words, right.words + right.length, right.backoff, reveal.pointers + seen, reveal.pointers + reveal.length, pointers_write, right.backoff)); if (reveal.full) { for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += right.backoff[i]; right.length = 0; value.make_full = true; } else { right.length = value.next_use; value.make_full |= (right.length == model.Order() - 1); } if (!left.full) { left.length = pointers_write - left.pointers; left.full = value.make_full || (left.length == model.Order() - 1); } return value.adjust; } template float Subsume(const Model &model, Left &first_left, const Right &first_right, const Left &second_left, Right &second_right, const unsigned int between_length) { assert(first_right.length < KENLM_MAX_ORDER); assert(second_left.length < KENLM_MAX_ORDER); assert(between_length < KENLM_MAX_ORDER - 1); uint64_t *pointers_write = first_left.full ? NULL : (first_left.pointers + first_left.length); float backoff_buffer[KENLM_MAX_ORDER - 1]; ExtendReturn value(ExtendLoop( model, between_length, first_right.words, first_right.words + first_right.length, first_right.backoff, second_left.pointers, second_left.pointers + second_left.length, pointers_write, second_left.full ? 
backoff_buffer : (second_right.backoff + second_right.length))); if (second_left.full) { for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i]; } else { std::copy(first_right.words, first_right.words + value.next_use, second_right.words + second_right.length); second_right.length += value.next_use; value.make_full |= (second_right.length == model.Order() - 1); } if (!first_left.full) { first_left.length = pointers_write - first_left.pointers; first_left.full = value.make_full || second_left.full || (first_left.length == model.Order() - 1); } assert(first_left.length < KENLM_MAX_ORDER); assert(second_right.length < KENLM_MAX_ORDER); return value.adjust; } } // namespace ngram } // namespace lm #endif // LM_PARTIAL__ ================================================ FILE: src/kenlm/lm/partial_test.cc ================================================ #include "lm/partial.hh" #include "lm/left.hh" #include "lm/model.hh" #include "util/tokenize_piece.hh" #define BOOST_TEST_MODULE PartialTest #include #include namespace lm { namespace ngram { namespace { const char *TestLocation() { if (boost::unit_test::framework::master_test_suite().argc < 2) { return "test.arpa"; } return boost::unit_test::framework::master_test_suite().argv[1]; } Config SilentConfig() { Config config; config.arpa_complain = Config::NONE; config.messages = NULL; return config; } struct ModelFixture { ModelFixture() : m(TestLocation(), SilentConfig()) {} RestProbingModel m; }; BOOST_FIXTURE_TEST_SUITE(suite, ModelFixture) BOOST_AUTO_TEST_CASE(SimpleBefore) { Left left; left.full = false; left.length = 0; Right right; right.length = 0; Right reveal; reveal.length = 1; WordIndex period = m.GetVocabulary().Index("."); reveal.words[0] = period; reveal.backoff[0] = -0.845098; BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 0, false, left, right), 0.001); BOOST_CHECK_EQUAL(0, left.length); BOOST_CHECK(!left.full); BOOST_CHECK_EQUAL(1, right.length); BOOST_CHECK_EQUAL(period, right.words[0]); BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001); WordIndex more = m.GetVocabulary().Index("more"); reveal.words[1] = more; reveal.backoff[1] = -0.4771212; reveal.length = 2; BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 1, false, left, right), 0.001); BOOST_CHECK_EQUAL(0, left.length); BOOST_CHECK(!left.full); BOOST_CHECK_EQUAL(2, right.length); BOOST_CHECK_EQUAL(period, right.words[0]); BOOST_CHECK_EQUAL(more, right.words[1]); BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001); BOOST_CHECK_CLOSE(-0.4771212, right.backoff[1], 0.001); } BOOST_AUTO_TEST_CASE(AlsoWouldConsider) { WordIndex would = m.GetVocabulary().Index("would"); WordIndex consider = m.GetVocabulary().Index("consider"); ChartState current; current.left.length = 1; current.left.pointers[0] = would; current.left.full = false; current.right.length = 1; current.right.words[0] = would; current.right.backoff[0] = -0.30103; Left after; after.full = false; after.length = 1; after.pointers[0] = consider; // adjustment for would consider BOOST_CHECK_CLOSE(-1.687872 - -0.2922095 - 0.30103, RevealAfter(m, current.left, current.right, after, 0), 0.001); BOOST_CHECK_EQUAL(2, current.left.length); BOOST_CHECK_EQUAL(would, current.left.pointers[0]); BOOST_CHECK_EQUAL(false, current.left.full); WordIndex also = m.GetVocabulary().Index("also"); Right before; before.length = 1; before.words[0] = also; before.backoff[0] = -0.30103; // r(would) = -0.2922095 [i would], r(would -> consider) = -1.988902 [b(would) + p(consider)] // p(also -> would) = -2, p(also would 
-> consider) = -3 BOOST_CHECK_CLOSE(-2 + 0.2922095 -3 + 1.988902, RevealBefore(m, before, 0, false, current.left, current.right), 0.001); BOOST_CHECK_EQUAL(0, current.left.length); BOOST_CHECK(current.left.full); BOOST_CHECK_EQUAL(2, current.right.length); BOOST_CHECK_EQUAL(would, current.right.words[0]); BOOST_CHECK_EQUAL(also, current.right.words[1]); } BOOST_AUTO_TEST_CASE(EndSentence) { WordIndex loin = m.GetVocabulary().Index("loin"); WordIndex period = m.GetVocabulary().Index("."); WordIndex eos = m.GetVocabulary().EndSentence(); ChartState between; between.left.length = 1; between.left.pointers[0] = eos; between.left.full = true; between.right.length = 0; Right before; before.words[0] = period; before.words[1] = loin; before.backoff[0] = -0.845098; before.backoff[1] = 0.0; before.length = 1; BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001); BOOST_CHECK_EQUAL(0, between.left.length); } float ScoreFragment(const RestProbingModel &model, unsigned int *begin, unsigned int *end, ChartState &out) { RuleScore scorer(model, out); for (unsigned int *i = begin; i < end; ++i) { scorer.Terminal(*i); } return scorer.Finish(); } void CheckAdjustment(const RestProbingModel &model, float expect, const Right &before_in, bool before_full, ChartState between, const Left &after_in) { Right before(before_in); Left after(after_in); after.full = false; float got = 0.0; for (unsigned int i = 1; i < 5; ++i) { if (before_in.length >= i) { before.length = i; got += RevealBefore(model, before, i - 1, false, between.left, between.right); } if (after_in.length >= i) { after.length = i; got += RevealAfter(model, between.left, between.right, after, i - 1); } } if (after_in.full) { after.full = true; got += RevealAfter(model, between.left, between.right, after, after.length); } if (before_full) { got += RevealBefore(model, before, before.length, true, between.left, between.right); } // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this. BOOST_CHECK(fabs(expect - got) < 0.001); } void FullDivide(const RestProbingModel &model, StringPiece str) { std::vector indices; for (util::TokenIter i(str, ' '); i; ++i) { indices.push_back(model.GetVocabulary().Index(*i)); } ChartState full_state; float full = ScoreFragment(model, &indices.front(), &indices.back() + 1, full_state); ChartState before_state; before_state.left.full = false; RuleScore before_scorer(model, before_state); float before_score = 0.0; for (unsigned int before = 0; before < indices.size(); ++before) { for (unsigned int after = before; after <= indices.size(); ++after) { ChartState after_state, between_state; float after_score = ScoreFragment(model, &indices.front() + after, &indices.front() + indices.size(), after_state); float between_score = ScoreFragment(model, &indices.front() + before, &indices.front() + after, between_state); CheckAdjustment(model, full - before_score - after_score - between_score, before_state.right, before_state.left.full, between_state, after_state.left); } before_scorer.Terminal(indices[before]); before_score = before_scorer.Finish(); } } BOOST_AUTO_TEST_CASE(Strings) { FullDivide(m, "also would consider"); FullDivide(m, "looking on a little more loin . "); FullDivide(m, "in biarritz watching considering looking . 
on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown "); } BOOST_AUTO_TEST_SUITE_END() } // namespace } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/quantize.cc ================================================ /* Quantize into bins of equal size as described in * M. Federico and N. Bertoldi. 2006. How many bits are needed * to store probabilities for phrase-based translation? In Proc. * of the Workshop on Statistical Machine Translation, pages * 94–101, New York City, June. Association for Computa- * tional Linguistics. */ #include "lm/quantize.hh" #include "lm/binary_format.hh" #include "lm/lm_exception.hh" #include "util/file.hh" #include #include namespace lm { namespace ngram { namespace { void MakeBins(std::vector &values, float *centers, uint32_t bins) { std::sort(values.begin(), values.end()); std::vector::const_iterator start = values.begin(), finish; for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) { finish = values.begin() + ((values.size() * static_cast(i + 1)) / bins); if (finish == start) { // zero length bucket. *centers = i ? *(centers - 1) : -std::numeric_limits::infinity(); } else { *centers = std::accumulate(start, finish, 0.0) / static_cast(finish - start); } } } const char kSeparatelyQuantizeVersion = 2; } // namespace void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector &/*counts*/, Config &config) { char version; util::ReadOrThrow(fd, &version, 1); util::ReadOrThrow(fd, &config.prob_bits, 1); util::ReadOrThrow(fd, &config.backoff_bits, 1); if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion); util::AdvanceOrThrow(fd, -3); } void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) { prob_bits_ = config.prob_bits; backoff_bits_ = config.backoff_bits; // We need the reserved values. if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero"); if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero"); if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast(config.prob_bits) << " bits."); if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast(config.backoff_bits) << " bits."); // Reserve 8 byte header for bit counts. 
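  // Resulting layout: 8-byte header (version byte, prob_bits, backoff_bits, padding),
  // then for each middle order a table of 2^prob_bits probability centers followed by
  // 2^backoff_bits backoff centers, and finally 2^prob_bits centers for the longest
  // order, which stores no backoff.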
actual_base_ = static_cast(base); float *start = reinterpret_cast(actual_base_ + 8); for (unsigned char i = 0; i < order - 2; ++i) { tables_[i][0] = Bins(prob_bits_, start); start += (1ULL << prob_bits_); tables_[i][1] = Bins(backoff_bits_, start); start += (1ULL << backoff_bits_); } longest_ = tables_[order - 2][0] = Bins(prob_bits_, start); } void SeparatelyQuantize::Train(uint8_t order, std::vector &prob, std::vector &backoff) { TrainProb(order, prob); // Backoff float *centers = tables_[order - 2][1].Populate(); *(centers++) = kNoExtensionBackoff; *(centers++) = kExtensionBackoff; MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2); } void SeparatelyQuantize::TrainProb(uint8_t order, std::vector &prob) { float *centers = tables_[order - 2][0].Populate(); MakeBins(prob, centers, (1ULL << prob_bits_)); } void SeparatelyQuantize::FinishedLoading(const Config &config) { uint8_t *actual_base = actual_base_; *(actual_base++) = kSeparatelyQuantizeVersion; // version *(actual_base++) = config.prob_bits; *(actual_base++) = config.backoff_bits; } } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/quantize.hh ================================================ #ifndef LM_QUANTIZE_H__ #define LM_QUANTIZE_H__ #include "lm/blank.hh" #include "lm/config.hh" #include "lm/max_order.hh" #include "lm/model_type.hh" #include "util/bit_packing.hh" #include #include #include #include namespace lm { namespace ngram { struct Config; /* Store values directly and don't quantize. */ class DontQuantize { public: static const ModelType kModelTypeAdd = static_cast(0); static void UpdateConfigFromBinary(int, const std::vector &, Config &) {} static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } static uint8_t MiddleBits(const Config &/*config*/) { return 63; } static uint8_t LongestBits(const Config &/*config*/) { return 31; } class MiddlePointer { public: MiddlePointer(const DontQuantize & /*quant*/, unsigned char /*order_minus_2*/, util::BitAddress address) : address_(address) {} MiddlePointer() : address_(NULL, 0) {} bool Found() const { return address_.base != NULL; } float Prob() const { return util::ReadNonPositiveFloat31(address_.base, address_.offset); } float Backoff() const { return util::ReadFloat32(address_.base, address_.offset + 31); } float Rest() const { return Prob(); } void Write(float prob, float backoff) { util::WriteNonPositiveFloat31(address_.base, address_.offset, prob); util::WriteFloat32(address_.base, address_.offset + 31, backoff); } private: util::BitAddress address_; }; class LongestPointer { public: explicit LongestPointer(const DontQuantize &/*quant*/, util::BitAddress address) : address_(address) {} LongestPointer() : address_(NULL, 0) {} bool Found() const { return address_.base != NULL; } float Prob() const { return util::ReadNonPositiveFloat31(address_.base, address_.offset); } void Write(float prob) { util::WriteNonPositiveFloat31(address_.base, address_.offset, prob); } private: util::BitAddress address_; }; DontQuantize() {} void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {} static const bool kTrain = false; // These should never be called because kTrain is false. 
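    // The no-op stubs below exist so DontQuantize presents the same interface as
    // SeparatelyQuantize and the two can be swapped as a template argument.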
void Train(uint8_t /*order*/, std::vector &/*prob*/, std::vector &/*backoff*/) {} void TrainProb(uint8_t, std::vector &/*prob*/) {} void FinishedLoading(const Config &) {} }; class SeparatelyQuantize { private: class Bins { public: // Sigh C++ default constructor Bins() {} Bins(uint8_t bits, float *begin) : begin_(begin), end_(begin_ + (1ULL << bits)), bits_(bits), mask_((1ULL << bits) - 1) {} float *Populate() { return begin_; } uint64_t EncodeProb(float value) const { return Encode(value, 0); } uint64_t EncodeBackoff(float value) const { if (value == 0.0) { return HasExtension(value) ? kExtensionQuant : kNoExtensionQuant; } return Encode(value, 2); } float Decode(std::size_t off) const { return begin_[off]; } uint8_t Bits() const { return bits_; } uint64_t Mask() const { return mask_; } private: uint64_t Encode(float value, size_t reserved) const { const float *above = std::lower_bound(static_cast(begin_) + reserved, end_, value); if (above == begin_ + reserved) return reserved; if (above == end_) return end_ - begin_ - 1; return above - begin_ - (value - *(above - 1) < *above - value); } float *begin_; const float *end_; uint8_t bits_; uint64_t mask_; }; public: static const ModelType kModelTypeAdd = kQuantAdd; static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config); static uint64_t Size(uint8_t order, const Config &config) { uint64_t longest_table = (static_cast(1) << static_cast(config.prob_bits)) * sizeof(float); uint64_t middle_table = (static_cast(1) << static_cast(config.backoff_bits)) * sizeof(float) + longest_table; // unigrams are currently not quantized so no need for a table. return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding) */ 8; } static uint8_t MiddleBits(const Config &config) { return config.prob_bits + config.backoff_bits; } static uint8_t LongestBits(const Config &config) { return config.prob_bits; } class MiddlePointer { public: MiddlePointer(const SeparatelyQuantize &quant, unsigned char order_minus_2, const util::BitAddress &address) : bins_(quant.GetTables(order_minus_2)), address_(address) {} MiddlePointer() : address_(NULL, 0) {} bool Found() const { return address_.base != NULL; } float Prob() const { return ProbBins().Decode(util::ReadInt25(address_.base, address_.offset + BackoffBins().Bits(), ProbBins().Bits(), ProbBins().Mask())); } float Backoff() const { return BackoffBins().Decode(util::ReadInt25(address_.base, address_.offset, BackoffBins().Bits(), BackoffBins().Mask())); } float Rest() const { return Prob(); } void Write(float prob, float backoff) const { util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(), (ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff)); } private: const Bins &ProbBins() const { return bins_[0]; } const Bins &BackoffBins() const { return bins_[1]; } const Bins *bins_; util::BitAddress address_; }; class LongestPointer { public: LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {} LongestPointer() : address_(NULL, 0) {} bool Found() const { return address_.base != NULL; } void Write(float prob) const { util::WriteInt25(address_.base, address_.offset, table_->Bits(), table_->EncodeProb(prob)); } float Prob() const { return table_->Decode(util::ReadInt25(address_.base, address_.offset, table_->Bits(), table_->Mask())); } private: const Bins *table_; util::BitAddress address_; }; SeparatelyQuantize() 
{}

    void SetupMemory(void *start, unsigned char order, const Config &config);

    static const bool kTrain = true;
    // Assumes 0.0 is removed from backoff.
    void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff);
    // Train just probabilities (for longest order).
    void TrainProb(uint8_t order, std::vector<float> &prob);

    void FinishedLoading(const Config &config);

    const Bins *GetTables(unsigned char order_minus_2) const { return tables_[order_minus_2]; }

    const Bins &LongestTable() const { return longest_; }

  private:
    Bins tables_[KENLM_MAX_ORDER - 1][2];
    Bins longest_;

    uint8_t *actual_base_;

    uint8_t prob_bits_, backoff_bits_;
};

} // namespace ngram
} // namespace lm

#endif // LM_QUANTIZE_H__


================================================
FILE: src/kenlm/lm/query_main.cc
================================================
#include "lm/ngram_query.hh"

int main(int argc, char *argv[]) {
  if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) {
    std::cerr << "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << "." << std::endl;
    std::cerr << "Usage: " << argv[0] << " lm_file [null]" << std::endl;
    std::cerr << "Input is wrapped in <s> and </s> unless null is passed." << std::endl;
    return 1;
  }
  try {
    bool sentence_context = (argc == 2);
    using namespace lm::ngram;
    ModelType model_type;
    if (RecognizeBinary(argv[1], model_type)) {
      switch(model_type) {
        case PROBING:
          Query<ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
          break;
        case REST_PROBING:
          Query<RestProbingModel>(argv[1], sentence_context, std::cin, std::cout);
          break;
        case TRIE:
          Query<TrieModel>(argv[1], sentence_context, std::cin, std::cout);
          break;
        case QUANT_TRIE:
          Query<QuantTrieModel>(argv[1], sentence_context, std::cin, std::cout);
          break;
        case ARRAY_TRIE:
          Query<ArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
          break;
        case QUANT_ARRAY_TRIE:
          Query<QuantArrayTrieModel>(argv[1], sentence_context, std::cin, std::cout);
          break;
        default:
          std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
          abort();
      }
    } else {
      Query<ProbingModel>(argv[1], sentence_context, std::cin, std::cout);
    }
    std::cerr << "Total time including destruction:\n";
    util::PrintUsage(std::cerr);
  } catch (const std::exception &e) {
    std::cerr << e.what() << std::endl;
    return 1;
  }
  return 0;
}


================================================
FILE: src/kenlm/lm/read_arpa.cc
================================================
#include "lm/read_arpa.hh"

#include "lm/blank.hh"
#include "util/file.hh"

#include
#include
#include
#include
#include
#include
#include
#include

#ifdef WIN32
#include <float.h>
#endif

namespace lm {

// 1 for '\t', '\n', and ' '. This is stricter than isspace.
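// A 256-entry table indexed by character code: only '\t' (9), '\n' (10), and ' ' (32)
// are marked, so ReadDelimited splits ARPA fields on exactly these characters and
// treats other whitespace (e.g. carriage return) as part of the token.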
const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; namespace { bool IsEntirelyWhiteSpace(const StringPiece &line) { for (size_t i = 0; i < static_cast(line.size()); ++i) { if (!isspace(line.data()[i])) return false; } return true; } const char kBinaryMagic[] = "mmap lm http://kheafield.com/code"; // strtoull isn't portable enough :-( uint64_t ReadCount(const std::string &from) { std::stringstream stream(from); uint64_t ret; stream >> ret; UTIL_THROW_IF(!stream, FormatLoadException, "Bad count " << from); return ret; } } // namespace void ReadARPACounts(util::FilePiece &in, std::vector &number) { number.clear(); StringPiece line = in.ReadLine(); // In general, ARPA files can have arbitrary text before "\data\" // But in KenLM, we require such lines to start with "#", so that // we can do stricter error checking while (IsEntirelyWhiteSpace(line) || line.starts_with("#")) { line = in.ReadLine(); } if (line != "\\data\\") { if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast(line.data()[1]) == 0x8b)) { UTIL_THROW(FormatLoadException, "Looks like a gzip file. If this is an ARPA file, pipe " << in.FileName() << " through zcat. If this already in binary format, you need to decompress it because mmap doesn't work on top of gzip."); } if (static_cast(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic) UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser. Did you compress the binary file or pass a binary file where only ARPA files are accepted?"); UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file. Did you forget to pass --text yes to compile-lm?"); UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file. You need an ARPA file. Run\n compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst."); UTIL_THROW(FormatLoadException, "first non-empty line was \"" << line << "\" not \\data\\."); } while (!IsEntirelyWhiteSpace(line = in.ReadLine())) { if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\"doesn't begin with \"ngram \""); // So strtol doesn't go off the end of line. 
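    // For a (hypothetical) count line "ngram 3=35617", remaining holds "3=35617":
    // strtol reads the order 3, end_ptr stops at '=', and ReadCount parses 35617.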
    std::string remaining(line.data() + 6, line.size() - 6);
    char *end_ptr;
    unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10);
    if ((end_ptr == remaining.c_str()) || (length - 1 != number.size()))
      UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line);
    if (*end_ptr != '=')
      UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line);
    ++end_ptr;
    number.push_back(ReadCount(end_ptr));
  }
}

void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
  StringPiece line;
  while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
  std::stringstream expected;
  expected << '\\' << length << "-grams:";
  if (line != expected.str())
    UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead");
}

void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) {
  switch (in.get()) {
    case '\t':
      {
        float got = in.ReadFloat();
        if (got != 0.0)
          UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff");
      }
      break;
    case '\n':
      break;
    default:
      UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
  }
}

void ReadBackoff(util::FilePiece &in, float &backoff) {
  // Always make zero negative.
  // Negative zero means that no (n+1)-gram has this n-gram as context.
  // Therefore the hypothesis state can be shorter. Of course, many n-grams
  // are context for (n+1)-grams. An algorithm in the data structure will go
  // back and set the backoff to positive zero in these cases.
  switch (in.get()) {
    case '\t':
      backoff = in.ReadFloat();
      if (backoff == ngram::kExtensionBackoff) backoff = ngram::kNoExtensionBackoff;
      {
#ifdef WIN32
        int float_class = _fpclass(backoff);
        UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff);
#else
        int float_class = std::fpclassify(backoff);
        UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff);
#endif
      }
      UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff");
      break;
    case '\n':
      backoff = ngram::kNoExtensionBackoff;
      break;
    default:
      UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
  }
}

void ReadEnd(util::FilePiece &in) {
  StringPiece line;
  do {
    line = in.ReadLine();
  } while (IsEntirelyWhiteSpace(line));
  if (line != "\\end\\")
    UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line);
  try {
    while (true) {
      line = in.ReadLine();
      if (!IsEntirelyWhiteSpace(line))
        UTIL_THROW(FormatLoadException, "Trailing line " << line);
    }
  } catch (const util::EndOfFileException &e) {}
}

void PositiveProbWarn::Warn(float prob) {
  switch (action_) {
    case THROW_UP:
      UTIL_THROW(FormatLoadException, "Positive log probability " << prob << " in the model. This is a bug in IRSTLM; you can set config.positive_log_probability = SILENT or pass -i to build_binary to substitute 0.0 for the log probability. Error");
    case COMPLAIN:
      std::cerr << "There's a positive log probability " << prob << " in the ARPA file, probably because of a bug in IRSTLM. This and subsequent entries will be mapped to 0 log probability."
<< std::endl; action_ = SILENT; break; case SILENT: break; } } } // namespace lm ================================================ FILE: src/kenlm/lm/read_arpa.hh ================================================ #ifndef LM_READ_ARPA__ #define LM_READ_ARPA__ #include "lm/lm_exception.hh" #include "lm/word_index.hh" #include "lm/weights.hh" #include "util/file_piece.hh" #include #include #include namespace lm { void ReadARPACounts(util::FilePiece &in, std::vector &number); void ReadNGramHeader(util::FilePiece &in, unsigned int length); void ReadBackoff(util::FilePiece &in, Prob &weights); void ReadBackoff(util::FilePiece &in, float &backoff); inline void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) { ReadBackoff(in, weights.backoff); } inline void ReadBackoff(util::FilePiece &in, RestWeights &weights) { ReadBackoff(in, weights.backoff); } void ReadEnd(util::FilePiece &in); extern const bool kARPASpaces[256]; // Positive log probability warning. class PositiveProbWarn { public: PositiveProbWarn() : action_(THROW_UP) {} explicit PositiveProbWarn(WarningAction action) : action_(action) {} void Warn(float prob); private: WarningAction action_; }; template void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { try { float prob = f.ReadFloat(); if (prob > 0.0) { warn.Warn(prob); prob = 0.0; } if (f.get() != '\t') UTIL_THROW(FormatLoadException, "Expected tab after probability"); Weights &value = unigrams[vocab.Insert(f.ReadDelimited(kARPASpaces))]; value.prob = prob; ReadBackoff(f, value); } catch(util::Exception &e) { e << " in the 1-gram at byte " << f.Offset(); throw; } } // Return true if a positive log probability came out. template void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) { ReadNGramHeader(f, 1); for (std::size_t i = 0; i < count; ++i) { Read1Gram(f, vocab, unigrams, warn); } vocab.FinishedLoading(unigrams); } // Return true if a positive log probability came out. template void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, WordIndex *const reverse_indices, Weights &weights, PositiveProbWarn &warn) { try { weights.prob = f.ReadFloat(); if (weights.prob > 0.0) { warn.Warn(weights.prob); weights.prob = 0.0; } for (WordIndex *vocab_out = reverse_indices + n - 1; vocab_out >= reverse_indices; --vocab_out) { *vocab_out = vocab.Index(f.ReadDelimited(kARPASpaces)); } ReadBackoff(f, weights); } catch(util::Exception &e) { e << " in the " << static_cast(n) << "-gram at byte " << f.Offset(); throw; } } } // namespace lm #endif // LM_READ_ARPA__ ================================================ FILE: src/kenlm/lm/return.hh ================================================ #ifndef LM_RETURN__ #define LM_RETURN__ #include namespace lm { /* Structure returned by scoring routines. */ struct FullScoreReturn { // log10 probability float prob; /* The length of n-gram matched. Do not use this for recombination. * Consider a model containing only the following n-grams: * -1 foo * -3.14 bar * -2.718 baz -5 * -6 foo bar * * If you score ``bar'' then ngram_length is 1 and recombination state is the * empty string because bar has zero backoff and does not extend to the * right. * If you score ``foo'' then ngram_length is 1 and recombination state is * ``foo''. * * Ideally, keep output states around and compare them. Failing that, * get out_state.ValidLength() and use that length for recombination. */ unsigned char ngram_length; /* Left extension information. 
If independent_left is set, then prob is * independent of words to the left (up to additional backoff). Otherwise, * extend_left indicates how to efficiently extend further to the left. */ bool independent_left; uint64_t extend_left; // Defined only if independent_left // Rest cost for extension to the left. float rest; }; } // namespace lm #endif // LM_RETURN__ ================================================ FILE: src/kenlm/lm/search_hashed.cc ================================================ #include "lm/search_hashed.hh" #include "lm/binary_format.hh" #include "lm/blank.hh" #include "lm/lm_exception.hh" #include "lm/model.hh" #include "lm/read_arpa.hh" #include "lm/value.hh" #include "lm/vocab.hh" #include "util/bit_packing.hh" #include "util/file_piece.hh" #include namespace lm { namespace ngram { class ProbingModel; namespace { /* These are passed to ReadNGrams so that n-grams with zero backoff that appear as context will still be used in state. */ template class ActivateLowerMiddle { public: explicit ActivateLowerMiddle(Middle &middle) : modify_(middle) {} void operator()(const WordIndex *vocab_ids, const unsigned int n) { uint64_t hash = static_cast(vocab_ids[1]); for (const WordIndex *i = vocab_ids + 2; i < vocab_ids + n; ++i) { hash = detail::CombineWordHash(hash, *i); } typename Middle::MutableIterator i; // TODO: somehow get text of n-gram for this error message. if (!modify_.UnsafeMutableFind(hash, i)) UTIL_THROW(FormatLoadException, "The context of every " << n << "-gram should appear as a " << (n-1) << "-gram"); SetExtension(i->value.backoff); } private: Middle &modify_; }; template class ActivateUnigram { public: explicit ActivateUnigram(Weights *unigram) : modify_(unigram) {} void operator()(const WordIndex *vocab_ids, const unsigned int /*n*/) { // assert(n == 2); SetExtension(modify_[vocab_ids[1]].backoff); } private: Weights *modify_; }; // Find the lower order entry, inserting blanks along the way as necessary. template void FindLower( const std::vector &keys, typename Value::Weights &unigram, std::vector > &middle, std::vector &between) { typename util::ProbingHashTable::MutableIterator iter; typename Value::ProbingEntry entry; // Backoff will always be 0.0. We'll get the probability and rest in another pass. entry.value.backoff = kNoExtensionBackoff; // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb. for (int lower = keys.size() - 2; ; --lower) { if (lower == -1) { between.push_back(&unigram); return; } entry.key = keys[lower]; bool found = middle[lower].FindOrInsert(entry, iter); between.push_back(&iter->value); if (found) return; } } // Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here. template void AdjustLower( const Added &added, const Build &build, std::vector &between, const unsigned int n, const std::vector &vocab_ids, typename Build::Value::Weights *unigrams, std::vector > &middle) { typedef typename Build::Value Value; if (between.size() == 1) { build.MarkExtends(*between.front(), added); return; } typedef util::ProbingHashTable Middle; float prob = -fabs(between.back()->prob); // Order of the n-gram on which probabilities are based. unsigned char basis = n - between.size(); assert(basis != 0); typename Build::Value::Weights **change = &between.back(); // Skip the basis. 
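  // change now walks upward from the basis entry toward the longest pruned entry,
  // giving each blank the basis probability plus the accumulated context backoffs,
  // i.e. the score a backed-off query would have produced.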
--change; if (basis == 1) { // Hallucinate a bigram based on a unigram's backoff and a unigram probability. float &backoff = unigrams[vocab_ids[1]].backoff; SetExtension(backoff); prob += backoff; (*change)->prob = prob; build.SetRest(&*vocab_ids.begin(), 2, **change); basis = 2; --change; } uint64_t backoff_hash = static_cast(vocab_ids[1]); for (unsigned char i = 2; i <= basis; ++i) { backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[i]); } for (; basis < n - 1; ++basis, --change) { typename Middle::MutableIterator gotit; if (middle[basis - 2].UnsafeMutableFind(backoff_hash, gotit)) { float &backoff = gotit->value.backoff; SetExtension(backoff); prob += backoff; } (*change)->prob = prob; build.SetRest(&*vocab_ids.begin(), basis + 1, **change); backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[basis+1]); } typename std::vector::const_iterator i(between.begin()); build.MarkExtends(**i, added); const typename Value::Weights *longer = *i; // Everything has probability but is not marked as extending. for (++i; i != between.end(); ++i) { build.MarkExtends(**i, *longer); longer = *i; } } // Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds. template void MarkLower( const std::vector &keys, const Build &build, typename Build::Value::Weights &unigram, std::vector > &middle, int start_order, const typename Build::Value::Weights &longer) { if (start_order == 0) return; typename util::ProbingHashTable::MutableIterator iter; // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code. for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) { if (even_lower == -1) { build.MarkExtends(unigram, longer); return; } middle[even_lower].UnsafeMutableFind(keys[even_lower], iter); if (!build.MarkExtends(iter->value, longer)) return; } } template void ReadNGrams( util::FilePiece &f, const unsigned int n, const size_t count, const ProbingVocabulary &vocab, const Build &build, typename Build::Value::Weights *unigrams, std::vector > &middle, Activate activate, Store &store, PositiveProbWarn &warn) { typedef typename Build::Value Value; typedef util::ProbingHashTable Middle; assert(n >= 2); ReadNGramHeader(f, n); // Both vocab_ids and keys are non-empty because n >= 2. // vocab ids of words in reverse order. std::vector vocab_ids(n); std::vector keys(n-1); typename Store::Entry entry; std::vector between; for (size_t i = 0; i < count; ++i) { ReadNGram(f, n, vocab, &*vocab_ids.begin(), entry.value, warn); build.SetRest(&*vocab_ids.begin(), n, entry.value); keys[0] = detail::CombineWordHash(static_cast(vocab_ids.front()), vocab_ids[1]); for (unsigned int h = 1; h < n - 1; ++h) { keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]); } // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0. 
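    // SetSign turns the sign bit on (so even +0.0 becomes -0.0); if a longer n-gram
    // later uses this entry as context, MarkExtends clears the flag again.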
util::SetSign(entry.value.prob); entry.key = keys[n-2]; store.Insert(entry); between.clear(); FindLower(keys, unigrams[vocab_ids.front()], middle, between); AdjustLower(entry.value, build, between, n, vocab_ids, unigrams, middle); if (Build::kMarkEvenLower) MarkLower(keys, build, unigrams[vocab_ids.front()], middle, n - between.size() - 1, *between.back()); activate(&*vocab_ids.begin(), n); } store.FinishedInserting(); } } // namespace namespace detail { template uint8_t *HashedSearch::SetupMemory(uint8_t *start, const std::vector &counts, const Config &config) { std::size_t allocated = Unigram::Size(counts[0]); unigram_ = Unigram(start, counts[0], allocated); start += allocated; for (unsigned int n = 2; n < counts.size(); ++n) { allocated = Middle::Size(counts[n - 1], config.probing_multiplier); middle_.push_back(Middle(start, allocated)); start += allocated; } allocated = Longest::Size(counts.back(), config.probing_multiplier); longest_ = Longest(start, allocated); start += allocated; return start; } template void HashedSearch::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing) { // TODO: fix sorted. SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config); PositiveProbWarn warn(config.positive_log_probability); Read1Grams(f, counts[0], vocab, unigram_.Raw(), warn); CheckSpecials(config, vocab); DispatchBuild(f, counts, config, vocab, warn); } template <> void HashedSearch::DispatchBuild(util::FilePiece &f, const std::vector &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) { NoRestBuild build; ApplyBuild(f, counts, vocab, warn, build); } template <> void HashedSearch::DispatchBuild(util::FilePiece &f, const std::vector &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) { switch (config.rest_function) { case Config::REST_MAX: { MaxRestBuild build; ApplyBuild(f, counts, vocab, warn, build); } break; case Config::REST_LOWER: { LowerRestBuild build(config, counts.size(), vocab); ApplyBuild(f, counts, vocab, warn, build); } break; } } template template void HashedSearch::ApplyBuild(util::FilePiece &f, const std::vector &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build) { for (WordIndex i = 0; i < counts[0]; ++i) { build.SetRest(&i, (unsigned int)1, unigram_.Raw()[i]); } try { if (counts.size() > 2) { ReadNGrams, Middle>( f, 2, counts[1], vocab, build, unigram_.Raw(), middle_, ActivateUnigram(unigram_.Raw()), middle_[0], warn); } for (unsigned int n = 3; n < counts.size(); ++n) { ReadNGrams, Middle>( f, n, counts[n-1], vocab, build, unigram_.Raw(), middle_, ActivateLowerMiddle(middle_[n-3]), middle_[n-2], warn); } if (counts.size() > 2) { ReadNGrams, Longest>( f, counts.size(), counts[counts.size() - 1], vocab, build, unigram_.Raw(), middle_, ActivateLowerMiddle(middle_.back()), longest_, warn); } else { ReadNGrams, Longest>( f, counts.size(), counts[counts.size() - 1], vocab, build, unigram_.Raw(), middle_, ActivateUnigram(unigram_.Raw()), longest_, warn); } } catch (util::ProbingSizeException &e) { UTIL_THROW(util::ProbingSizeException, "Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model. KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them. 
Increase probing_multiplier (-p to build_binary) to add more blank spaces.\n"); } ReadEnd(f); } template void HashedSearch::LoadedBinary() { unigram_.LoadedBinary(); for (typename std::vector::iterator i = middle_.begin(); i != middle_.end(); ++i) { i->LoadedBinary(); } longest_.LoadedBinary(); } template class HashedSearch; template class HashedSearch; } // namespace detail } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/search_hashed.hh ================================================ #ifndef LM_SEARCH_HASHED__ #define LM_SEARCH_HASHED__ #include "lm/model_type.hh" #include "lm/config.hh" #include "lm/read_arpa.hh" #include "lm/return.hh" #include "lm/weights.hh" #include "util/bit_packing.hh" #include "util/probing_hash_table.hh" #include #include #include namespace util { class FilePiece; } namespace lm { namespace ngram { struct Backing; class ProbingVocabulary; namespace detail { inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) { uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast(1 + next) * 17894857484156487943ULL); return ret; } #pragma pack(push) #pragma pack(4) struct ProbEntry { uint64_t key; Prob value; typedef uint64_t Key; typedef Prob Value; uint64_t GetKey() const { return key; } }; #pragma pack(pop) class LongestPointer { public: explicit LongestPointer(const float &to) : to_(&to) {} LongestPointer() : to_(NULL) {} bool Found() const { return to_ != NULL; } float Prob() const { return *to_; } private: const float *to_; }; template class HashedSearch { public: typedef uint64_t Node; typedef typename Value::ProbingProxy UnigramPointer; typedef typename Value::ProbingProxy MiddlePointer; typedef ::lm::ngram::detail::LongestPointer LongestPointer; static const ModelType kModelType = Value::kProbingModelType; static const bool kDifferentRest = Value::kDifferentRest; static const unsigned int kVersion = 0; // TODO: move probing_multiplier here with next binary file format update. 
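  // Query flow: LookupUnigram starts the Node, LookupMiddle extends it one context
  // word at a time via CombineWordHash, and LookupLongest probes the full-order
  // table; the unigram and middle lookups also report whether the entry can still
  // be extended to the left.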
static void UpdateConfigFromBinary(int, const std::vector &, Config &) {} static uint64_t Size(const std::vector &counts, const Config &config) { uint64_t ret = Unigram::Size(counts[0]); for (unsigned char n = 1; n < counts.size() - 1; ++n) { ret += Middle::Size(counts[n], config.probing_multiplier); } return ret + Longest::Size(counts.back(), config.probing_multiplier); } uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector &counts, const Config &config, ProbingVocabulary &vocab, Backing &backing); void LoadedBinary(); unsigned char Order() const { return middle_.size() + 2; } typename Value::Weights &UnknownUnigram() { return unigram_.Unknown(); } UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const { extend_left = static_cast(word); next = extend_left; UnigramPointer ret(unigram_.Lookup(word)); independent_left = ret.IndependentLeft(); return ret; } #pragma GCC diagnostic ignored "-Wuninitialized" MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const { node = extend_pointer; typename Middle::ConstIterator found; bool got = middle_[extend_length - 2].Find(extend_pointer, found); assert(got); (void)got; return MiddlePointer(found->value); } MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const { node = CombineWordHash(node, word); typename Middle::ConstIterator found; if (!middle_[order_minus_2].Find(node, found)) { independent_left = true; return MiddlePointer(); } extend_pointer = node; MiddlePointer ret(found->value); independent_left = ret.IndependentLeft(); return ret; } LongestPointer LookupLongest(WordIndex word, const Node &node) const { // Sign bit is always on because longest n-grams do not extend left. typename Longest::ConstIterator found; if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer(); return LongestPointer(found->value.prob); } // Generate a node without necessarily checking that it actually exists. // Optionally return false if it's know to not exist. bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const { assert(begin != end); node = static_cast(*begin); for (const WordIndex *i = begin + 1; i < end; ++i) { node = CombineWordHash(node, *i); } return true; } private: // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild. void DispatchBuild(util::FilePiece &f, const std::vector &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn); template void ApplyBuild(util::FilePiece &f, const std::vector &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build); class Unigram { public: Unigram() {} Unigram(void *start, uint64_t count, std::size_t /*allocated*/) : unigram_(static_cast(start)) #ifdef DEBUG , count_(count) #endif {} static uint64_t Size(uint64_t count) { return (count + 1) * sizeof(typename Value::Weights); // +1 for hallucinate } const typename Value::Weights &Lookup(WordIndex index) const { #ifdef DEBUG assert(index < count_); #endif return unigram_[index]; } typename Value::Weights &Unknown() { return unigram_[0]; } void LoadedBinary() {} // For building. 
typename Value::Weights *Raw() { return unigram_; } private: typename Value::Weights *unigram_; #ifdef DEBUG uint64_t count_; #endif }; Unigram unigram_; typedef util::ProbingHashTable Middle; std::vector middle_; typedef util::ProbingHashTable Longest; Longest longest_; }; } // namespace detail } // namespace ngram } // namespace lm #endif // LM_SEARCH_HASHED__ ================================================ FILE: src/kenlm/lm/search_trie.cc ================================================ /* This is where the trie is built. It's on-disk. */ #include "lm/search_trie.hh" #include "lm/bhiksha.hh" #include "lm/binary_format.hh" #include "lm/blank.hh" #include "lm/lm_exception.hh" #include "lm/max_order.hh" #include "lm/quantize.hh" #include "lm/trie.hh" #include "lm/trie_sort.hh" #include "lm/vocab.hh" #include "lm/weights.hh" #include "lm/word_index.hh" #include "util/ersatz_progress.hh" #include "util/mmap.hh" #include "util/proxy_iterator.hh" #include "util/scoped.hh" #include "util/sized_iterator.hh" #include #include #include #include #include #include #include #include #if defined(_WIN32) || defined(_WIN64) #include #endif namespace lm { namespace ngram { namespace trie { namespace { void ReadOrThrow(FILE *from, void *data, size_t size) { UTIL_THROW_IF(1 != std::fread(data, size, 1, from), util::ErrnoException, "Short read"); } int Compare(unsigned char order, const void *first_void, const void *second_void) { const WordIndex *first = reinterpret_cast(first_void), *second = reinterpret_cast(second_void); const WordIndex *end = first + order; for (; first != end; ++first, ++second) { if (*first < *second) return -1; if (*first > *second) return 1; } return 0; } struct ProbPointer { unsigned char array; uint64_t index; }; // Array of n-grams and float indices. class BackoffMessages { public: void Init(std::size_t entry_size) { current_ = NULL; allocated_ = NULL; entry_size_ = entry_size; } void Add(const WordIndex *to, ProbPointer index) { while (current_ + entry_size_ > allocated_) { std::size_t allocated_size = allocated_ - (uint8_t*)backing_.get(); Resize(std::max(allocated_size * 2, entry_size_)); } memcpy(current_, to, entry_size_ - sizeof(ProbPointer)); *reinterpret_cast(current_ + entry_size_ - sizeof(ProbPointer)) = index; current_ += entry_size_; } void Apply(float *const *const base, FILE *unigrams) { FinishedAdding(); if (current_ == allocated_) return; rewind(unigrams); ProbBackoff weights; WordIndex unigram = 0; ReadOrThrow(unigrams, &weights, sizeof(weights)); for (; current_ != allocated_; current_ += entry_size_) { const WordIndex &cur_word = *reinterpret_cast(current_); for (; unigram < cur_word; ++unigram) { ReadOrThrow(unigrams, &weights, sizeof(weights)); } if (!HasExtension(weights.backoff)) { weights.backoff = kExtensionBackoff; UTIL_THROW_IF(fseek(unigrams, -sizeof(weights), SEEK_CUR), util::ErrnoException, "Seeking backwards to denote unigram extension failed."); util::WriteOrThrow(unigrams, &weights, sizeof(weights)); } const ProbPointer &write_to = *reinterpret_cast(current_ + sizeof(WordIndex)); base[write_to.array][write_to.index] += weights.backoff; } backing_.reset(); } void Apply(float *const *const base, RecordReader &reader) { FinishedAdding(); if (current_ == allocated_) return; // We'll also use the same buffer to record messages to blanks that they extend. 
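    // Messages were sorted by FinishedAdding and the reader returns records in the
    // same order, so the loop below is a merge: -1 advances the reader, 1 records a
    // message whose receiver was pruned away (a blank that extends right), and 0 adds
    // the record's backoff to the waiting probability (or just marks a zero backoff
    // as extending).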
WordIndex *extend_out = reinterpret_cast(current_); const unsigned char order = (entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex); for (reader.Rewind(); reader && (current_ != allocated_); ) { switch (Compare(order, reader.Data(), current_)) { case -1: ++reader; break; case 1: // Message but nobody to receive it. Write it down at the beginning of the buffer so we can inform this blank that it extends. for (const WordIndex *w = reinterpret_cast(current_); w != reinterpret_cast(current_) + order; ++w, ++extend_out) *extend_out = *w; current_ += entry_size_; break; case 0: float &backoff = reinterpret_cast((uint8_t*)reader.Data() + order * sizeof(WordIndex))->backoff; if (!HasExtension(backoff)) { backoff = kExtensionBackoff; reader.Overwrite(&backoff, sizeof(float)); } else { const ProbPointer &write_to = *reinterpret_cast(current_ + entry_size_ - sizeof(ProbPointer)); base[write_to.array][write_to.index] += backoff; } current_ += entry_size_; break; } } // Now this is a list of blanks that extend right. entry_size_ = sizeof(WordIndex) * order; Resize(sizeof(WordIndex) * (extend_out - (const WordIndex*)backing_.get())); current_ = (uint8_t*)backing_.get(); } // Call after Apply bool Extends(unsigned char order, const WordIndex *words) { if (current_ == allocated_) return false; assert(order * sizeof(WordIndex) == entry_size_); while (true) { switch(Compare(order, words, current_)) { case 1: current_ += entry_size_; if (current_ == allocated_) return false; break; case -1: return false; case 0: return true; } } } private: void FinishedAdding() { Resize(current_ - (uint8_t*)backing_.get()); // Sort requests in same order as files. std::sort( util::SizedIterator(util::SizedProxy(backing_.get(), entry_size_)), util::SizedIterator(util::SizedProxy(current_, entry_size_)), util::SizedCompare(EntryCompare((entry_size_ - sizeof(ProbPointer)) / sizeof(WordIndex)))); current_ = (uint8_t*)backing_.get(); } void Resize(std::size_t to) { std::size_t current = current_ - (uint8_t*)backing_.get(); backing_.call_realloc(to); current_ = (uint8_t*)backing_.get() + current; allocated_ = (uint8_t*)backing_.get() + to; } util::scoped_malloc backing_; uint8_t *current_, *allocated_; std::size_t entry_size_; }; const float kBadProb = std::numeric_limits::infinity(); class SRISucks { public: SRISucks() { for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i) i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1)); } void Send(unsigned char begin, unsigned char order, const WordIndex *to, float prob_basis) { assert(prob_basis != kBadProb); ProbPointer pointer; pointer.array = order - 1; pointer.index = values_[order - 1].size(); for (unsigned char i = begin; i < order; ++i) { messages_[i - 1].Add(to, pointer); } values_[order - 1].push_back(prob_basis); } void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) { for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) { it_[i] = values_[i].empty() ? NULL : &*values_[i].begin(); } messages_[0].Apply(it_, unigram_file); BackoffMessages *messages = messages_ + 1; const RecordReader *end = reader + total_order - 2 /* exclude unigrams and longest order */; for (; reader != end; ++messages, ++reader) { messages->Apply(it_, *reader); } } ProbBackoff GetBlank(unsigned char total_order, unsigned char order, const WordIndex *indices) { assert(order > 1); ProbBackoff ret; ret.prob = *(it_[order - 1]++); ret.backoff = ((order != total_order - 1) && messages_[order - 1].Extends(order, indices)) ? 
kExtensionBackoff : kNoExtensionBackoff; return ret; } const std::vector &Values(unsigned char order) const { return values_[order - 1]; } private: // This used to be one array. Then I needed to separate it by order for quantization to work. std::vector values_[KENLM_MAX_ORDER - 1]; BackoffMessages messages_[KENLM_MAX_ORDER - 1]; float *it_[KENLM_MAX_ORDER - 1]; }; class FindBlanks { public: FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages) : counts_(order), unigrams_(unigrams), sri_(messages) {} float UnigramProb(WordIndex index) const { return unigrams_[index].prob; } void Unigram(WordIndex /*index*/) { ++counts_[0]; } void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char lower, float prob_basis) { sri_.Send(lower, order, indices + 1, prob_basis); ++counts_[order - 1]; } void Middle(const unsigned char order, const void * /*data*/) { ++counts_[order - 1]; } void Longest(const void * /*data*/) { ++counts_.back(); } // Unigrams wrote one past. void Cleanup() { --counts_[0]; } const std::vector &Counts() const { return counts_; } private: std::vector counts_; const ProbBackoff *unigrams_; SRISucks &sri_; }; // Phase to actually write n-grams to the trie. template class WriteEntries { public: WriteEntries(RecordReader *contexts, const Quant &quant, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : contexts_(contexts), quant_(quant), unigrams_(unigrams), middle_(middle), longest_(longest), bigram_pack_((order == 2) ? static_cast(longest_) : static_cast(*middle_)), order_(order), sri_(sri) {} float UnigramProb(WordIndex index) const { return unigrams_[index].weights.prob; } void Unigram(WordIndex word) { unigrams_[word].next = bigram_pack_.InsertIndex(); } void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char /*lower*/, float /*prob_base*/) { ProbBackoff weights = sri_.GetBlank(order_, order, indices); typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(indices[order - 1])).Write(weights.prob, weights.backoff); } void Middle(const unsigned char order, const void *data) { RecordReader &context = contexts_[order - 1]; const WordIndex *words = reinterpret_cast(data); ProbBackoff weights = *reinterpret_cast(words + order); if (context && !memcmp(data, context.Data(), sizeof(WordIndex) * order)) { SetExtension(weights.backoff); ++context; } typename Quant::MiddlePointer(quant_, order - 2, middle_[order - 2].Insert(words[order - 1])).Write(weights.prob, weights.backoff); } void Longest(const void *data) { const WordIndex *words = reinterpret_cast(data); typename Quant::LongestPointer(quant_, longest_.Insert(words[order_ - 1])).Write(reinterpret_cast(words + order_)->prob); } void Cleanup() {} private: RecordReader *contexts_; const Quant &quant_; UnigramValue *const unigrams_; BitPackedMiddle *const middle_; BitPackedLongest &longest_; BitPacked &bigram_pack_; const unsigned char order_; SRISucks &sri_; }; struct Gram { Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), end(in_begin + order) {} const WordIndex *begin, *end; // For queue, this is the direction we want. 
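  // Note the reversed comparison (other before *this): std::priority_queue is a
  // max-heap, so this makes RecursiveInsert pop the lexicographically smallest
  // n-gram first.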
bool operator<(const Gram &other) const { return std::lexicographical_compare(other.begin, other.end, begin, end); } }; template class BlankManager { public: BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) { for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb; } void Visit(const WordIndex *to, unsigned char length, float prob) { basis_[length - 1] = prob; unsigned char overlap = std::min(length - 1, been_length_); const WordIndex *cur; WordIndex *pre; for (cur = to, pre = been_; cur != to + overlap; ++cur, ++pre) { if (*pre != *cur) break; } if (cur == to + length - 1) { *pre = *cur; been_length_ = length; return; } // There are blanks to insert starting with order blank. unsigned char blank = cur - to + 1; UTIL_THROW_IF(blank == 1, FormatLoadException, "Missing a unigram that appears as context."); const float *lower_basis; for (lower_basis = basis_ + blank - 2; *lower_basis == kBadProb; --lower_basis) {} unsigned char based_on = lower_basis - basis_ + 1; for (; cur != to + length - 1; ++blank, ++cur, ++pre) { assert(*lower_basis != kBadProb); doing_.MiddleBlank(blank, to, based_on, *lower_basis); *pre = *cur; // Mark that the probability is a blank so it shouldn't be used as the basis for a later n-gram. basis_[blank - 1] = kBadProb; } *pre = *cur; been_length_ = length; } private: const unsigned char total_order_; WordIndex been_[KENLM_MAX_ORDER]; unsigned char been_length_; float basis_[KENLM_MAX_ORDER]; Doing &doing_; }; template void RecursiveInsert(const unsigned char total_order, const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, const char *message, Doing &doing) { util::ErsatzProgress progress(unigram_count + 1, progress_out, message); WordIndex unigram = 0; std::priority_queue grams; grams.push(Gram(&unigram, 1)); for (unsigned char i = 2; i <= total_order; ++i) { if (input[i-2]) grams.push(Gram(reinterpret_cast(input[i-2].Data()), i)); } BlankManager blank(total_order, doing); while (true) { Gram top = grams.top(); grams.pop(); unsigned char order = top.end - top.begin; if (order == 1) { blank.Visit(&unigram, 1, doing.UnigramProb(unigram)); doing.Unigram(unigram); progress.Set(unigram); if (++unigram == unigram_count + 1) break; grams.push(top); } else { if (order == total_order) { blank.Visit(top.begin, order, reinterpret_cast(top.end)->prob); doing.Longest(top.begin); } else { blank.Visit(top.begin, order, reinterpret_cast(top.end)->prob); doing.Middle(order, top.begin); } RecordReader &reader = input[order - 2]; if (++reader) grams.push(top); } } assert(grams.empty()); doing.Cleanup(); } void SanityCheckCounts(const std::vector &initial, const std::vector &fixed) { if (fixed[0] != initial[0]) UTIL_THROW(util::Exception, "Unigram count should be constant but initial is " << initial[0] << " and recounted is " << fixed[0]); if (fixed.back() != initial.back()) UTIL_THROW(util::Exception, "Longest count should be constant but it changed from " << initial.back() << " to " << fixed.back()); for (unsigned char i = 0; i < initial.size(); ++i) { if (fixed[i] < initial[i]) UTIL_THROW(util::Exception, "Counts came out lower than expected. 
This shouldn't happen"); } } template void TrainQuantizer(uint8_t order, uint64_t count, const std::vector &additional, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { std::vector probs(additional), backoffs; probs.reserve(count + additional.size()); backoffs.reserve(count); for (reader.Rewind(); reader; ++reader) { const ProbBackoff &weights = *reinterpret_cast(reinterpret_cast(reader.Data()) + sizeof(WordIndex) * order); probs.push_back(weights.prob); if (weights.backoff != 0.0) backoffs.push_back(weights.backoff); ++progress; } quant.Train(order, probs, backoffs); } template void TrainProbQuantizer(uint8_t order, uint64_t count, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { std::vector probs, backoffs; probs.reserve(count); for (reader.Rewind(); reader; ++reader) { const Prob &weights = *reinterpret_cast(reinterpret_cast(reader.Data()) + sizeof(WordIndex) * order); probs.push_back(weights.prob); ++progress; } quant.TrainProb(order, probs); } void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) { // Fill unigram probabilities. try { rewind(file); for (WordIndex i = 0; i < unigram_count; ++i) { ReadOrThrow(file, &unigrams[i].weights, sizeof(ProbBackoff)); if (contexts && *reinterpret_cast(contexts.Data()) == i) { SetExtension(unigrams[i].weights.backoff); ++contexts; } } } catch (util::Exception &e) { e << " while re-reading unigram probabilities"; throw; } } } // namespace template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { RecordReader inputs[KENLM_MAX_ORDER - 1]; RecordReader contexts[KENLM_MAX_ORDER - 1]; for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? 
sizeof(Prob) : sizeof(ProbBackoff))); contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex)); } SRISucks sri; std::vector fixed_counts; util::scoped_FILE unigram_file; util::scoped_fd unigram_fd(files.StealUnigram()); { util::scoped_memory unigrams; MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); FindBlanks finder(counts.size(), reinterpret_cast(unigrams.get()), sri); RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Identifying n-grams omitted by SRI", finder); fixed_counts = finder.Counts(); } unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) { if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading"); } SanityCheckCounts(counts, fixed_counts); counts = fixed_counts; sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch::Size(fixed_counts, config), backing), fixed_counts, config); for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Rewind(); } if (Quant::kTrain) { util::ErsatzProgress progress(std::accumulate(counts.begin() + 1, counts.end(), 0), config.ProgressMessages(), "Quantizing"); for (unsigned char i = 2; i < counts.size(); ++i) { TrainQuantizer(i, counts[i-1], sri.Values(i), inputs[i-2], progress, quant); } TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 2], progress, quant); quant.FinishedLoading(config); } UnigramValue *unigrams = out.unigram_.Raw(); PopulateUnigramWeights(unigram_file.get(), counts[0], contexts[0], unigrams); unigram_file.reset(); for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Rewind(); } // Fill entries except unigram probabilities. { WriteEntries writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri); RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer); } // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. for (unsigned char order = 2; order <= counts.size(); ++order) { const RecordReader &context = contexts[order - 2]; if (context) { FormatLoadException e; e << "A " << static_cast(order) << "-gram has context"; const WordIndex *ctx = reinterpret_cast(context.Data()); for (const WordIndex *i = ctx; i != ctx + order - 1; ++i) { e << ' ' << *i; } e << " so this context must appear in the model as a " << static_cast(order - 1) << "-gram but it does not"; throw e; } } /* Set ending offsets so the last entry will be sized properly */ // Last entry for unigrams was already set. 
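// (Illustrative note, not part of the original source.) Each middle entry stores only
// the *start* of its child range in the next order; an entry's range ends where the
// following entry's children begin. The loop below writes the one missing end value per
// order: middle order k is closed with the number of records inserted at order k+1, and
// the highest middle order is closed with longest_.InsertIndex().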
if (out.middle_begin_ != out.middle_end_) { for (typename TrieSearch::Middle *i = out.middle_begin_; i != out.middle_end_ - 1; ++i) { i->FinishedLoading((i+1)->InsertIndex(), config); } (out.middle_end_ - 1)->FinishedLoading(out.longest_.InsertIndex(), config); } } template uint8_t *TrieSearch::SetupMemory(uint8_t *start, const std::vector &counts, const Config &config) { quant_.SetupMemory(start, counts.size(), config); start += Quant::Size(counts.size(), config); unigram_.Init(start); start += Unigram::Size(counts[0]); FreeMiddles(); middle_begin_ = static_cast(malloc(sizeof(Middle) * (counts.size() - 2))); middle_end_ = middle_begin_ + (counts.size() - 2); std::vector middle_starts(counts.size() - 2); for (unsigned char i = 2; i < counts.size(); ++i) { middle_starts[i-2] = start; start += Middle::Size(Quant::MiddleBits(config), counts[i-1], counts[0], counts[i], config); } // Crazy backwards thing so we initialize using pointers to ones that have already been initialized for (unsigned char i = counts.size() - 1; i >= 2; --i) { new (middle_begin_ + i - 2) Middle( middle_starts[i-2], quant_.MiddleBits(config), counts[i-1], counts[0], counts[i], (i == counts.size() - 1) ? static_cast(longest_) : static_cast(middle_begin_[i-1]), config); } longest_.Init(start, quant_.LongestBits(config), counts[0]); return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); } template void TrieSearch::LoadedBinary() { unigram_.LoadedBinary(); for (Middle *i = middle_begin_; i != middle_end_; ++i) { i->LoadedBinary(); } longest_.LoadedBinary(); } template void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) { std::string temporary_prefix; if (config.temporary_directory_prefix) { temporary_prefix = config.temporary_directory_prefix; } else if (config.write_mmap) { temporary_prefix = config.write_mmap; } else { temporary_prefix = file; } // At least 1MB sorting memory. 
SortedFiles sorted(config, f, counts, std::max(config.building_memory, 1048576), temporary_prefix, vocab); BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); } template class TrieSearch; template class TrieSearch; template class TrieSearch; template class TrieSearch; } // namespace trie } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/search_trie.hh ================================================ #ifndef LM_SEARCH_TRIE__ #define LM_SEARCH_TRIE__ #include "lm/config.hh" #include "lm/model_type.hh" #include "lm/return.hh" #include "lm/trie.hh" #include "lm/weights.hh" #include "util/file.hh" #include "util/file_piece.hh" #include #include namespace lm { namespace ngram { struct Backing; class SortedVocabulary; namespace trie { template class TrieSearch; class SortedFiles; template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); template class TrieSearch { public: typedef NodeRange Node; typedef ::lm::ngram::trie::UnigramPointer UnigramPointer; typedef typename Quant::MiddlePointer MiddlePointer; typedef typename Quant::LongestPointer LongestPointer; static const bool kDifferentRest = false; static const ModelType kModelType = static_cast(TRIE_SORTED + Quant::kModelTypeAdd + Bhiksha::kModelTypeAdd); static const unsigned int kVersion = 1; static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { Quant::UpdateConfigFromBinary(fd, counts, config); util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0])); Bhiksha::UpdateConfigFromBinary(fd, config); } static uint64_t Size(const std::vector &counts, const Config &config) { uint64_t ret = Quant::Size(counts.size(), config) + Unigram::Size(counts[0]); for (unsigned char i = 1; i < counts.size() - 1; ++i) { ret += Middle::Size(Quant::MiddleBits(config), counts[i], counts[0], counts[i+1], config); } return ret + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); } TrieSearch() : middle_begin_(NULL), middle_end_(NULL) {} ~TrieSearch() { FreeMiddles(); } uint8_t *SetupMemory(uint8_t *start, const std::vector &counts, const Config &config); void LoadedBinary(); void InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, Backing &backing); unsigned char Order() const { return middle_end_ - middle_begin_ + 2; } ProbBackoff &UnknownUnigram() { return unigram_.Unknown(); } UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const { extend_left = static_cast(word); UnigramPointer ret(unigram_.Find(word, next)); independent_left = (next.begin == next.end); return ret; } MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const { return MiddlePointer(quant_, extend_length - 2, middle_begin_[extend_length - 2].ReadEntry(extend_pointer, node)); } MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_left) const { util::BitAddress address(middle_begin_[order_minus_2].Find(word, node, extend_left)); independent_left = (address.base == NULL) || (node.begin == node.end); return MiddlePointer(quant_, order_minus_2, address); } LongestPointer LookupLongest(WordIndex word, const Node &node) const { return LongestPointer(quant_, longest_.Find(word, node)); } bool FastMakeNode(const 
WordIndex *begin, const WordIndex *end, Node &node) const { assert(begin != end); bool independent_left; uint64_t ignored; LookupUnigram(*begin, node, independent_left, ignored); for (const WordIndex *i = begin + 1; i < end; ++i) { if (independent_left || !LookupMiddle(i - begin - 1, *i, node, independent_left, ignored).Found()) return false; } return true; } private: friend void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); // Middles are managed manually so we can delay construction and they don't have to be copyable. void FreeMiddles() { for (const Middle *i = middle_begin_; i != middle_end_; ++i) { i->~Middle(); } free(middle_begin_); } typedef trie::BitPackedMiddle Middle; typedef trie::BitPackedLongest Longest; Longest longest_; Middle *middle_begin_, *middle_end_; Quant quant_; typedef ::lm::ngram::trie::Unigram Unigram; Unigram unigram_; }; } // namespace trie } // namespace ngram } // namespace lm #endif // LM_SEARCH_TRIE__ ================================================ FILE: src/kenlm/lm/sizes.cc ================================================ #include "lm/sizes.hh" #include "lm/model.hh" #include "util/file_piece.hh" #include #include namespace lm { namespace ngram { void ShowSizes(const std::vector &counts, const lm::ngram::Config &config) { uint64_t sizes[6]; sizes[0] = ProbingModel::Size(counts, config); sizes[1] = RestProbingModel::Size(counts, config); sizes[2] = TrieModel::Size(counts, config); sizes[3] = QuantTrieModel::Size(counts, config); sizes[4] = ArrayTrieModel::Size(counts, config); sizes[5] = QuantArrayTrieModel::Size(counts, config); uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); uint64_t divide; char prefix; if (min_length < (1 << 10) * 10) { prefix = ' '; divide = 1; } else if (min_length < (1 << 20) * 10) { prefix = 'k'; divide = 1 << 10; } else if (min_length < (1ULL << 30) * 10) { prefix = 'M'; divide = 1 << 20; } else { prefix = 'G'; divide = 1 << 30; } long int length = std::max(2, static_cast(ceil(log10((double) max_length / divide)))); std::cerr << "Memory estimate for binary LM:\ntype "; // right align bytes. 
for (long int i = 0; i < length - 2; ++i) std::cerr << ' '; std::cerr << prefix << "B\n" "probing " << std::setw(length) << (sizes[0] / divide) << " assuming -p " << config.probing_multiplier << "\n" "probing " << std::setw(length) << (sizes[1] / divide) << " assuming -r models -p " << config.probing_multiplier << "\n" "trie " << std::setw(length) << (sizes[2] / divide) << " without quantization\n" "trie " << std::setw(length) << (sizes[3] / divide) << " assuming -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits << " quantization \n" "trie " << std::setw(length) << (sizes[4] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " array pointer compression\n" "trie " << std::setw(length) << (sizes[5] / divide) << " assuming -a " << (unsigned)config.pointer_bhiksha_bits << " -q " << (unsigned)config.prob_bits << " -b " << (unsigned)config.backoff_bits<< " array pointer compression and quantization\n"; } void ShowSizes(const std::vector &counts) { lm::ngram::Config config; ShowSizes(counts, config); } void ShowSizes(const char *file, const lm::ngram::Config &config) { std::vector counts; util::FilePiece f(file); lm::ReadARPACounts(f, counts); ShowSizes(counts, config); } }} //namespaces ================================================ FILE: src/kenlm/lm/sizes.hh ================================================ #ifndef LM_SIZES__ #define LM_SIZES__ #include #include namespace lm { namespace ngram { struct Config; void ShowSizes(const std::vector &counts, const lm::ngram::Config &config); void ShowSizes(const std::vector &counts); void ShowSizes(const char *file, const lm::ngram::Config &config); }} // namespaces #endif // LM_SIZES__ ================================================ FILE: src/kenlm/lm/state.hh ================================================ #ifndef LM_STATE__ #define LM_STATE__ #include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/murmur_hash.hh" #include namespace lm { namespace ngram { // This is a POD but if you want memcmp to return the same as operator==, call // ZeroRemaining first. class State { public: bool operator==(const State &other) const { if (length != other.length) return false; return !memcmp(words, other.words, length * sizeof(WordIndex)); } // Three way comparison function. int Compare(const State &other) const { if (length != other.length) return length < other.length ? -1 : 1; return memcmp(words, other.words, length * sizeof(WordIndex)); } bool operator<(const State &other) const { if (length != other.length) return length < other.length; return memcmp(words, other.words, length * sizeof(WordIndex)) < 0; } // Call this before using raw memcmp. void ZeroRemaining() { for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) { words[i] = 0; backoff[i] = 0.0; } } unsigned char Length() const { return length; } // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD. // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit. 
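  // (Illustrative usage of State, not part of the original source; "model" stands for
  // any loaded concrete Model.) Unused trailing slots may hold garbage, so zero them
  // before hashing or memcmp-ing the arrays directly:
  //
  //   lm::ngram::State a = model.BeginSentenceState(), b = a;
  //   a.ZeroRemaining(); b.ZeroRemaining();
  //   bool same = a.length == b.length
  //            && !memcmp(a.words, b.words, sizeof(a.words))
  //            && !memcmp(a.backoff, b.backoff, sizeof(a.backoff));  // agrees with (a == b)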
WordIndex words[KENLM_MAX_ORDER - 1]; float backoff[KENLM_MAX_ORDER - 1]; unsigned char length; }; typedef State Right; inline uint64_t hash_value(const State &state, uint64_t seed = 0) { return util::MurmurHashNative(state.words, sizeof(WordIndex) * state.length, seed); } struct Left { bool operator==(const Left &other) const { return length == other.length && (!length || (pointers[length - 1] == other.pointers[length - 1] && full == other.full)); } int Compare(const Left &other) const { if (length < other.length) return -1; if (length > other.length) return 1; if (length == 0) return 0; // Must be full. if (pointers[length - 1] > other.pointers[length - 1]) return 1; if (pointers[length - 1] < other.pointers[length - 1]) return -1; return (int)full - (int)other.full; } bool operator<(const Left &other) const { return Compare(other) == -1; } void ZeroRemaining() { for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i) *i = 0; } uint64_t pointers[KENLM_MAX_ORDER - 1]; unsigned char length; bool full; }; inline uint64_t hash_value(const Left &left) { unsigned char add[2]; add[0] = left.length; add[1] = left.full; return util::MurmurHashNative(add, 2, left.length ? left.pointers[left.length - 1] : 0); } struct ChartState { bool operator==(const ChartState &other) { return (right == other.right) && (left == other.left); } int Compare(const ChartState &other) const { int lres = left.Compare(other.left); if (lres) return lres; return right.Compare(other.right); } bool operator<(const ChartState &other) const { return Compare(other) == -1; } void ZeroRemaining() { left.ZeroRemaining(); right.ZeroRemaining(); } Left left; State right; }; inline uint64_t hash_value(const ChartState &state) { return hash_value(state.right, hash_value(state.left)); } } // namespace ngram } // namespace lm #endif // LM_STATE__ ================================================ FILE: src/kenlm/lm/test.arpa ================================================ \data\ ngram 1=37 ngram 2=47 ngram 3=11 ngram 4=6 ngram 5=4 \1-grams: -1.383514 , -0.30103 -1.139057 . -0.845098 -1.029493 -99 -0.4149733 -1.995635 -20 -1.285941 a -0.69897 -1.687872 also -0.30103 -1.687872 beyond -0.30103 -1.687872 biarritz -0.30103 -1.687872 call -0.30103 -1.687872 concerns -0.30103 -1.687872 consider -0.30103 -1.687872 considering -0.30103 -1.687872 for -0.30103 -1.509559 higher -0.30103 -1.687872 however -0.30103 -1.687872 i -0.30103 -1.687872 immediate -0.30103 -1.687872 in -0.30103 -1.687872 is -0.30103 -1.285941 little -0.69897 -1.383514 loin -0.30103 -1.687872 look -0.30103 -1.285941 looking -0.4771212 -1.206319 more -0.544068 -1.509559 on -0.4771212 -1.509559 screening -0.4771212 -1.687872 small -0.30103 -1.687872 the -0.30103 -1.687872 to -0.30103 -1.687872 watch -0.30103 -1.687872 watching -0.30103 -1.687872 what -0.30103 -1.687872 would -0.30103 -3.141592 foo -2.718281 bar 3.0 -6.535897 baz -0.0 \2-grams: -0.6925742 , . -0.7522095 , however -0.7522095 , is -0.0602359 . -0.4846522 looking -0.4771214 -1.051485 screening -1.07153 the -1.07153 watching -1.07153 what -0.09132547 a little -0.69897 -0.2922095 also call -0.2922095 beyond immediate -0.2705918 biarritz . -0.2922095 call for -0.2922095 concerns in -0.2922095 consider watch -0.2922095 considering consider -0.2834328 for , -0.5511513 higher more -0.5845945 higher small -0.2834328 however , -0.2922095 i would -0.2922095 immediate concerns -0.2922095 in biarritz -0.2922095 is to -0.09021038 little more -0.1998621 -0.7273645 loin , -0.6925742 loin . 
-0.6708385 loin -0.2922095 look beyond -0.4638903 looking higher -0.4638903 looking on -0.4771212 -0.5136299 more . -0.4771212 -0.3561665 more loin -0.1649931 on a -0.4771213 -0.1649931 screening a -0.4771213 -0.2705918 small . -0.287799 the screening -0.2922095 to look -0.2622373 watch -0.2922095 watching considering -0.2922095 what i -0.2922095 would also -2 also would -6 -15 -2 -4 however -1 -6 foo bar \3-grams: -0.01916512 more . -0.0283603 on a little -0.4771212 -0.0283603 screening a little -0.4771212 -0.01660496 a little more -0.09409451 -0.3488368 looking higher -0.3488368 looking on -0.4771212 -0.1892331 little more loin -0.04835128 looking on a -0.4771212 -3 also would consider -7 -6 however -12 -7 to look good \4-grams: -0.009249173 looking on a little -0.4771212 -0.005464747 on a little more -0.4771212 -0.005464747 screening a little more -0.1453306 a little more loin -0.01552657 looking on a -0.4771212 -4 also would consider higher -8 \5-grams: -0.003061223 looking on a little -0.001813953 looking on a little more -0.0432557 on a little more loin -5 also would consider higher looking \end\ ================================================ FILE: src/kenlm/lm/test_nounk.arpa ================================================ \data\ ngram 1=36 ngram 2=45 ngram 3=10 ngram 4=6 ngram 5=4 \1-grams: -1.383514 , -0.30103 -1.139057 . -0.845098 -1.029493 -99 -0.4149733 -1.285941 a -0.69897 -1.687872 also -0.30103 -1.687872 beyond -0.30103 -1.687872 biarritz -0.30103 -1.687872 call -0.30103 -1.687872 concerns -0.30103 -1.687872 consider -0.30103 -1.687872 considering -0.30103 -1.687872 for -0.30103 -1.509559 higher -0.30103 -1.687872 however -0.30103 -1.687872 i -0.30103 -1.687872 immediate -0.30103 -1.687872 in -0.30103 -1.687872 is -0.30103 -1.285941 little -0.69897 -1.383514 loin -0.30103 -1.687872 look -0.30103 -1.285941 looking -0.4771212 -1.206319 more -0.544068 -1.509559 on -0.4771212 -1.509559 screening -0.4771212 -1.687872 small -0.30103 -1.687872 the -0.30103 -1.687872 to -0.30103 -1.687872 watch -0.30103 -1.687872 watching -0.30103 -1.687872 what -0.30103 -1.687872 would -0.30103 -3.141592 foo -2.718281 bar 3.0 -6.535897 baz -0.0 \2-grams: -0.6925742 , . -0.7522095 , however -0.7522095 , is -0.0602359 . -0.4846522 looking -0.4771214 -1.051485 screening -1.07153 the -1.07153 watching -1.07153 what -0.09132547 a little -0.69897 -0.2922095 also call -0.2922095 beyond immediate -0.2705918 biarritz . -0.2922095 call for -0.2922095 concerns in -0.2922095 consider watch -0.2922095 considering consider -0.2834328 for , -0.5511513 higher more -0.5845945 higher small -0.2834328 however , -0.2922095 i would -0.2922095 immediate concerns -0.2922095 in biarritz -0.2922095 is to -0.09021038 little more -0.1998621 -0.7273645 loin , -0.6925742 loin . -0.6708385 loin -0.2922095 look beyond -0.4638903 looking higher -0.4638903 looking on -0.4771212 -0.5136299 more . -0.4771212 -0.3561665 more loin -0.1649931 on a -0.4771213 -0.1649931 screening a -0.4771213 -0.2705918 small . -0.287799 the screening -0.2922095 to look -0.2622373 watch -0.2922095 watching considering -0.2922095 what i -0.2922095 would also -2 also would -6 -6 foo bar \3-grams: -0.01916512 more . 
-0.0283603 on a little -0.4771212 -0.0283603 screening a little -0.4771212 -0.01660496 a little more -0.09409451 -0.3488368 looking higher -0.3488368 looking on -0.4771212 -0.1892331 little more loin -0.04835128 looking on a -0.4771212 -3 also would consider -7 -7 to look good \4-grams: -0.009249173 looking on a little -0.4771212 -0.005464747 on a little more -0.4771212 -0.005464747 screening a little more -0.1453306 a little more loin -0.01552657 looking on a -0.4771212 -4 also would consider higher -8 \5-grams: -0.003061223 looking on a little -0.001813953 looking on a little more -0.0432557 on a little more loin -5 also would consider higher looking \end\ ================================================ FILE: src/kenlm/lm/trie.cc ================================================ #include "lm/trie.hh" #include "lm/bhiksha.hh" #include "util/bit_packing.hh" #include "util/exception.hh" #include "util/sorted_uniform.hh" #include namespace lm { namespace ngram { namespace trie { namespace { class KeyAccessor { public: KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits) : base_(reinterpret_cast(base)), key_mask_(key_mask), key_bits_(key_bits), total_bits_(total_bits) {} typedef uint64_t Key; Key operator()(uint64_t index) const { return util::ReadInt57(base_, index * static_cast(total_bits_), key_bits_, key_mask_); } private: const uint8_t *const base_; const WordIndex key_mask_; const uint8_t key_bits_, total_bits_; }; bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits, uint64_t begin_index, uint64_t end_index, const uint64_t max_vocab, const uint64_t key, uint64_t &at_index) { KeyAccessor accessor(base, key_mask, key_bits, total_bits); if (!util::BoundedSortedUniformFind::T>(accessor, begin_index - 1, (uint64_t)0, end_index, max_vocab, key, at_index)) return false; return true; } } // namespace uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits; // Extra entry for next pointer at the end. // +7 then / 8 to round up bits and convert to bytes // +sizeof(uint64_t) so that ReadInt57 etc don't go segfault. // Note that this waste is O(order), not O(number of ngrams). return ((1 + entries) * total_bits + 7) / 8 + sizeof(uint64_t); } void BitPacked::BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits) { util::BitPackingSanity(); word_bits_ = util::RequiredBits(max_vocab); word_mask_ = (1ULL << word_bits_) - 1ULL; if (word_bits_ > 57) UTIL_THROW(util::Exception, "Sorry, word indices more than " << (1ULL << 57) << " are not implemented. Edit util/bit_packing.hh and fix the bit packing functions."); total_bits_ = word_bits_ + remaining_bits; base_ = static_cast(base); insert_index_ = 0; max_vocab_ = max_vocab; } template uint64_t BitPackedMiddle::Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const Config &config) { return Bhiksha::Size(entries + 1, max_ptr, config) + BaseSize(entries, max_vocab, quant_bits + Bhiksha::InlineBits(entries + 1, max_ptr, config)); } template BitPackedMiddle::BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config) : BitPacked(), quant_bits_(quant_bits), // If the offset of the method changes, also change TrieSearch::UpdateConfigFromBinary. 
bhiksha_(base, entries + 1, max_next, config), next_source_(&next_source) { if (entries + 1 >= (1ULL << 57) || (max_next >= (1ULL << 57))) UTIL_THROW(util::Exception, "Sorry, this does not support more than " << (1ULL << 57) << " n-grams of a particular order. Edit util/bit_packing.hh and fix the bit packing functions."); BaseInit(reinterpret_cast(base) + Bhiksha::Size(entries + 1, max_next, config), max_vocab, quant_bits_ + bhiksha_.InlineBits()); } template util::BitAddress BitPackedMiddle::Insert(WordIndex word) { assert(word <= word_mask_); uint64_t at_pointer = insert_index_ * total_bits_; util::WriteInt57(base_, at_pointer, word_bits_, word); at_pointer += word_bits_; util::BitAddress ret(base_, at_pointer); at_pointer += quant_bits_; uint64_t next = next_source_->InsertIndex(); bhiksha_.WriteNext(base_, at_pointer, insert_index_, next); ++insert_index_; return ret; } template util::BitAddress BitPackedMiddle::Find(WordIndex word, NodeRange &range, uint64_t &pointer) const { uint64_t at_pointer; if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) { return util::BitAddress(NULL, 0); } pointer = at_pointer; at_pointer *= total_bits_; at_pointer += word_bits_; bhiksha_.ReadNext(base_, at_pointer + quant_bits_, pointer, total_bits_, range); return util::BitAddress(base_, at_pointer); } template void BitPackedMiddle::FinishedLoading(uint64_t next_end, const Config &config) { uint64_t last_next_write = (insert_index_ + 1) * total_bits_ - bhiksha_.InlineBits(); bhiksha_.WriteNext(base_, last_next_write, insert_index_ + 1, next_end); bhiksha_.FinishedLoading(config); } util::BitAddress BitPackedLongest::Insert(WordIndex index) { assert(index <= word_mask_); uint64_t at_pointer = insert_index_ * total_bits_; util::WriteInt57(base_, at_pointer, word_bits_, index); at_pointer += word_bits_; ++insert_index_; return util::BitAddress(base_, at_pointer); } util::BitAddress BitPackedLongest::Find(WordIndex word, const NodeRange &range) const { uint64_t at_pointer; if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) return util::BitAddress(NULL, 0); at_pointer = at_pointer * total_bits_ + word_bits_; return util::BitAddress(base_, at_pointer); } template class BitPackedMiddle; template class BitPackedMiddle; } // namespace trie } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/trie.hh ================================================ #ifndef LM_TRIE__ #define LM_TRIE__ #include "lm/weights.hh" #include "lm/word_index.hh" #include "util/bit_packing.hh" #include #include namespace lm { namespace ngram { struct Config; namespace trie { struct NodeRange { uint64_t begin, end; }; // TODO: if the number of unigrams is a concern, also bit pack these records. struct UnigramValue { ProbBackoff weights; uint64_t next; uint64_t Next() const { return next; } }; class UnigramPointer { public: explicit UnigramPointer(const ProbBackoff &to) : to_(&to) {} UnigramPointer() : to_(NULL) {} bool Found() const { return to_ != NULL; } float Prob() const { return to_->prob; } float Backoff() const { return to_->backoff; } float Rest() const { return Prob(); } private: const ProbBackoff *to_; }; class Unigram { public: Unigram() {} void Init(void *start) { unigram_ = static_cast(start); } static uint64_t Size(uint64_t count) { // +1 in case unknown doesn't appear. +1 for the final next. 
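    // (Illustrative arithmetic, not part of the original source; exact sizes depend on
    // the platform's struct layout.) With an 8-byte ProbBackoff and an 8-byte next
    // pointer, UnigramValue is typically 16 bytes, so a 37-word vocabulary reserves
    // (37 + 2) * 16 = 624 bytes: one spare slot in case <unk> never appears, plus one
    // trailing entry whose next field closes the last word's child range.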
return (count + 2) * sizeof(UnigramValue); } const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index].weights; } ProbBackoff &Unknown() { return unigram_[0].weights; } UnigramValue *Raw() { return unigram_; } void LoadedBinary() {} UnigramPointer Find(WordIndex word, NodeRange &next) const { UnigramValue *val = unigram_ + word; next.begin = val->next; next.end = (val+1)->next; return UnigramPointer(val->weights); } private: UnigramValue *unigram_; }; class BitPacked { public: BitPacked() {} uint64_t InsertIndex() const { return insert_index_; } protected: static uint64_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); void BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits); uint8_t word_bits_; uint8_t total_bits_; uint64_t word_mask_; uint8_t *base_; uint64_t insert_index_, max_vocab_; }; template class BitPackedMiddle : public BitPacked { public: static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); // next_source need not be initialized. BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config); util::BitAddress Insert(WordIndex word); void FinishedLoading(uint64_t next_end, const Config &config); void LoadedBinary() { bhiksha_.LoadedBinary(); } util::BitAddress Find(WordIndex word, NodeRange &range, uint64_t &pointer) const; util::BitAddress ReadEntry(uint64_t pointer, NodeRange &range) { uint64_t addr = pointer * total_bits_; addr += word_bits_; bhiksha_.ReadNext(base_, addr + quant_bits_, pointer, total_bits_, range); return util::BitAddress(base_, addr); } private: uint8_t quant_bits_; Bhiksha bhiksha_; const BitPacked *next_source_; }; class BitPackedLongest : public BitPacked { public: static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { return BaseSize(entries, max_vocab, quant_bits); } BitPackedLongest() {} void Init(void *base, uint8_t quant_bits, uint64_t max_vocab) { BaseInit(base, max_vocab, quant_bits); } void LoadedBinary() {} util::BitAddress Insert(WordIndex word); util::BitAddress Find(WordIndex word, const NodeRange &node) const; private: uint8_t quant_bits_; }; } // namespace trie } // namespace ngram } // namespace lm #endif // LM_TRIE__ ================================================ FILE: src/kenlm/lm/trie_sort.cc ================================================ #include "lm/trie_sort.hh" #include "lm/config.hh" #include "lm/lm_exception.hh" #include "lm/read_arpa.hh" #include "lm/vocab.hh" #include "lm/weights.hh" #include "lm/word_index.hh" #include "util/file_piece.hh" #include "util/mmap.hh" #include "util/proxy_iterator.hh" #include "util/sized_iterator.hh" #include #include #include #include #include #include #include namespace lm { namespace ngram { namespace trie { namespace { typedef util::SizedIterator NGramIter; // Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams. 
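// (Illustrative note, not part of the original source.) Each sorted record is laid out as
//   [ word_0 | word_1 ... word_{order-1} | weights ]
// WriteContextFile below re-sorts the same buffer by each record's trailing (order - 1)
// stored words: a proxy starts one WordIndex into the record and only "pays attention" to
// (order - 1) * sizeof(WordIndex) bytes, treating the skipped word and the weights as
// inert padding between keys. Roughly:
//
//   PartialViewProxy view(begin + sizeof(WordIndex),         // skip the first stored word
//                         entry_size,                        // stride between records
//                         (order - 1) * sizeof(WordIndex));  // bytes compared and copied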
class PartialViewProxy { public: PartialViewProxy() : attention_size_(0), inner_() {} PartialViewProxy(void *ptr, std::size_t block_size, std::size_t attention_size) : attention_size_(attention_size), inner_(ptr, block_size) {} operator std::string() const { return std::string(reinterpret_cast(inner_.Data()), attention_size_); } PartialViewProxy &operator=(const PartialViewProxy &from) { memcpy(inner_.Data(), from.inner_.Data(), attention_size_); return *this; } PartialViewProxy &operator=(const std::string &from) { memcpy(inner_.Data(), from.data(), attention_size_); return *this; } const void *Data() const { return inner_.Data(); } void *Data() { return inner_.Data(); } private: friend class util::ProxyIterator; typedef std::string value_type; const std::size_t attention_size_; typedef util::SizedInnerIterator InnerIterator; InnerIterator &Inner() { return inner_; } const InnerIterator &Inner() const { return inner_; } InnerIterator inner_; }; typedef util::ProxyIterator PartialIter; FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &temp_prefix) { util::scoped_fd file(util::MakeTemp(temp_prefix)); util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); return util::FDOpenOrThrow(file); } FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) { const size_t context_size = sizeof(WordIndex) * (order - 1); // Sort just the contexts using the same memory. PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size)); PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size)); #if defined(_WIN32) || defined(_WIN64) std::stable_sort #else std::sort #endif (context_begin, context_end, util::SizedCompare(EntryCompare(order - 1))); util::scoped_FILE out(util::FMakeTemp(temp_prefix)); // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. if (context_begin == context_end) return out.release(); PartialIter i(context_begin); util::WriteOrThrow(out.get(), i->Data(), context_size); const void *previous = i->Data(); ++i; for (; i != context_end; ++i) { if (memcmp(previous, i->Data(), context_size)) { util::WriteOrThrow(out.get(), i->Data(), context_size); previous = i->Data(); } } return out.release(); } struct ThrowCombine { void operator()(std::size_t /*entry_size*/, const void * /*first*/, const void * /*second*/, FILE * /*out*/) const { UTIL_THROW(FormatLoadException, "Duplicate n-gram detected."); } }; // Useful for context files that just contain records with no value. 
struct FirstCombine { void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const { util::WriteOrThrow(out, first, entry_size); } }; template FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const std::string &temp_prefix, std::size_t weights_size, unsigned char order, const Combine &combine) { std::size_t entry_size = sizeof(WordIndex) * order + weights_size; RecordReader first, second; first.Init(first_file, entry_size); second.Init(second_file, entry_size); util::scoped_FILE out_file(util::FMakeTemp(temp_prefix)); EntryCompare less(order); while (first && second) { if (less(first.Data(), second.Data())) { util::WriteOrThrow(out_file.get(), first.Data(), entry_size); ++first; } else if (less(second.Data(), first.Data())) { util::WriteOrThrow(out_file.get(), second.Data(), entry_size); ++second; } else { combine(entry_size, first.Data(), second.Data(), out_file.get()); ++first; ++second; } } for (RecordReader &remains = (first ? first : second); remains; ++remains) { util::WriteOrThrow(out_file.get(), remains.Data(), entry_size); } return out_file.release(); } } // namespace void RecordReader::Init(FILE *file, std::size_t entry_size) { entry_size_ = entry_size; data_.reset(malloc(entry_size)); UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer"); file_ = file; if (file) { rewind(file); remains_ = true; ++*this; } else { remains_ = false; } } void RecordReader::Overwrite(const void *start, std::size_t amount) { long internal = (uint8_t*)start - (uint8_t*)data_.get(); UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); util::WriteOrThrow(file_, start, amount); long forward = entry_size_ - internal - amount; #if !defined(_WIN32) && !defined(_WIN64) if (forward) #endif UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision"); } void RecordReader::Rewind() { if (file_) { rewind(file_); remains_ = true; ++*this; } else { remains_ = false; } } SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { PositiveProbWarn warn(config.positive_log_probability); unigram_.reset(util::MakeTemp(file_prefix)); { // In case appears. size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff); util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out); Read1Grams(f, counts[0], vocab, reinterpret_cast(unigram_mmap.get()), warn); CheckSpecials(config, vocab); if (!vocab.SawUnk()) ++counts[0]; } // Only use as much buffer as we need. 
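  // (Illustrative arithmetic, not part of the original source; assumes 4-byte WordIndex
  // and float.) The loop below sizes the read buffer for the most demanding order:
  //   middle orders:  (order * sizeof(WordIndex) + 2 * sizeof(float)) * counts[order - 1]
  //   highest order:  (order * sizeof(WordIndex) + sizeof(float)) * counts.back()
  // For example, one million trigrams in a longer model need (3*4 + 2*4) * 1000000 bytes,
  // about 20 MB; the user-supplied buffer is then capped at that maximum, never grown.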
size_t buffer_use = 0; for (unsigned int order = 2; order < counts.size(); ++order) { buffer_use = std::max(buffer_use, static_cast((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1])); } buffer_use = std::max(buffer_use, static_cast((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back())); buffer = std::min(buffer, buffer_use); util::scoped_malloc mem; mem.reset(malloc(buffer)); if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer); for (unsigned char order = 2; order <= counts.size(); ++order) { ConvertToSorted(f, vocab, counts, file_prefix, order, warn, mem.get(), buffer); } ReadEnd(f); } namespace { class Closer { public: explicit Closer(std::deque &files) : files_(files) {} ~Closer() { for (std::deque::iterator i = files_.begin(); i != files_.end(); ++i) { util::scoped_FILE deleter(*i); } } void PopFront() { util::scoped_FILE deleter(files_.front()); files_.pop_front(); } private: std::deque &files_; }; } // namespace void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { ReadNGramHeader(f, order); const size_t count = counts[order - 1]; // Size of weights. Does it include backoff? const size_t words_size = sizeof(WordIndex) * order; const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float)); const size_t entry_size = words_size + weights_size; const size_t batch_size = std::min(count, mem_size / entry_size); uint8_t *const begin = reinterpret_cast(mem); std::deque files, contexts; Closer files_closer(files), contexts_closer(contexts); for (std::size_t batch = 0, done = 0; done < count; ++batch) { uint8_t *out = begin; uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size; if (order == counts.size()) { for (; out != out_end; out += entry_size) { ReadNGram(f, order, vocab, reinterpret_cast(out), *reinterpret_cast(out + words_size), warn); } } else { for (; out != out_end; out += entry_size) { ReadNGram(f, order, vocab, reinterpret_cast(out), *reinterpret_cast(out + words_size), warn); } } // Sort full records by full n-gram. util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size); // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies. #if defined(_WIN32) || defined(_WIN64) std::stable_sort #else std::sort #endif (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare(EntryCompare(order))); files.push_back(DiskFlush(begin, out_end, file_prefix)); contexts.push_back(WriteContextFile(begin, out_end, file_prefix, entry_size, order)); done += (out_end - begin) / entry_size; } // All individual files created. Merge them. while (files.size() > 1) { files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine())); files_closer.PopFront(); files_closer.PopFront(); contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], file_prefix, 0, order - 1, FirstCombine())); contexts_closer.PopFront(); contexts_closer.PopFront(); } if (!files.empty()) { // Steal from closers. 
full_[order - 2].reset(files.front()); files.pop_front(); context_[order - 2].reset(contexts.front()); contexts.pop_front(); } } } // namespace trie } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/trie_sort.hh ================================================ // Step of trie builder: create sorted files. #ifndef LM_TRIE_SORT__ #define LM_TRIE_SORT__ #include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/file.hh" #include "util/scoped.hh" #include #include #include #include #include namespace util { class FilePiece; } // namespace util namespace lm { class PositiveProbWarn; namespace ngram { class SortedVocabulary; struct Config; namespace trie { class EntryCompare : public std::binary_function { public: explicit EntryCompare(unsigned char order) : order_(order) {} bool operator()(const void *first_void, const void *second_void) const { const WordIndex *first = static_cast(first_void); const WordIndex *second = static_cast(second_void); const WordIndex *end = first + order_; for (; first != end; ++first, ++second) { if (*first < *second) return true; if (*first > *second) return false; } return false; } private: unsigned char order_; }; class RecordReader { public: RecordReader() : remains_(true) {} void Init(FILE *file, std::size_t entry_size); void *Data() { return data_.get(); } const void *Data() const { return data_.get(); } RecordReader &operator++() { std::size_t ret = fread(data_.get(), entry_size_, 1, file_); if (!ret) { UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); remains_ = false; } return *this; } operator bool() const { return remains_; } void Rewind(); std::size_t EntrySize() const { return entry_size_; } void Overwrite(const void *start, std::size_t amount); private: FILE *file_; util::scoped_malloc data_; bool remains_; std::size_t entry_size_; }; class SortedFiles { public: // Build from ARPA SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); int StealUnigram() { return unigram_.release(); } FILE *Full(unsigned char order) { return full_[order - 2].get(); } FILE *Context(unsigned char of_order) { return context_[of_order - 2].get(); } private: void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); util::scoped_fd unigram_; util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; }; } // namespace trie } // namespace ngram } // namespace lm #endif // LM_TRIE_SORT__ ================================================ FILE: src/kenlm/lm/value.hh ================================================ #ifndef LM_VALUE__ #define LM_VALUE__ #include "lm/model_type.hh" #include "lm/value_build.hh" #include "lm/weights.hh" #include "util/bit_packing.hh" #include namespace lm { namespace ngram { // Template proxy for probing unigrams and middle. 
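// (Illustrative note, not part of the original source.) The probing tables reuse the sign
// bit of the stored log probability as a left-extension marker (log10 probabilities are
// not positive, so the bit is otherwise unused). Prob() below therefore forces the sign
// bit back on before returning, and IndependentLeft() just tests it. Roughly, with
// stored_prob standing for a value read from the table:
//
//   util::FloatEnc enc;
//   enc.f = stored_prob;
//   bool extends_left = !(enc.i & util::kSignBit);  // cleared sign bit: a longer n-gram extends this one
//   enc.i |= util::kSignBit;                        // restore the negative log probability
//   float prob = enc.f;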
template class GenericProbingProxy { public: explicit GenericProbingProxy(const Weights &to) : to_(&to) {} GenericProbingProxy() : to_(0) {} bool Found() const { return to_ != 0; } float Prob() const { util::FloatEnc enc; enc.f = to_->prob; enc.i |= util::kSignBit; return enc.f; } float Backoff() const { return to_->backoff; } bool IndependentLeft() const { util::FloatEnc enc; enc.f = to_->prob; return enc.i & util::kSignBit; } protected: const Weights *to_; }; // Basic proxy for trie unigrams. template class GenericTrieUnigramProxy { public: explicit GenericTrieUnigramProxy(const Weights &to) : to_(&to) {} GenericTrieUnigramProxy() : to_(0) {} bool Found() const { return to_ != 0; } float Prob() const { return to_->prob; } float Backoff() const { return to_->backoff; } float Rest() const { return Prob(); } protected: const Weights *to_; }; struct BackoffValue { typedef ProbBackoff Weights; static const ModelType kProbingModelType = PROBING; class ProbingProxy : public GenericProbingProxy { public: explicit ProbingProxy(const Weights &to) : GenericProbingProxy(to) {} ProbingProxy() {} float Rest() const { return Prob(); } }; class TrieUnigramProxy : public GenericTrieUnigramProxy { public: explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy(to) {} TrieUnigramProxy() {} float Rest() const { return Prob(); } }; struct ProbingEntry { typedef uint64_t Key; typedef Weights Value; uint64_t key; ProbBackoff value; uint64_t GetKey() const { return key; } }; struct TrieUnigramValue { Weights weights; uint64_t next; uint64_t Next() const { return next; } }; const static bool kDifferentRest = false; template void Callback(const Config &, unsigned int, typename Model::Vocabulary &, C &callback) { NoRestBuild build; callback(build); } }; struct RestValue { typedef RestWeights Weights; static const ModelType kProbingModelType = REST_PROBING; class ProbingProxy : public GenericProbingProxy { public: explicit ProbingProxy(const Weights &to) : GenericProbingProxy(to) {} ProbingProxy() {} float Rest() const { return to_->rest; } }; class TrieUnigramProxy : public GenericTrieUnigramProxy { public: explicit TrieUnigramProxy(const Weights &to) : GenericTrieUnigramProxy(to) {} TrieUnigramProxy() {} float Rest() const { return to_->rest; } }; // gcc 4.1 doesn't properly back dependent types :-(. 
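// (Illustrative note, not part of the original source; exact sizes are platform
// dependent.) Without packing, the 8-byte key plus the 12-byte RestWeights value would
// typically be padded to 24 bytes per probing-table entry; #pragma pack(4) below keeps
// the entry at 20 bytes.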
#pragma pack(push) #pragma pack(4) struct ProbingEntry { typedef uint64_t Key; typedef Weights Value; Key key; Value value; Key GetKey() const { return key; } }; struct TrieUnigramValue { Weights weights; uint64_t next; uint64_t Next() const { return next; } }; #pragma pack(pop) const static bool kDifferentRest = true; template void Callback(const Config &config, unsigned int order, typename Model::Vocabulary &vocab, C &callback) { switch (config.rest_function) { case Config::REST_MAX: { MaxRestBuild build; callback(build); } break; case Config::REST_LOWER: { LowerRestBuild build(config, order, vocab); callback(build); } break; } } }; } // namespace ngram } // namespace lm #endif // LM_VALUE__ ================================================ FILE: src/kenlm/lm/value_build.cc ================================================ #include "lm/value_build.hh" #include "lm/model.hh" #include "lm/read_arpa.hh" namespace lm { namespace ngram { template LowerRestBuild::LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab) { UTIL_THROW_IF(config.rest_lower_files.size() != order - 1, ConfigException, "This model has order " << order << " so there should be " << (order - 1) << " lower-order models for rest cost purposes."); Config for_lower = config; for_lower.rest_lower_files.clear(); // Unigram models aren't supported, so this is a custom loader. // TODO: optimize the unigram loading? { util::FilePiece uni(config.rest_lower_files[0].c_str()); std::vector number; ReadARPACounts(uni, number); UTIL_THROW_IF(number.size() != 1, FormatLoadException, "Expected the unigram model to have order 1, not " << number.size()); ReadNGramHeader(uni, 1); unigrams_.resize(number[0]); unigrams_[0] = config.unknown_missing_logprob; PositiveProbWarn warn; for (uint64_t i = 0; i < number[0]; ++i) { WordIndex w; Prob entry; ReadNGram(uni, 1, vocab, &w, entry, warn); unigrams_[w] = entry.prob; } } try { for (unsigned int i = 2; i < order; ++i) { models_.push_back(new Model(config.rest_lower_files[i - 1].c_str(), for_lower)); UTIL_THROW_IF(models_.back()->Order() != i, FormatLoadException, "Lower order file " << config.rest_lower_files[i-1] << " should have order " << i); } } catch (...) { for (typename std::vector::const_iterator i = models_.begin(); i != models_.end(); ++i) { delete *i; } models_.clear(); throw; } // TODO: force/check same vocab. } template LowerRestBuild::~LowerRestBuild() { for (typename std::vector::const_iterator i = models_.begin(); i != models_.end(); ++i) { delete *i; } } template class LowerRestBuild; } // namespace ngram } // namespace lm ================================================ FILE: src/kenlm/lm/value_build.hh ================================================ #ifndef LM_VALUE_BUILD__ #define LM_VALUE_BUILD__ #include "lm/weights.hh" #include "lm/word_index.hh" #include "util/bit_packing.hh" #include namespace lm { namespace ngram { struct Config; struct BackoffValue; struct RestValue; class NoRestBuild { public: typedef BackoffValue Value; NoRestBuild() {} void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} void SetRest(const WordIndex *, unsigned int, const ProbBackoff &) const {} template bool MarkExtends(ProbBackoff &weights, const Second &) const { util::UnsetSign(weights.prob); return false; } // Probing doesn't need to go back to unigram. 
const static bool kMarkEvenLower = false; }; class MaxRestBuild { public: typedef RestValue Value; MaxRestBuild() {} void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} void SetRest(const WordIndex *, unsigned int, RestWeights &weights) const { weights.rest = weights.prob; util::SetSign(weights.rest); } bool MarkExtends(RestWeights &weights, const RestWeights &to) const { util::UnsetSign(weights.prob); if (weights.rest >= to.rest) return false; weights.rest = to.rest; return true; } bool MarkExtends(RestWeights &weights, const Prob &to) const { util::UnsetSign(weights.prob); if (weights.rest >= to.prob) return false; weights.rest = to.prob; return true; } // Probing does need to go back to unigram. const static bool kMarkEvenLower = true; }; template <class Model> class LowerRestBuild { public: typedef RestValue Value; LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab); ~LowerRestBuild(); void SetRest(const WordIndex *, unsigned int, const Prob &/*prob*/) const {} void SetRest(const WordIndex *vocab_ids, unsigned int n, RestWeights &weights) const { typename Model::State ignored; if (n == 1) { weights.rest = unigrams_[*vocab_ids]; } else { weights.rest = models_[n-2]->FullScoreForgotState(vocab_ids + 1, vocab_ids + n, *vocab_ids, ignored).prob; } } template <class Second> bool MarkExtends(RestWeights &weights, const Second &) const { util::UnsetSign(weights.prob); return false; } const static bool kMarkEvenLower = false; std::vector<float> unigrams_; std::vector<const Model *> models_; }; } // namespace ngram } // namespace lm #endif // LM_VALUE_BUILD__ ================================================ FILE: src/kenlm/lm/virtual_interface.cc ================================================ #include "lm/virtual_interface.hh" #include "lm/lm_exception.hh" namespace lm { namespace base { Vocabulary::~Vocabulary() {} void Vocabulary::SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { begin_sentence_ = begin_sentence; end_sentence_ = end_sentence; not_found_ = not_found; } Model::~Model() {} } // namespace base } // namespace lm ================================================ FILE: src/kenlm/lm/virtual_interface.hh ================================================ #ifndef LM_VIRTUAL_INTERFACE__ #define LM_VIRTUAL_INTERFACE__ #include "lm/return.hh" #include "lm/word_index.hh" #include "util/string_piece.hh" #include <string> namespace lm { namespace base { template class ModelFacade; /* Vocabulary interface. Call Index(string) and get a word index for use in * calling Model. It provides faster convenience functions for <s>, </s>, and <unk> * although you can also find these using Index. * * Some models do not load the mapping from index to string. If you need this, * check if the model Vocabulary class implements such a function and access it * directly. * * The Vocabulary object is always owned by the Model and can be retrieved from * the Model using BaseVocabulary() for this abstract interface or * GetVocabulary() for the actual implementation (in which case you'll need the * actual implementation of the Model too). */ class Vocabulary { public: virtual ~Vocabulary(); WordIndex BeginSentence() const { return begin_sentence_; } WordIndex EndSentence() const { return end_sentence_; } WordIndex NotFound() const { return not_found_; } /* Most implementations allow StringPiece lookups and need only override * Index(StringPiece). SRI requires null termination and overrides all * three methods. 
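 *
 * Typical lookups through this interface (illustrative, not part of the original
 * source; "model" stands for any loaded lm::base::Model):
 *   const lm::base::Vocabulary &vocab = model.BaseVocabulary();
 *   lm::WordIndex w = vocab.Index("example");   // typically NotFound() for out-of-vocabulary words
 *   lm::WordIndex bos = vocab.BeginSentence();  // <s>
 *   lm::WordIndex eos = vocab.EndSentence();    // </s>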
*/ virtual WordIndex Index(const StringPiece &str) const = 0; virtual WordIndex Index(const std::string &str) const { return Index(StringPiece(str)); } virtual WordIndex Index(const char *str) const { return Index(StringPiece(str)); } protected: // Call SetSpecial afterward. Vocabulary() {} Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) { SetSpecial(begin_sentence, end_sentence, not_found); } void SetSpecial(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found); WordIndex begin_sentence_, end_sentence_, not_found_; private: // Disable copy constructors. They're private and undefined. // Ersatz boost::noncopyable. Vocabulary(const Vocabulary &); Vocabulary &operator=(const Vocabulary &); }; /* There are two ways to access a Model. * * * OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh). * * Every Model implements the scoring function: * float Score( * const Model::State &in_state, * const WordIndex new_word, * Model::State &out_state) const; * * It can also return the length of n-gram matched by the model: * FullScoreReturn FullScore( * const Model::State &in_state, * const WordIndex new_word, * Model::State &out_state) const; * * * There are also accessor functions: * const State &BeginSentenceState() const; * const State &NullContextState() const; * const Vocabulary &GetVocabulary() const; * unsigned int Order() const; * * NB: In case you're wondering why the model implementation looks like it's * missing these methods, see facade.hh. * * This is the fastest way to use a model and presents a normal State class to * be included in a hypothesis state structure. * * * OPTION 2: Use the virtual interface below. * * The virtual interface allow you to decide which Model to use at runtime * without templatizing everything on the Model type. However, each Model has * its own State class, so a single State cannot be efficiently provided (it * would require using the maximum memory of any Model's State or memory * allocation with each lookup). This means you become responsible for * allocating memory with size StateSize() and passing it to the Score or * FullScore functions provided here. * * For example, cdec has a std::string containing the entire state of a * hypothesis. It can reserve StateSize bytes in this string for the model * state. * * All the State objects are POD, so it's ok to use raw memory for storing * State. * in_state and out_state must not have the same address. */ class Model { public: virtual ~Model(); size_t StateSize() const { return state_size_; } const void *BeginSentenceMemory() const { return begin_sentence_memory_; } const void *NullContextMemory() const { return null_context_memory_; } // Requires in_state != out_state virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0; // Requires in_state != out_state virtual FullScoreReturn FullScore(const void *in_state, const WordIndex new_word, void *out_state) const = 0; unsigned char Order() const { return order_; } const Vocabulary &BaseVocabulary() const { return *base_vocab_; } private: template friend class ModelFacade; explicit Model(size_t state_size) : state_size_(state_size) {} const size_t state_size_; const void *begin_sentence_memory_, *null_context_memory_; const Vocabulary *base_vocab_; unsigned char order_; // Disable copy constructors. They're private and undefined. // Ersatz boost::noncopyable. 
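  // (Illustrative usage of the virtual interface above, not part of the original source;
  // "model" stands for any loaded lm::base::Model.) Because each concrete Model has its
  // own State type, callers own StateSize() bytes of opaque state per hypothesis:
  //
  //   std::vector<char> in(model.StateSize()), out(model.StateSize());
  //   memcpy(&in[0], model.BeginSentenceMemory(), model.StateSize());
  //   lm::WordIndex w = model.BaseVocabulary().Index("example");
  //   float score = model.Score(&in[0], w, &out[0]);   // in_state and out_state must differ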
Model(const Model &); Model &operator=(const Model &); }; } // mamespace base } // namespace lm #endif // LM_VIRTUAL_INTERFACE__ ================================================ FILE: src/kenlm/lm/weights.hh ================================================ #ifndef LM_WEIGHTS__ #define LM_WEIGHTS__ // Weights for n-grams. Probability and possibly a backoff. namespace lm { struct Prob { float prob; }; // No inheritance so this will be a POD. struct ProbBackoff { float prob; float backoff; }; struct RestWeights { float prob; float backoff; float rest; }; } // namespace lm #endif // LM_WEIGHTS__ ================================================ FILE: src/kenlm/lm/word_index.hh ================================================ // Separate header because this is used often. #ifndef LM_WORD_INDEX__ #define LM_WORD_INDEX__ #include namespace lm { typedef unsigned int WordIndex; const WordIndex kMaxWordIndex = UINT_MAX; } // namespace lm typedef lm::WordIndex LMWordIndex; #endif ================================================ FILE: src/kenlm/util/Jamfile ================================================ local compressed_flags = .. HAVE_ZLIB ; local compressed_deps = /top//z ; if [ test_library "bz2" ] && [ test_header "bzlib.h" ] { external-lib bz2 ; compressed_flags += HAVE_BZLIB ; compressed_deps += bz2 ; } if [ test_library "lzma" ] && [ test_header "lzma.h" ] { external-lib lzma ; compressed_flags += HAVE_XZLIB ; compressed_deps += lzma ; } obj read_compressed.o : read_compressed.cc : $(compressed_flags) ; alias read_compressed : read_compressed.o $(compressed_deps) ; obj read_compressed_test.o : read_compressed_test.cc /top//boost_unit_test_framework : $(compressed_flags) ; obj file_piece_test.o : file_piece_test.cc /top//boost_unit_test_framework : $(compressed_flags) ; fakelib kenutil : bit_packing.cc ersatz_progress.cc exception.cc file.cc file_piece.cc mmap.cc murmur_hash.cc pool.cc read_compressed scoped.cc string_piece.cc usage.cc double-conversion//double-conversion : .. : : .. ; import testing ; unit-test bit_packing_test : bit_packing_test.cc kenutil /top//boost_unit_test_framework ; run file_piece_test.o kenutil /top//boost_unit_test_framework : : file_piece.cc ; unit-test read_compressed_test : read_compressed_test.o kenutil /top//boost_unit_test_framework ; unit-test joint_sort_test : joint_sort_test.cc kenutil /top//boost_unit_test_framework ; unit-test probing_hash_table_test : probing_hash_table_test.cc kenutil /top//boost_unit_test_framework ; unit-test sorted_uniform_test : sorted_uniform_test.cc kenutil /top//boost_unit_test_framework ; unit-test tokenize_piece_test : tokenize_piece_test.cc kenutil /top//boost_unit_test_framework ; unit-test multi_intersection_test : multi_intersection_test.cc kenutil /top//boost_unit_test_framework ; ================================================ FILE: src/kenlm/util/bit_packing.cc ================================================ #include "util/bit_packing.hh" #include "util/exception.hh" #include namespace util { namespace { template struct StaticCheck {}; template <> struct StaticCheck { typedef bool StaticAssertionPassed; }; // If your float isn't 4 bytes, we're hosed. 
typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize; } // namespace uint8_t RequiredBits(uint64_t max_value) { if (!max_value) return 0; uint8_t ret = 1; while (max_value >>= 1) ++ret; return ret; } void BitPackingSanity() { const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 }; if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000"); char mem[57+8]; memset(mem, 0, sizeof(mem)); const uint64_t test57 = 0x123456789abcdefULL; for (uint64_t b = 0; b < 57 * 8; b += 57) { WriteInt57(mem, b, 57, test57); } for (uint64_t b = 0; b < 57 * 8; b += 57) { if (test57 != ReadInt57(mem, b, 57, (1ULL << 57) - 1)) UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler."); } // TODO: more checks. } } // namespace util ================================================ FILE: src/kenlm/util/bit_packing.hh ================================================ #ifndef UTIL_BIT_PACKING__ #define UTIL_BIT_PACKING__ /* Bit-level packing routines * * WARNING WARNING WARNING: * The write functions assume that memory is zero initially. This makes them * faster and is the appropriate case for mmapped language model construction. * These routines assume that unaligned access to uint64_t is fast. This is * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on * x86 but my target audience is large language models for which 64-bit is * necessary. * * Call the BitPackingSanity function to sanity check. Calling once suffices, * but it may be called multiple times when that's inconvenient. * * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at * NICT. */ #include <assert.h> #ifdef __APPLE__ #include <architecture/byte_order.h> #elif __linux__ #include <endian.h> #elif !defined(_WIN32) && !defined(_WIN64) #include <arpa/nameser_compat.h> #endif #include <inttypes.h> #include <string.h> namespace util { // Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. #if BYTE_ORDER == LITTLE_ENDIAN inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) { return bit; } #elif BYTE_ORDER == BIG_ENDIAN inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { return 64 - length - bit; } #else #error "Bit packing code isn't written for your byte order." #endif inline uint64_t ReadOff(const void *base, uint64_t bit_off) { #if defined(__arm) || defined(__arm__) const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3); uint64_t value64; memcpy(&value64, base_off, sizeof(value64)); return value64; #else return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)); #endif } /* Pack integers up to 57 bits using their least significant digits. * The length is specified using mask: * Assumes mask == (1 << length) - 1 where length <= 57. */ inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) { return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask; } /* Assumes value < (1 << length) and length <= 57. * Assumes the memory is zero initially. */ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { #if defined(__arm) || defined(__arm__) uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3); uint64_t value64; memcpy(&value64, base_off, sizeof(value64)); value64 |= (value << BitPackShift(bit_off & 7, length)); memcpy(base_off, &value64, sizeof(value64)); #else *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); #endif } /* Same caveats as above, but for a 25 bit limit.
*/ inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { #if defined(__arm) || defined(__arm__) const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); uint32_t value32; memcpy(&value32, base_off, sizeof(value32)); return (value32 >> BitPackShift(bit_off & 7, length)) & mask; #else return (*reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask; #endif } inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { #if defined(__arm) || defined(__arm__) uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); uint32_t value32; memcpy(&value32, base_off, sizeof(value32)); value32 |= (value << BitPackShift(bit_off & 7, length)); memcpy(base_off, &value32, sizeof(value32)); #else *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); #endif } typedef union { float f; uint32_t i; } FloatEnc; inline float ReadFloat32(const void *base, uint64_t bit_off) { FloatEnc encoded; encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32); return encoded.f; } inline void WriteFloat32(void *base, uint64_t bit_off, float value) { FloatEnc encoded; encoded.f = value; WriteInt57(base, bit_off, 32, encoded.i); } const uint32_t kSignBit = 0x80000000; inline void SetSign(float &to) { FloatEnc enc; enc.f = to; enc.i |= kSignBit; to = enc.f; } inline void UnsetSign(float &to) { FloatEnc enc; enc.f = to; enc.i &= ~kSignBit; to = enc.f; } inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) { FloatEnc encoded; encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31); // Sign bit set means negative. encoded.i |= kSignBit; return encoded.f; } inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) { FloatEnc encoded; encoded.f = value; encoded.i &= ~kSignBit; WriteInt57(base, bit_off, 31, encoded.i); } void BitPackingSanity(); // Return bits required to store integers upto max_value. Not the most // efficient implementation, but this is only called a few times to size tries. 
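As a usage sketch of these routines (not from the repository): pack a few 20-bit integers back to back into a zero-initialized buffer with WriteInt57 and read them back with ReadInt57, passing the (1 << length) - 1 mask the header requires. The buffer must start zeroed because the write functions only OR bits in.

#include <cassert>
#include <cstring>
#include "util/bit_packing.hh"

void PackingDemo() {
  const uint8_t width = 20;                        // bits per value, <= 57
  const uint64_t mask = (1ULL << width) - 1;       // mask == (1 << length) - 1
  char buf[16];
  std::memset(buf, 0, sizeof(buf));                // required: memory starts zeroed
  const uint64_t values[3] = {0x12345, 0xABCDE, 0x7FFFF};
  for (uint64_t i = 0; i < 3; ++i)
    util::WriteInt57(buf, i * width, width, values[i]);
  for (uint64_t i = 0; i < 3; ++i)
    assert(util::ReadInt57(buf, i * width, width, mask) == values[i]);
}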
uint8_t RequiredBits(uint64_t max_value); struct BitsMask { static BitsMask ByMax(uint64_t max_value) { BitsMask ret; ret.FromMax(max_value); return ret; } static BitsMask ByBits(uint8_t bits) { BitsMask ret; ret.bits = bits; ret.mask = (1ULL << bits) - 1; return ret; } void FromMax(uint64_t max_value) { bits = RequiredBits(max_value); mask = (1ULL << bits) - 1; } uint8_t bits; uint64_t mask; }; struct BitAddress { BitAddress(void *in_base, uint64_t in_offset) : base(in_base), offset(in_offset) {} void *base; uint64_t offset; }; } // namespace util #endif // UTIL_BIT_PACKING__ ================================================ FILE: src/kenlm/util/bit_packing_test.cc ================================================ #include "util/bit_packing.hh" #define BOOST_TEST_MODULE BitPackingTest #include #include namespace util { namespace { const uint64_t test57 = 0x123456789abcdefULL; const uint32_t test25 = 0x1234567; BOOST_AUTO_TEST_CASE(ZeroBit57) { char mem[16]; memset(mem, 0, sizeof(mem)); WriteInt57(mem, 0, 57, test57); BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1)); } BOOST_AUTO_TEST_CASE(EachBit57) { char mem[16]; for (uint8_t b = 0; b < 8; ++b) { memset(mem, 0, sizeof(mem)); WriteInt57(mem, b, 57, test57); BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); } } BOOST_AUTO_TEST_CASE(Consecutive57) { char mem[57+8]; memset(mem, 0, sizeof(mem)); for (uint64_t b = 0; b < 57 * 8; b += 57) { WriteInt57(mem, b, 57, test57); BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); } for (uint64_t b = 0; b < 57 * 8; b += 57) { BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1)); } } BOOST_AUTO_TEST_CASE(Consecutive25) { char mem[25+8]; memset(mem, 0, sizeof(mem)); for (uint64_t b = 0; b < 25 * 8; b += 25) { WriteInt25(mem, b, 25, test25); BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); } for (uint64_t b = 0; b < 25 * 8; b += 25) { BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1)); } } BOOST_AUTO_TEST_CASE(Sanity) { BitPackingSanity(); } } // namespace } // namespace util ================================================ FILE: src/kenlm/util/double-conversion/Jamfile ================================================ fakelib double-conversion : [ glob *.cc ] : : : . ; ================================================ FILE: src/kenlm/util/double-conversion/LICENSE ================================================ Copyright 2006-2011, the V8 project authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: src/kenlm/util/double-conversion/bignum-dtoa.cc ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "bignum-dtoa.h" #include "bignum.h" #include "ieee.h" namespace double_conversion { static int NormalizedExponent(uint64_t significand, int exponent) { ASSERT(significand != 0); while ((significand & Double::kHiddenBit) == 0) { significand = significand << 1; exponent = exponent - 1; } return exponent; } // Forward declarations: // Returns an estimation of k such that 10^(k-1) <= v < 10^k. static int EstimatePower(int exponent); // Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator // and denominator. static void InitialScaledStartValues(uint64_t significand, int exponent, bool lower_boundary_is_closer, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus); // Multiplies numerator/denominator so that its values lies in the range 1-10. // Returns decimal_point s.t. // v = numerator'/denominator' * 10^(decimal_point-1) // where numerator' and denominator' are the values of numerator and // denominator after the call to this function. 
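To make the quantities in these declarations concrete, a small worked example (not from the source): for v = 3.0 the double decomposes as significand f = 1.5 * 2^52 and exponent e = -51, and the hidden bit is already set, so the normalized exponent stays -51. EstimatePower then returns ceil((e + 53 - 1) * log10(2) - 1e-10) = ceil(0.30103) = 1, and indeed 10^0 <= 3.0 < 10^1. InitialScaledStartValues produces numerator/denominator = v / 10^1 = 0.3; since that ratio is below 1, FixupMultiply10 multiplies the numerator (and the boundary deltas) by 10 and sets decimal_point = 1, after which GenerateShortestDigits emits the single digit '3' and the caller reconstructs 3 * 10^(decimal_point - length) = 3.0.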
static void FixupMultiply10(int estimated_power, bool is_even, int* decimal_point, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus); // Generates digits from the left to the right and stops when the generated // digits yield the shortest decimal representation of v. static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus, bool is_even, Vector buffer, int* length); // Generates 'requested_digits' after the decimal point. static void BignumToFixed(int requested_digits, int* decimal_point, Bignum* numerator, Bignum* denominator, Vector(buffer), int* length); // Generates 'count' digits of numerator/denominator. // Once 'count' digits have been produced rounds the result depending on the // remainder (remainders of exactly .5 round upwards). Might update the // decimal_point when rounding up (for example for 0.9999). static void GenerateCountedDigits(int count, int* decimal_point, Bignum* numerator, Bignum* denominator, Vector(buffer), int* length); void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits, Vector buffer, int* length, int* decimal_point) { ASSERT(v > 0); ASSERT(!Double(v).IsSpecial()); uint64_t significand; int exponent; bool lower_boundary_is_closer; if (mode == BIGNUM_DTOA_SHORTEST_SINGLE) { float f = static_cast(v); ASSERT(f == v); significand = Single(f).Significand(); exponent = Single(f).Exponent(); lower_boundary_is_closer = Single(f).LowerBoundaryIsCloser(); } else { significand = Double(v).Significand(); exponent = Double(v).Exponent(); lower_boundary_is_closer = Double(v).LowerBoundaryIsCloser(); } bool need_boundary_deltas = (mode == BIGNUM_DTOA_SHORTEST || mode == BIGNUM_DTOA_SHORTEST_SINGLE); bool is_even = (significand & 1) == 0; int normalized_exponent = NormalizedExponent(significand, exponent); // estimated_power might be too low by 1. int estimated_power = EstimatePower(normalized_exponent); // Shortcut for Fixed. // The requested digits correspond to the digits after the point. If the // number is much too small, then there is no need in trying to get any // digits. if (mode == BIGNUM_DTOA_FIXED && -estimated_power - 1 > requested_digits) { buffer[0] = '\0'; *length = 0; // Set decimal-point to -requested_digits. This is what Gay does. // Note that it should not have any effect anyways since the string is // empty. *decimal_point = -requested_digits; return; } Bignum numerator; Bignum denominator; Bignum delta_minus; Bignum delta_plus; // Make sure the bignum can grow large enough. The smallest double equals // 4e-324. In this case the denominator needs fewer than 324*4 binary digits. // The maximum double is 1.7976931348623157e308 which needs fewer than // 308*4 binary digits. ASSERT(Bignum::kMaxSignificantBits >= 324*4); InitialScaledStartValues(significand, exponent, lower_boundary_is_closer, estimated_power, need_boundary_deltas, &numerator, &denominator, &delta_minus, &delta_plus); // We now have v = (numerator / denominator) * 10^estimated_power. 
FixupMultiply10(estimated_power, is_even, decimal_point, &numerator, &denominator, &delta_minus, &delta_plus); // We now have v = (numerator / denominator) * 10^(decimal_point-1), and // 1 <= (numerator + delta_plus) / denominator < 10 switch (mode) { case BIGNUM_DTOA_SHORTEST: case BIGNUM_DTOA_SHORTEST_SINGLE: GenerateShortestDigits(&numerator, &denominator, &delta_minus, &delta_plus, is_even, buffer, length); break; case BIGNUM_DTOA_FIXED: BignumToFixed(requested_digits, decimal_point, &numerator, &denominator, buffer, length); break; case BIGNUM_DTOA_PRECISION: GenerateCountedDigits(requested_digits, decimal_point, &numerator, &denominator, buffer, length); break; default: UNREACHABLE(); } buffer[*length] = '\0'; } // The procedure starts generating digits from the left to the right and stops // when the generated digits yield the shortest decimal representation of v. A // decimal representation of v is a number lying closer to v than to any other // double, so it converts to v when read. // // This is true if d, the decimal representation, is between m- and m+, the // upper and lower boundaries. d must be strictly between them if !is_even. // m- := (numerator - delta_minus) / denominator // m+ := (numerator + delta_plus) / denominator // // Precondition: 0 <= (numerator+delta_plus) / denominator < 10. // If 1 <= (numerator+delta_plus) / denominator < 10 then no leading 0 digit // will be produced. This should be the standard precondition. static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus, bool is_even, Vector buffer, int* length) { // Small optimization: if delta_minus and delta_plus are the same just reuse // one of the two bignums. if (Bignum::Equal(*delta_minus, *delta_plus)) { delta_plus = delta_minus; } *length = 0; while (true) { uint16_t digit; digit = numerator->DivideModuloIntBignum(*denominator); ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. // digit = numerator / denominator (integer division). // numerator = numerator % denominator. buffer[(*length)++] = digit + '0'; // Can we stop already? // If the remainder of the division is less than the distance to the lower // boundary we can stop. In this case we simply round down (discarding the // remainder). // Similarly we test if we can round up (using the upper boundary). bool in_delta_room_minus; bool in_delta_room_plus; if (is_even) { in_delta_room_minus = Bignum::LessEqual(*numerator, *delta_minus); } else { in_delta_room_minus = Bignum::Less(*numerator, *delta_minus); } if (is_even) { in_delta_room_plus = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; } else { in_delta_room_plus = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; } if (!in_delta_room_minus && !in_delta_room_plus) { // Prepare for next iteration. numerator->Times10(); delta_minus->Times10(); // We optimized delta_plus to be equal to delta_minus (if they share the // same value). So don't multiply delta_plus if they point to the same // object. if (delta_minus != delta_plus) { delta_plus->Times10(); } } else if (in_delta_room_minus && in_delta_room_plus) { // Let's see if 2*numerator < denominator. // If yes, then the next digit would be < 5 and we can round down. int compare = Bignum::PlusCompare(*numerator, *numerator, *denominator); if (compare < 0) { // Remaining digits are less than .5. -> Round down (== do nothing). } else if (compare > 0) { // Remaining digits are more than .5 of denominator. -> Round up. 
// Note that the last digit could not be a '9' as otherwise the whole // loop would have stopped earlier. // We still have an assert here in case the preconditions were not // satisfied. ASSERT(buffer[(*length) - 1] != '9'); buffer[(*length) - 1]++; } else { // Halfway case. // TODO(floitsch): need a way to solve half-way cases. // For now let's round towards even (since this is what Gay seems to // do). if ((buffer[(*length) - 1] - '0') % 2 == 0) { // Round down => Do nothing. } else { ASSERT(buffer[(*length) - 1] != '9'); buffer[(*length) - 1]++; } } return; } else if (in_delta_room_minus) { // Round down (== do nothing). return; } else { // in_delta_room_plus // Round up. // Note again that the last digit could not be '9' since this would have // stopped the loop earlier. // We still have an ASSERT here, in case the preconditions were not // satisfied. ASSERT(buffer[(*length) - 1] != '9'); buffer[(*length) - 1]++; return; } } } // Let v = numerator / denominator < 10. // Then we generate 'count' digits of d = x.xxxxx... (without the decimal point) // from left to right. Once 'count' digits have been produced we decide whether // to round up or down. Remainders of exactly .5 round upwards. Numbers such // as 9.999999 propagate a carry all the way, and change the // exponent (decimal_point), when rounding upwards. static void GenerateCountedDigits(int count, int* decimal_point, Bignum* numerator, Bignum* denominator, Vector<char> buffer, int* length) { ASSERT(count >= 0); for (int i = 0; i < count - 1; ++i) { uint16_t digit; digit = numerator->DivideModuloIntBignum(*denominator); ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive. // digit = numerator / denominator (integer division). // numerator = numerator % denominator. buffer[i] = digit + '0'; // Prepare for next iteration. numerator->Times10(); } // Generate the last digit. uint16_t digit; digit = numerator->DivideModuloIntBignum(*denominator); if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { digit++; } buffer[count - 1] = digit + '0'; // Correct bad digits (in case we had a sequence of '9's). Propagate the // carry until we hit a non-'9' or until we reach the first digit. for (int i = count - 1; i > 0; --i) { if (buffer[i] != '0' + 10) break; buffer[i] = '0'; buffer[i - 1]++; } if (buffer[0] == '0' + 10) { // Propagate a carry past the top place. buffer[0] = '1'; (*decimal_point)++; } *length = count; } // Generates 'requested_digits' after the decimal point. It might omit // trailing '0's. If the input number is too small then no digits at all are // generated (ex.: 2 fixed digits for 0.00001). // // Input verifies: 1 <= (numerator + delta) / denominator < 10. static void BignumToFixed(int requested_digits, int* decimal_point, Bignum* numerator, Bignum* denominator, Vector<char> buffer, int* length) { // Note that we have to look at more than just the requested_digits, since // a number could be rounded up. Example: v=0.5 with requested_digits=0. // Even though the power of v equals 0 we can't just stop here. if (-(*decimal_point) > requested_digits) { // The number is definitively too small. // Ex: 0.001 with requested_digits == 1. // Set decimal-point to -requested_digits. This is what Gay does. // Note that it should not have any effect anyways since the string is // empty. *decimal_point = -requested_digits; *length = 0; return; } else if (-(*decimal_point) == requested_digits) { // We only need to verify if the number rounds down or up. // Ex: 0.04 and 0.06 with requested_digits == 1.
ASSERT(*decimal_point == -requested_digits); // Initially the fraction lies in range (1, 10]. Multiply the denominator // by 10 so that we can compare more easily. denominator->Times10(); if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) { // If the fraction is >= 0.5 then we have to include the rounded // digit. buffer[0] = '1'; *length = 1; (*decimal_point)++; } else { // Note that we caught most of similar cases earlier. *length = 0; } return; } else { // The requested digits correspond to the digits after the point. // The variable 'needed_digits' includes the digits before the point. int needed_digits = (*decimal_point) + requested_digits; GenerateCountedDigits(needed_digits, decimal_point, numerator, denominator, buffer, length); } } // Returns an estimation of k such that 10^(k-1) <= v < 10^k where // v = f * 2^exponent and 2^52 <= f < 2^53. // v is hence a normalized double with the given exponent. The output is an // approximation for the exponent of the decimal approimation .digits * 10^k. // // The result might undershoot by 1 in which case 10^k <= v < 10^k+1. // Note: this property holds for v's upper boundary m+ too. // 10^k <= m+ < 10^k+1. // (see explanation below). // // Examples: // EstimatePower(0) => 16 // EstimatePower(-52) => 0 // // Note: e >= 0 => EstimatedPower(e) > 0. No similar claim can be made for e<0. static int EstimatePower(int exponent) { // This function estimates log10 of v where v = f*2^e (with e == exponent). // Note that 10^floor(log10(v)) <= v, but v <= 10^ceil(log10(v)). // Note that f is bounded by its container size. Let p = 53 (the double's // significand size). Then 2^(p-1) <= f < 2^p. // // Given that log10(v) == log2(v)/log2(10) and e+(len(f)-1) is quite close // to log2(v) the function is simplified to (e+(len(f)-1)/log2(10)). // The computed number undershoots by less than 0.631 (when we compute log3 // and not log10). // // Optimization: since we only need an approximated result this computation // can be performed on 64 bit integers. On x86/x64 architecture the speedup is // not really measurable, though. // // Since we want to avoid overshooting we decrement by 1e10 so that // floating-point imprecisions don't affect us. // // Explanation for v's boundary m+: the computation takes advantage of // the fact that 2^(p-1) <= f < 2^p. Boundaries still satisfy this requirement // (even for denormals where the delta can be much more important). const double k1Log10 = 0.30102999566398114; // 1/lg(10) // For doubles len(f) == 53 (don't forget the hidden bit). const int kSignificandSize = Double::kSignificandSize; double estimate = ceil((exponent + kSignificandSize - 1) * k1Log10 - 1e-10); return static_cast(estimate); } // See comments for InitialScaledStartValues. static void InitialScaledStartValuesPositiveExponent( uint64_t significand, int exponent, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { // A positive exponent implies a positive power. ASSERT(estimated_power >= 0); // Since the estimated_power is positive we simply multiply the denominator // by 10^estimated_power. // numerator = v. numerator->AssignUInt64(significand); numerator->ShiftLeft(exponent); // denominator = 10^estimated_power. denominator->AssignPowerUInt16(10, estimated_power); if (need_boundary_deltas) { // Introduce a common denominator so that the deltas to the boundaries are // integers. 
denominator->ShiftLeft(1); numerator->ShiftLeft(1); // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common // denominator (of 2) delta_plus equals 2^e. delta_plus->AssignUInt16(1); delta_plus->ShiftLeft(exponent); // Same for delta_minus. The adjustments if f == 2^p-1 are done later. delta_minus->AssignUInt16(1); delta_minus->ShiftLeft(exponent); } } // See comments for InitialScaledStartValues static void InitialScaledStartValuesNegativeExponentPositivePower( uint64_t significand, int exponent, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { // v = f * 2^e with e < 0, and with estimated_power >= 0. // This means that e is close to 0 (have a look at how estimated_power is // computed). // numerator = significand // since v = significand * 2^exponent this is equivalent to // numerator = v * / 2^-exponent numerator->AssignUInt64(significand); // denominator = 10^estimated_power * 2^-exponent (with exponent < 0) denominator->AssignPowerUInt16(10, estimated_power); denominator->ShiftLeft(-exponent); if (need_boundary_deltas) { // Introduce a common denominator so that the deltas to the boundaries are // integers. denominator->ShiftLeft(1); numerator->ShiftLeft(1); // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common // denominator (of 2) delta_plus equals 2^e. // Given that the denominator already includes v's exponent the distance // to the boundaries is simply 1. delta_plus->AssignUInt16(1); // Same for delta_minus. The adjustments if f == 2^p-1 are done later. delta_minus->AssignUInt16(1); } } // See comments for InitialScaledStartValues static void InitialScaledStartValuesNegativeExponentNegativePower( uint64_t significand, int exponent, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { // Instead of multiplying the denominator with 10^estimated_power we // multiply all values (numerator and deltas) by 10^-estimated_power. // Use numerator as temporary container for power_ten. Bignum* power_ten = numerator; power_ten->AssignPowerUInt16(10, -estimated_power); if (need_boundary_deltas) { // Since power_ten == numerator we must make a copy of 10^estimated_power // before we complete the computation of the numerator. // delta_plus = delta_minus = 10^estimated_power delta_plus->AssignBignum(*power_ten); delta_minus->AssignBignum(*power_ten); } // numerator = significand * 2 * 10^-estimated_power // since v = significand * 2^exponent this is equivalent to // numerator = v * 10^-estimated_power * 2 * 2^-exponent. // Remember: numerator has been abused as power_ten. So no need to assign it // to itself. ASSERT(numerator == power_ten); numerator->MultiplyByUInt64(significand); // denominator = 2 * 2^-exponent with exponent < 0. denominator->AssignUInt16(1); denominator->ShiftLeft(-exponent); if (need_boundary_deltas) { // Introduce a common denominator so that the deltas to the boundaries are // integers. numerator->ShiftLeft(1); denominator->ShiftLeft(1); // With this shift the boundaries have their correct value, since // delta_plus = 10^-estimated_power, and // delta_minus = 10^-estimated_power. // These assignments have been done earlier. // The adjustments if f == 2^p-1 (lower boundary is closer) are done later. } } // Let v = significand * 2^exponent. // Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator // and denominator. 
The functions GenerateShortestDigits and // GenerateCountedDigits will then convert this ratio to its decimal // representation d, with the required accuracy. // Then d * 10^estimated_power is the representation of v. // (Note: the fraction and the estimated_power might get adjusted before // generating the decimal representation.) // // The initial start values consist of: // - a scaled numerator: s.t. numerator/denominator == v / 10^estimated_power. // - a scaled (common) denominator. // optionally (used by GenerateShortestDigits to decide if it has the shortest // decimal converting back to v): // - v - m-: the distance to the lower boundary. // - m+ - v: the distance to the upper boundary. // // v, m+, m-, and therefore v - m- and m+ - v all share the same denominator. // // Let ep == estimated_power, then the returned values will satisfy: // v / 10^ep = numerator / denominator. // v's boundarys m- and m+: // m- / 10^ep == v / 10^ep - delta_minus / denominator // m+ / 10^ep == v / 10^ep + delta_plus / denominator // Or in other words: // m- == v - delta_minus * 10^ep / denominator; // m+ == v + delta_plus * 10^ep / denominator; // // Since 10^(k-1) <= v < 10^k (with k == estimated_power) // or 10^k <= v < 10^(k+1) // we then have 0.1 <= numerator/denominator < 1 // or 1 <= numerator/denominator < 10 // // It is then easy to kickstart the digit-generation routine. // // The boundary-deltas are only filled if the mode equals BIGNUM_DTOA_SHORTEST // or BIGNUM_DTOA_SHORTEST_SINGLE. static void InitialScaledStartValues(uint64_t significand, int exponent, bool lower_boundary_is_closer, int estimated_power, bool need_boundary_deltas, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { if (exponent >= 0) { InitialScaledStartValuesPositiveExponent( significand, exponent, estimated_power, need_boundary_deltas, numerator, denominator, delta_minus, delta_plus); } else if (estimated_power >= 0) { InitialScaledStartValuesNegativeExponentPositivePower( significand, exponent, estimated_power, need_boundary_deltas, numerator, denominator, delta_minus, delta_plus); } else { InitialScaledStartValuesNegativeExponentNegativePower( significand, exponent, estimated_power, need_boundary_deltas, numerator, denominator, delta_minus, delta_plus); } if (need_boundary_deltas && lower_boundary_is_closer) { // The lower boundary is closer at half the distance of "normal" numbers. // Increase the common denominator and adapt all but the delta_minus. denominator->ShiftLeft(1); // *2 numerator->ShiftLeft(1); // *2 delta_plus->ShiftLeft(1); // *2 } } // This routine multiplies numerator/denominator so that its values lies in the // range 1-10. That is after a call to this function we have: // 1 <= (numerator + delta_plus) /denominator < 10. // Let numerator the input before modification and numerator' the argument // after modification, then the output-parameter decimal_point is such that // numerator / denominator * 10^estimated_power == // numerator' / denominator' * 10^(decimal_point - 1) // In some cases estimated_power was too low, and this is already the case. We // then simply adjust the power so that 10^(k-1) <= v < 10^k (with k == // estimated_power) but do not touch the numerator or denominator. // Otherwise the routine multiplies the numerator and the deltas by 10. 
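A concrete illustration of the "estimated_power was too low" case this comment describes (not from the source): for v = 10.0 the significand is 1.25 * 2^52 and the binary exponent is -49, so EstimatePower returns ceil((-49 + 52) * 0.30103 - 1e-10) = 1, while the true decimal exponent k with 10^(k-1) <= v < 10^k is 2. The scaled ratio v / 10^1 = 1.0 already lies in [1, 10), so FixupMultiply10 takes its first branch and simply sets decimal_point = estimated_power + 1 = 2 without touching the numerator. For v = 3.0, by contrast, the ratio 0.3 falls below 1, so the second branch multiplies the numerator and both deltas by 10 and sets decimal_point = estimated_power = 1.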
static void FixupMultiply10(int estimated_power, bool is_even, int* decimal_point, Bignum* numerator, Bignum* denominator, Bignum* delta_minus, Bignum* delta_plus) { bool in_range; if (is_even) { // For IEEE doubles half-way cases (in decimal system numbers ending with 5) // are rounded to the closest floating-point number with even significand. in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0; } else { in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0; } if (in_range) { // Since numerator + delta_plus >= denominator we already have // 1 <= numerator/denominator < 10. Simply update the estimated_power. *decimal_point = estimated_power + 1; } else { *decimal_point = estimated_power; numerator->Times10(); if (Bignum::Equal(*delta_minus, *delta_plus)) { delta_minus->Times10(); delta_plus->AssignBignum(*delta_minus); } else { delta_minus->Times10(); delta_plus->Times10(); } } } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/bignum-dtoa.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_ #define DOUBLE_CONVERSION_BIGNUM_DTOA_H_ #include "utils.h" namespace double_conversion { enum BignumDtoaMode { // Return the shortest correct representation. // For example the output of 0.299999999999999988897 is (the less accurate but // correct) 0.3. BIGNUM_DTOA_SHORTEST, // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats. BIGNUM_DTOA_SHORTEST_SINGLE, // Return a fixed number of digits after the decimal point. // For instance fixed(0.1, 4) becomes 0.1000 // If the input number is big, the output will be big. BIGNUM_DTOA_FIXED, // Return a fixed number of digits, no matter what the exponent is. BIGNUM_DTOA_PRECISION }; // Converts the given double 'v' to ascii. // The result should be interpreted as buffer * 10^(point-length). // The buffer will be null-terminated. 
// // The input v must be > 0 and different from NaN, and Infinity. // // The output depends on the given mode: // - SHORTEST: produce the least amount of digits for which the internal // identity requirement is still satisfied. If the digits are printed // (together with the correct exponent) then reading this number will give // 'v' again. The buffer will choose the representation that is closest to // 'v'. If there are two at the same distance, than the number is round up. // In this mode the 'requested_digits' parameter is ignored. // - FIXED: produces digits necessary to print a given number with // 'requested_digits' digits after the decimal point. The produced digits // might be too short in which case the caller has to fill the gaps with '0's. // Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2. // Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns // buffer="2", point=0. // Note: the length of the returned buffer has no meaning wrt the significance // of its digits. That is, just because it contains '0's does not mean that // any other digit would not satisfy the internal identity requirement. // - PRECISION: produces 'requested_digits' where the first digit is not '0'. // Even though the length of produced digits usually equals // 'requested_digits', the function is allowed to return fewer digits, in // which case the caller has to fill the missing digits with '0's. // Halfway cases are again rounded up. // 'BignumDtoa' expects the given buffer to be big enough to hold all digits // and a terminating null-character. void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits, Vector buffer, int* length, int* point); } // namespace double_conversion #endif // DOUBLE_CONVERSION_BIGNUM_DTOA_H_ ================================================ FILE: src/kenlm/util/double-conversion/bignum.cc ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
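As a usage sketch of the interface declared in bignum-dtoa.h (not from the repository; the Vector<char>(ptr, len) construction and the buffer size are assumptions for illustration), the three modes can be driven like this:

#include <cstdio>
#include "bignum-dtoa.h"

void DtoaDemo() {
  using namespace double_conversion;
  const int kBufferSize = 128;
  char digits[kBufferSize];
  int length, point;
  // SHORTEST: fewest digits that still read back as exactly 0.3.
  BignumDtoa(0.3, BIGNUM_DTOA_SHORTEST, 0, Vector<char>(digits, kBufferSize), &length, &point);
  // digits = "3", length = 1, point = 0  =>  value = 3 * 10^(0 - 1) = 0.3
  // FIXED: four digits after the decimal point of 0.1; the caller pads with '0's as needed.
  BignumDtoa(0.1, BIGNUM_DTOA_FIXED, 4, Vector<char>(digits, kBufferSize), &length, &point);
  // PRECISION: three significant digits of 12345.0.
  BignumDtoa(12345.0, BIGNUM_DTOA_PRECISION, 3, Vector<char>(digits, kBufferSize), &length, &point);
  // digits = "123", length = 3, point = 5  =>  value = 123 * 10^(5 - 3) = 12300
  std::printf("%.*s (point %d)\n", length, digits, point);
}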
#include "bignum.h" #include "utils.h" namespace double_conversion { Bignum::Bignum() : bigits_(bigits_buffer_, kBigitCapacity), used_digits_(0), exponent_(0) { for (int i = 0; i < kBigitCapacity; ++i) { bigits_[i] = 0; } } template static int BitSize(S value) { return 8 * sizeof(value); } // Guaranteed to lie in one Bigit. void Bignum::AssignUInt16(uint16_t value) { ASSERT(kBigitSize >= BitSize(value)); Zero(); if (value == 0) return; EnsureCapacity(1); bigits_[0] = value; used_digits_ = 1; } void Bignum::AssignUInt64(uint64_t value) { const int kUInt64Size = 64; Zero(); if (value == 0) return; int needed_bigits = kUInt64Size / kBigitSize + 1; EnsureCapacity(needed_bigits); for (int i = 0; i < needed_bigits; ++i) { bigits_[i] = value & kBigitMask; value = value >> kBigitSize; } used_digits_ = needed_bigits; Clamp(); } void Bignum::AssignBignum(const Bignum& other) { exponent_ = other.exponent_; for (int i = 0; i < other.used_digits_; ++i) { bigits_[i] = other.bigits_[i]; } // Clear the excess digits (if there were any). for (int i = other.used_digits_; i < used_digits_; ++i) { bigits_[i] = 0; } used_digits_ = other.used_digits_; } static uint64_t ReadUInt64(Vector buffer, int from, int digits_to_read) { uint64_t result = 0; for (int i = from; i < from + digits_to_read; ++i) { int digit = buffer[i] - '0'; ASSERT(0 <= digit && digit <= 9); result = result * 10 + digit; } return result; } void Bignum::AssignDecimalString(Vector value) { // 2^64 = 18446744073709551616 > 10^19 const int kMaxUint64DecimalDigits = 19; Zero(); int length = value.length(); int pos = 0; // Let's just say that each digit needs 4 bits. while (length >= kMaxUint64DecimalDigits) { uint64_t digits = ReadUInt64(value, pos, kMaxUint64DecimalDigits); pos += kMaxUint64DecimalDigits; length -= kMaxUint64DecimalDigits; MultiplyByPowerOfTen(kMaxUint64DecimalDigits); AddUInt64(digits); } uint64_t digits = ReadUInt64(value, pos, length); MultiplyByPowerOfTen(length); AddUInt64(digits); Clamp(); } static int HexCharValue(char c) { if ('0' <= c && c <= '9') return c - '0'; if ('a' <= c && c <= 'f') return 10 + c - 'a'; if ('A' <= c && c <= 'F') return 10 + c - 'A'; UNREACHABLE(); return 0; // To make compiler happy. } void Bignum::AssignHexString(Vector value) { Zero(); int length = value.length(); int needed_bigits = length * 4 / kBigitSize + 1; EnsureCapacity(needed_bigits); int string_index = length - 1; for (int i = 0; i < needed_bigits - 1; ++i) { // These bigits are guaranteed to be "full". Chunk current_bigit = 0; for (int j = 0; j < kBigitSize / 4; j++) { current_bigit += HexCharValue(value[string_index--]) << (j * 4); } bigits_[i] = current_bigit; } used_digits_ = needed_bigits - 1; Chunk most_significant_bigit = 0; // Could be = 0; for (int j = 0; j <= string_index; ++j) { most_significant_bigit <<= 4; most_significant_bigit += HexCharValue(value[j]); } if (most_significant_bigit != 0) { bigits_[used_digits_] = most_significant_bigit; used_digits_++; } Clamp(); } void Bignum::AddUInt64(uint64_t operand) { if (operand == 0) return; Bignum other; other.AssignUInt64(operand); AddBignum(other); } void Bignum::AddBignum(const Bignum& other) { ASSERT(IsClamped()); ASSERT(other.IsClamped()); // If this has a greater exponent than other append zero-bigits to this. // After this call exponent_ <= other.exponent_. 
Align(other); // There are two possibilities: // aaaaaaaaaaa 0000 (where the 0s represent a's exponent) // bbbbb 00000000 // ---------------- // ccccccccccc 0000 // or // aaaaaaaaaa 0000 // bbbbbbbbb 0000000 // ----------------- // cccccccccccc 0000 // In both cases we might need a carry bigit. EnsureCapacity(1 + Max(BigitLength(), other.BigitLength()) - exponent_); Chunk carry = 0; int bigit_pos = other.exponent_ - exponent_; ASSERT(bigit_pos >= 0); for (int i = 0; i < other.used_digits_; ++i) { Chunk sum = bigits_[bigit_pos] + other.bigits_[i] + carry; bigits_[bigit_pos] = sum & kBigitMask; carry = sum >> kBigitSize; bigit_pos++; } while (carry != 0) { Chunk sum = bigits_[bigit_pos] + carry; bigits_[bigit_pos] = sum & kBigitMask; carry = sum >> kBigitSize; bigit_pos++; } used_digits_ = Max(bigit_pos, used_digits_); ASSERT(IsClamped()); } void Bignum::SubtractBignum(const Bignum& other) { ASSERT(IsClamped()); ASSERT(other.IsClamped()); // We require this to be bigger than other. ASSERT(LessEqual(other, *this)); Align(other); int offset = other.exponent_ - exponent_; Chunk borrow = 0; int i; for (i = 0; i < other.used_digits_; ++i) { ASSERT((borrow == 0) || (borrow == 1)); Chunk difference = bigits_[i + offset] - other.bigits_[i] - borrow; bigits_[i + offset] = difference & kBigitMask; borrow = difference >> (kChunkSize - 1); } while (borrow != 0) { Chunk difference = bigits_[i + offset] - borrow; bigits_[i + offset] = difference & kBigitMask; borrow = difference >> (kChunkSize - 1); ++i; } Clamp(); } void Bignum::ShiftLeft(int shift_amount) { if (used_digits_ == 0) return; exponent_ += shift_amount / kBigitSize; int local_shift = shift_amount % kBigitSize; EnsureCapacity(used_digits_ + 1); BigitsShiftLeft(local_shift); } void Bignum::MultiplyByUInt32(uint32_t factor) { if (factor == 1) return; if (factor == 0) { Zero(); return; } if (used_digits_ == 0) return; // The product of a bigit with the factor is of size kBigitSize + 32. // Assert that this number + 1 (for the carry) fits into double chunk. 
ASSERT(kDoubleChunkSize >= kBigitSize + 32 + 1); DoubleChunk carry = 0; for (int i = 0; i < used_digits_; ++i) { DoubleChunk product = static_cast(factor) * bigits_[i] + carry; bigits_[i] = static_cast(product & kBigitMask); carry = (product >> kBigitSize); } while (carry != 0) { EnsureCapacity(used_digits_ + 1); bigits_[used_digits_] = carry & kBigitMask; used_digits_++; carry >>= kBigitSize; } } void Bignum::MultiplyByUInt64(uint64_t factor) { if (factor == 1) return; if (factor == 0) { Zero(); return; } ASSERT(kBigitSize < 32); uint64_t carry = 0; uint64_t low = factor & 0xFFFFFFFF; uint64_t high = factor >> 32; for (int i = 0; i < used_digits_; ++i) { uint64_t product_low = low * bigits_[i]; uint64_t product_high = high * bigits_[i]; uint64_t tmp = (carry & kBigitMask) + product_low; bigits_[i] = tmp & kBigitMask; carry = (carry >> kBigitSize) + (tmp >> kBigitSize) + (product_high << (32 - kBigitSize)); } while (carry != 0) { EnsureCapacity(used_digits_ + 1); bigits_[used_digits_] = carry & kBigitMask; used_digits_++; carry >>= kBigitSize; } } void Bignum::MultiplyByPowerOfTen(int exponent) { const uint64_t kFive27 = UINT64_2PART_C(0x6765c793, fa10079d); const uint16_t kFive1 = 5; const uint16_t kFive2 = kFive1 * 5; const uint16_t kFive3 = kFive2 * 5; const uint16_t kFive4 = kFive3 * 5; const uint16_t kFive5 = kFive4 * 5; const uint16_t kFive6 = kFive5 * 5; const uint32_t kFive7 = kFive6 * 5; const uint32_t kFive8 = kFive7 * 5; const uint32_t kFive9 = kFive8 * 5; const uint32_t kFive10 = kFive9 * 5; const uint32_t kFive11 = kFive10 * 5; const uint32_t kFive12 = kFive11 * 5; const uint32_t kFive13 = kFive12 * 5; const uint32_t kFive1_to_12[] = { kFive1, kFive2, kFive3, kFive4, kFive5, kFive6, kFive7, kFive8, kFive9, kFive10, kFive11, kFive12 }; ASSERT(exponent >= 0); if (exponent == 0) return; if (used_digits_ == 0) return; // We shift by exponent at the end just before returning. int remaining_exponent = exponent; while (remaining_exponent >= 27) { MultiplyByUInt64(kFive27); remaining_exponent -= 27; } while (remaining_exponent >= 13) { MultiplyByUInt32(kFive13); remaining_exponent -= 13; } if (remaining_exponent > 0) { MultiplyByUInt32(kFive1_to_12[remaining_exponent - 1]); } ShiftLeft(exponent); } void Bignum::Square() { ASSERT(IsClamped()); int product_length = 2 * used_digits_; EnsureCapacity(product_length); // Comba multiplication: compute each column separately. // Example: r = a2a1a0 * b2b1b0. // r = 1 * a0b0 + // 10 * (a1b0 + a0b1) + // 100 * (a2b0 + a1b1 + a0b2) + // 1000 * (a2b1 + a1b2) + // 10000 * a2b2 // // In the worst case we have to accumulate nb-digits products of digit*digit. // // Assert that the additional number of bits in a DoubleChunk are enough to // sum up used_digits of Bigit*Bigit. if ((1 << (2 * (kChunkSize - kBigitSize))) <= used_digits_) { UNIMPLEMENTED(); } DoubleChunk accumulator = 0; // First shift the digits so we don't overwrite them. int copy_offset = used_digits_; for (int i = 0; i < used_digits_; ++i) { bigits_[copy_offset + i] = bigits_[i]; } // We have two loops to avoid some 'if's in the loop. for (int i = 0; i < used_digits_; ++i) { // Process temporary digit i with power i. // The sum of the two indices must be equal to i. int bigit_index1 = i; int bigit_index2 = 0; // Sum all of the sub-products. 
while (bigit_index1 >= 0) { Chunk chunk1 = bigits_[copy_offset + bigit_index1]; Chunk chunk2 = bigits_[copy_offset + bigit_index2]; accumulator += static_cast(chunk1) * chunk2; bigit_index1--; bigit_index2++; } bigits_[i] = static_cast(accumulator) & kBigitMask; accumulator >>= kBigitSize; } for (int i = used_digits_; i < product_length; ++i) { int bigit_index1 = used_digits_ - 1; int bigit_index2 = i - bigit_index1; // Invariant: sum of both indices is again equal to i. // Inner loop runs 0 times on last iteration, emptying accumulator. while (bigit_index2 < used_digits_) { Chunk chunk1 = bigits_[copy_offset + bigit_index1]; Chunk chunk2 = bigits_[copy_offset + bigit_index2]; accumulator += static_cast(chunk1) * chunk2; bigit_index1--; bigit_index2++; } // The overwritten bigits_[i] will never be read in further loop iterations, // because bigit_index1 and bigit_index2 are always greater // than i - used_digits_. bigits_[i] = static_cast(accumulator) & kBigitMask; accumulator >>= kBigitSize; } // Since the result was guaranteed to lie inside the number the // accumulator must be 0 now. ASSERT(accumulator == 0); // Don't forget to update the used_digits and the exponent. used_digits_ = product_length; exponent_ *= 2; Clamp(); } void Bignum::AssignPowerUInt16(uint16_t base, int power_exponent) { ASSERT(base != 0); ASSERT(power_exponent >= 0); if (power_exponent == 0) { AssignUInt16(1); return; } Zero(); int shifts = 0; // We expect base to be in range 2-32, and most often to be 10. // It does not make much sense to implement different algorithms for counting // the bits. while ((base & 1) == 0) { base >>= 1; shifts++; } int bit_size = 0; int tmp_base = base; while (tmp_base != 0) { tmp_base >>= 1; bit_size++; } int final_size = bit_size * power_exponent; // 1 extra bigit for the shifting, and one for rounded final_size. EnsureCapacity(final_size / kBigitSize + 2); // Left to Right exponentiation. int mask = 1; while (power_exponent >= mask) mask <<= 1; // The mask is now pointing to the bit above the most significant 1-bit of // power_exponent. // Get rid of first 1-bit; mask >>= 2; uint64_t this_value = base; bool delayed_multipliciation = false; const uint64_t max_32bits = 0xFFFFFFFF; while (mask != 0 && this_value <= max_32bits) { this_value = this_value * this_value; // Verify that there is enough space in this_value to perform the // multiplication. The first bit_size bits must be 0. if ((power_exponent & mask) != 0) { uint64_t base_bits_mask = ~((static_cast(1) << (64 - bit_size)) - 1); bool high_bits_zero = (this_value & base_bits_mask) == 0; if (high_bits_zero) { this_value *= base; } else { delayed_multipliciation = true; } } mask >>= 1; } AssignUInt64(this_value); if (delayed_multipliciation) { MultiplyByUInt32(base); } // Now do the same thing as a bignum. while (mask != 0) { Square(); if ((power_exponent & mask) != 0) { MultiplyByUInt32(base); } mask >>= 1; } // And finally add the saved shifts. ShiftLeft(shifts * power_exponent); } // Precondition: this/other < 16bit. uint16_t Bignum::DivideModuloIntBignum(const Bignum& other) { ASSERT(IsClamped()); ASSERT(other.IsClamped()); ASSERT(other.used_digits_ > 0); // Easy case: if we have less digits than the divisor than the result is 0. // Note: this handles the case where this == 0, too. if (BigitLength() < other.BigitLength()) { return 0; } Align(other); uint16_t result = 0; // Start by removing multiples of 'other' until both numbers have the same // number of digits. 
while (BigitLength() > other.BigitLength()) { // This naive approach is extremely inefficient if the this divided other // might be big. This function is implemented for doubleToString where // the result should be small (less than 10). ASSERT(other.bigits_[other.used_digits_ - 1] >= ((1 << kBigitSize) / 16)); // Remove the multiples of the first digit. // Example this = 23 and other equals 9. -> Remove 2 multiples. result += bigits_[used_digits_ - 1]; SubtractTimes(other, bigits_[used_digits_ - 1]); } ASSERT(BigitLength() == other.BigitLength()); // Both bignums are at the same length now. // Since other has more than 0 digits we know that the access to // bigits_[used_digits_ - 1] is safe. Chunk this_bigit = bigits_[used_digits_ - 1]; Chunk other_bigit = other.bigits_[other.used_digits_ - 1]; if (other.used_digits_ == 1) { // Shortcut for easy (and common) case. int quotient = this_bigit / other_bigit; bigits_[used_digits_ - 1] = this_bigit - other_bigit * quotient; result += quotient; Clamp(); return result; } int division_estimate = this_bigit / (other_bigit + 1); result += division_estimate; SubtractTimes(other, division_estimate); if (other_bigit * (division_estimate + 1) > this_bigit) { // No need to even try to subtract. Even if other's remaining digits were 0 // another subtraction would be too much. return result; } while (LessEqual(other, *this)) { SubtractBignum(other); result++; } return result; } template static int SizeInHexChars(S number) { ASSERT(number > 0); int result = 0; while (number != 0) { number >>= 4; result++; } return result; } static char HexCharOfValue(int value) { ASSERT(0 <= value && value <= 16); if (value < 10) return value + '0'; return value - 10 + 'A'; } bool Bignum::ToHexString(char* buffer, int buffer_size) const { ASSERT(IsClamped()); // Each bigit must be printable as separate hex-character. ASSERT(kBigitSize % 4 == 0); const int kHexCharsPerBigit = kBigitSize / 4; if (used_digits_ == 0) { if (buffer_size < 2) return false; buffer[0] = '0'; buffer[1] = '\0'; return true; } // We add 1 for the terminating '\0' character. int needed_chars = (BigitLength() - 1) * kHexCharsPerBigit + SizeInHexChars(bigits_[used_digits_ - 1]) + 1; if (needed_chars > buffer_size) return false; int string_index = needed_chars - 1; buffer[string_index--] = '\0'; for (int i = 0; i < exponent_; ++i) { for (int j = 0; j < kHexCharsPerBigit; ++j) { buffer[string_index--] = '0'; } } for (int i = 0; i < used_digits_ - 1; ++i) { Chunk current_bigit = bigits_[i]; for (int j = 0; j < kHexCharsPerBigit; ++j) { buffer[string_index--] = HexCharOfValue(current_bigit & 0xF); current_bigit >>= 4; } } // And finally the last bigit. 
Chunk most_significant_bigit = bigits_[used_digits_ - 1]; while (most_significant_bigit != 0) { buffer[string_index--] = HexCharOfValue(most_significant_bigit & 0xF); most_significant_bigit >>= 4; } return true; } Bignum::Chunk Bignum::BigitAt(int index) const { if (index >= BigitLength()) return 0; if (index < exponent_) return 0; return bigits_[index - exponent_]; } int Bignum::Compare(const Bignum& a, const Bignum& b) { ASSERT(a.IsClamped()); ASSERT(b.IsClamped()); int bigit_length_a = a.BigitLength(); int bigit_length_b = b.BigitLength(); if (bigit_length_a < bigit_length_b) return -1; if (bigit_length_a > bigit_length_b) return +1; for (int i = bigit_length_a - 1; i >= Min(a.exponent_, b.exponent_); --i) { Chunk bigit_a = a.BigitAt(i); Chunk bigit_b = b.BigitAt(i); if (bigit_a < bigit_b) return -1; if (bigit_a > bigit_b) return +1; // Otherwise they are equal up to this digit. Try the next digit. } return 0; } int Bignum::PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c) { ASSERT(a.IsClamped()); ASSERT(b.IsClamped()); ASSERT(c.IsClamped()); if (a.BigitLength() < b.BigitLength()) { return PlusCompare(b, a, c); } if (a.BigitLength() + 1 < c.BigitLength()) return -1; if (a.BigitLength() > c.BigitLength()) return +1; // The exponent encodes 0-bigits. So if there are more 0-digits in 'a' than // 'b' has digits, then the bigit-length of 'a'+'b' must be equal to the one // of 'a'. if (a.exponent_ >= b.BigitLength() && a.BigitLength() < c.BigitLength()) { return -1; } Chunk borrow = 0; // Starting at min_exponent all digits are == 0. So no need to compare them. int min_exponent = Min(Min(a.exponent_, b.exponent_), c.exponent_); for (int i = c.BigitLength() - 1; i >= min_exponent; --i) { Chunk chunk_a = a.BigitAt(i); Chunk chunk_b = b.BigitAt(i); Chunk chunk_c = c.BigitAt(i); Chunk sum = chunk_a + chunk_b; if (sum > chunk_c + borrow) { return +1; } else { borrow = chunk_c + borrow - sum; if (borrow > 1) return -1; borrow <<= kBigitSize; } } if (borrow == 0) return 0; return -1; } void Bignum::Clamp() { while (used_digits_ > 0 && bigits_[used_digits_ - 1] == 0) { used_digits_--; } if (used_digits_ == 0) { // Zero. exponent_ = 0; } } bool Bignum::IsClamped() const { return used_digits_ == 0 || bigits_[used_digits_ - 1] != 0; } void Bignum::Zero() { for (int i = 0; i < used_digits_; ++i) { bigits_[i] = 0; } used_digits_ = 0; exponent_ = 0; } void Bignum::Align(const Bignum& other) { if (exponent_ > other.exponent_) { // If "X" represents a "hidden" digit (by the exponent) then we are in the // following case (a == this, b == other): // a: aaaaaaXXXX or a: aaaaaXXX // b: bbbbbbX b: bbbbbbbbXX // We replace some of the hidden digits (X) of a with 0 digits. 
// a: aaaaaa000X or a: aaaaa0XX int zero_digits = exponent_ - other.exponent_; EnsureCapacity(used_digits_ + zero_digits); for (int i = used_digits_ - 1; i >= 0; --i) { bigits_[i + zero_digits] = bigits_[i]; } for (int i = 0; i < zero_digits; ++i) { bigits_[i] = 0; } used_digits_ += zero_digits; exponent_ -= zero_digits; ASSERT(used_digits_ >= 0); ASSERT(exponent_ >= 0); } } void Bignum::BigitsShiftLeft(int shift_amount) { ASSERT(shift_amount < kBigitSize); ASSERT(shift_amount >= 0); Chunk carry = 0; for (int i = 0; i < used_digits_; ++i) { Chunk new_carry = bigits_[i] >> (kBigitSize - shift_amount); bigits_[i] = ((bigits_[i] << shift_amount) + carry) & kBigitMask; carry = new_carry; } if (carry != 0) { bigits_[used_digits_] = carry; used_digits_++; } } void Bignum::SubtractTimes(const Bignum& other, int factor) { ASSERT(exponent_ <= other.exponent_); if (factor < 3) { for (int i = 0; i < factor; ++i) { SubtractBignum(other); } return; } Chunk borrow = 0; int exponent_diff = other.exponent_ - exponent_; for (int i = 0; i < other.used_digits_; ++i) { DoubleChunk product = static_cast(factor) * other.bigits_[i]; DoubleChunk remove = borrow + product; Chunk difference = bigits_[i + exponent_diff] - (remove & kBigitMask); bigits_[i + exponent_diff] = difference & kBigitMask; borrow = static_cast((difference >> (kChunkSize - 1)) + (remove >> kBigitSize)); } for (int i = other.used_digits_ + exponent_diff; i < used_digits_; ++i) { if (borrow == 0) return; Chunk difference = bigits_[i] - borrow; bigits_[i] = difference & kBigitMask; borrow = difference >> (kChunkSize - 1); ++i; } Clamp(); } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/bignum.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_BIGNUM_H_ #define DOUBLE_CONVERSION_BIGNUM_H_ #include "utils.h" namespace double_conversion { class Bignum { public: // 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately. 
// This bignum can encode much bigger numbers, since it contains an // exponent. static const int kMaxSignificantBits = 3584; Bignum(); void AssignUInt16(uint16_t value); void AssignUInt64(uint64_t value); void AssignBignum(const Bignum& other); void AssignDecimalString(Vector<const char> value); void AssignHexString(Vector<const char> value); void AssignPowerUInt16(uint16_t base, int exponent); void AddUInt16(uint16_t operand); void AddUInt64(uint64_t operand); void AddBignum(const Bignum& other); // Precondition: this >= other. void SubtractBignum(const Bignum& other); void Square(); void ShiftLeft(int shift_amount); void MultiplyByUInt32(uint32_t factor); void MultiplyByUInt64(uint64_t factor); void MultiplyByPowerOfTen(int exponent); void Times10() { return MultiplyByUInt32(10); } // Pseudocode: // int result = this / other; // this = this % other; // In the worst case this function is in O(this/other). uint16_t DivideModuloIntBignum(const Bignum& other); bool ToHexString(char* buffer, int buffer_size) const; // Returns // -1 if a < b, // 0 if a == b, and // +1 if a > b. static int Compare(const Bignum& a, const Bignum& b); static bool Equal(const Bignum& a, const Bignum& b) { return Compare(a, b) == 0; } static bool LessEqual(const Bignum& a, const Bignum& b) { return Compare(a, b) <= 0; } static bool Less(const Bignum& a, const Bignum& b) { return Compare(a, b) < 0; } // Returns Compare(a + b, c); static int PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c); // Returns a + b == c static bool PlusEqual(const Bignum& a, const Bignum& b, const Bignum& c) { return PlusCompare(a, b, c) == 0; } // Returns a + b <= c static bool PlusLessEqual(const Bignum& a, const Bignum& b, const Bignum& c) { return PlusCompare(a, b, c) <= 0; } // Returns a + b < c static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) { return PlusCompare(a, b, c) < 0; } private: typedef uint32_t Chunk; typedef uint64_t DoubleChunk; static const int kChunkSize = sizeof(Chunk) * 8; static const int kDoubleChunkSize = sizeof(DoubleChunk) * 8; // With bigit size of 28 we lose some bits, but a double still fits easily // into two chunks, and more importantly we can use the Comba multiplication. static const int kBigitSize = 28; static const Chunk kBigitMask = (1 << kBigitSize) - 1; // Every instance allocates kBigitLength chunks on the stack. Bignums cannot // grow. There are no checks if the stack-allocated space is sufficient. static const int kBigitCapacity = kMaxSignificantBits / kBigitSize; void EnsureCapacity(int size) { if (size > kBigitCapacity) { UNREACHABLE(); } } void Align(const Bignum& other); void Clamp(); bool IsClamped() const; void Zero(); // Requires this to have enough capacity (no tests done). // Updates used_digits_ if necessary. // shift_amount must be < kBigitSize. void BigitsShiftLeft(int shift_amount); // BigitLength includes the "hidden" digits encoded in the exponent. int BigitLength() const { return used_digits_ + exponent_; } Chunk BigitAt(int index) const; void SubtractTimes(const Bignum& other, int factor); Chunk bigits_buffer_[kBigitCapacity]; // A vector backed by bigits_buffer_. This way accesses to the array are // checked for out-of-bounds errors. Vector<Chunk> bigits_; int used_digits_; // The Bignum's value equals value(bigits_) * 2^(exponent_ * kBigitSize).
int exponent_; DISALLOW_COPY_AND_ASSIGN(Bignum); }; } // namespace double_conversion #endif // DOUBLE_CONVERSION_BIGNUM_H_ ================================================ FILE: src/kenlm/util/double-conversion/cached-powers.cc ================================================ // Copyright 2006-2008 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
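// A quick worked example of how the cached-powers table below is indexed (an illustrative note; the constants it uses are defined after the table): cached powers are spaced kDecimalExponentDistance = 8 apart in the decimal exponent, starting at kMinDecimalExponent = -348, so GetCachedPowerForDecimalExponent computes
//   index = (requested_exponent + kCachedPowersOffset) / kDecimalExponentDistance;
// For requested_exponent = 20 this gives (20 + 348) / 8 = 46, the entry whose decimal_exponent is 20: 0xad78ebc5ac620000 * 2^3 == 10^20 exactly.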
#include #include #include #include "utils.h" #include "cached-powers.h" namespace double_conversion { struct CachedPower { uint64_t significand; int16_t binary_exponent; int16_t decimal_exponent; }; static const CachedPower kCachedPowers[] = { {UINT64_2PART_C(0xfa8fd5a0, 081c0288), -1220, -348}, {UINT64_2PART_C(0xbaaee17f, a23ebf76), -1193, -340}, {UINT64_2PART_C(0x8b16fb20, 3055ac76), -1166, -332}, {UINT64_2PART_C(0xcf42894a, 5dce35ea), -1140, -324}, {UINT64_2PART_C(0x9a6bb0aa, 55653b2d), -1113, -316}, {UINT64_2PART_C(0xe61acf03, 3d1a45df), -1087, -308}, {UINT64_2PART_C(0xab70fe17, c79ac6ca), -1060, -300}, {UINT64_2PART_C(0xff77b1fc, bebcdc4f), -1034, -292}, {UINT64_2PART_C(0xbe5691ef, 416bd60c), -1007, -284}, {UINT64_2PART_C(0x8dd01fad, 907ffc3c), -980, -276}, {UINT64_2PART_C(0xd3515c28, 31559a83), -954, -268}, {UINT64_2PART_C(0x9d71ac8f, ada6c9b5), -927, -260}, {UINT64_2PART_C(0xea9c2277, 23ee8bcb), -901, -252}, {UINT64_2PART_C(0xaecc4991, 4078536d), -874, -244}, {UINT64_2PART_C(0x823c1279, 5db6ce57), -847, -236}, {UINT64_2PART_C(0xc2109436, 4dfb5637), -821, -228}, {UINT64_2PART_C(0x9096ea6f, 3848984f), -794, -220}, {UINT64_2PART_C(0xd77485cb, 25823ac7), -768, -212}, {UINT64_2PART_C(0xa086cfcd, 97bf97f4), -741, -204}, {UINT64_2PART_C(0xef340a98, 172aace5), -715, -196}, {UINT64_2PART_C(0xb23867fb, 2a35b28e), -688, -188}, {UINT64_2PART_C(0x84c8d4df, d2c63f3b), -661, -180}, {UINT64_2PART_C(0xc5dd4427, 1ad3cdba), -635, -172}, {UINT64_2PART_C(0x936b9fce, bb25c996), -608, -164}, {UINT64_2PART_C(0xdbac6c24, 7d62a584), -582, -156}, {UINT64_2PART_C(0xa3ab6658, 0d5fdaf6), -555, -148}, {UINT64_2PART_C(0xf3e2f893, dec3f126), -529, -140}, {UINT64_2PART_C(0xb5b5ada8, aaff80b8), -502, -132}, {UINT64_2PART_C(0x87625f05, 6c7c4a8b), -475, -124}, {UINT64_2PART_C(0xc9bcff60, 34c13053), -449, -116}, {UINT64_2PART_C(0x964e858c, 91ba2655), -422, -108}, {UINT64_2PART_C(0xdff97724, 70297ebd), -396, -100}, {UINT64_2PART_C(0xa6dfbd9f, b8e5b88f), -369, -92}, {UINT64_2PART_C(0xf8a95fcf, 88747d94), -343, -84}, {UINT64_2PART_C(0xb9447093, 8fa89bcf), -316, -76}, {UINT64_2PART_C(0x8a08f0f8, bf0f156b), -289, -68}, {UINT64_2PART_C(0xcdb02555, 653131b6), -263, -60}, {UINT64_2PART_C(0x993fe2c6, d07b7fac), -236, -52}, {UINT64_2PART_C(0xe45c10c4, 2a2b3b06), -210, -44}, {UINT64_2PART_C(0xaa242499, 697392d3), -183, -36}, {UINT64_2PART_C(0xfd87b5f2, 8300ca0e), -157, -28}, {UINT64_2PART_C(0xbce50864, 92111aeb), -130, -20}, {UINT64_2PART_C(0x8cbccc09, 6f5088cc), -103, -12}, {UINT64_2PART_C(0xd1b71758, e219652c), -77, -4}, {UINT64_2PART_C(0x9c400000, 00000000), -50, 4}, {UINT64_2PART_C(0xe8d4a510, 00000000), -24, 12}, {UINT64_2PART_C(0xad78ebc5, ac620000), 3, 20}, {UINT64_2PART_C(0x813f3978, f8940984), 30, 28}, {UINT64_2PART_C(0xc097ce7b, c90715b3), 56, 36}, {UINT64_2PART_C(0x8f7e32ce, 7bea5c70), 83, 44}, {UINT64_2PART_C(0xd5d238a4, abe98068), 109, 52}, {UINT64_2PART_C(0x9f4f2726, 179a2245), 136, 60}, {UINT64_2PART_C(0xed63a231, d4c4fb27), 162, 68}, {UINT64_2PART_C(0xb0de6538, 8cc8ada8), 189, 76}, {UINT64_2PART_C(0x83c7088e, 1aab65db), 216, 84}, {UINT64_2PART_C(0xc45d1df9, 42711d9a), 242, 92}, {UINT64_2PART_C(0x924d692c, a61be758), 269, 100}, {UINT64_2PART_C(0xda01ee64, 1a708dea), 295, 108}, {UINT64_2PART_C(0xa26da399, 9aef774a), 322, 116}, {UINT64_2PART_C(0xf209787b, b47d6b85), 348, 124}, {UINT64_2PART_C(0xb454e4a1, 79dd1877), 375, 132}, {UINT64_2PART_C(0x865b8692, 5b9bc5c2), 402, 140}, {UINT64_2PART_C(0xc83553c5, c8965d3d), 428, 148}, {UINT64_2PART_C(0x952ab45c, fa97a0b3), 455, 156}, {UINT64_2PART_C(0xde469fbd, 99a05fe3), 481, 
164}, {UINT64_2PART_C(0xa59bc234, db398c25), 508, 172}, {UINT64_2PART_C(0xf6c69a72, a3989f5c), 534, 180}, {UINT64_2PART_C(0xb7dcbf53, 54e9bece), 561, 188}, {UINT64_2PART_C(0x88fcf317, f22241e2), 588, 196}, {UINT64_2PART_C(0xcc20ce9b, d35c78a5), 614, 204}, {UINT64_2PART_C(0x98165af3, 7b2153df), 641, 212}, {UINT64_2PART_C(0xe2a0b5dc, 971f303a), 667, 220}, {UINT64_2PART_C(0xa8d9d153, 5ce3b396), 694, 228}, {UINT64_2PART_C(0xfb9b7cd9, a4a7443c), 720, 236}, {UINT64_2PART_C(0xbb764c4c, a7a44410), 747, 244}, {UINT64_2PART_C(0x8bab8eef, b6409c1a), 774, 252}, {UINT64_2PART_C(0xd01fef10, a657842c), 800, 260}, {UINT64_2PART_C(0x9b10a4e5, e9913129), 827, 268}, {UINT64_2PART_C(0xe7109bfb, a19c0c9d), 853, 276}, {UINT64_2PART_C(0xac2820d9, 623bf429), 880, 284}, {UINT64_2PART_C(0x80444b5e, 7aa7cf85), 907, 292}, {UINT64_2PART_C(0xbf21e440, 03acdd2d), 933, 300}, {UINT64_2PART_C(0x8e679c2f, 5e44ff8f), 960, 308}, {UINT64_2PART_C(0xd433179d, 9c8cb841), 986, 316}, {UINT64_2PART_C(0x9e19db92, b4e31ba9), 1013, 324}, {UINT64_2PART_C(0xeb96bf6e, badf77d9), 1039, 332}, {UINT64_2PART_C(0xaf87023b, 9bf0ee6b), 1066, 340}, }; static const int kCachedPowersLength = ARRAY_SIZE(kCachedPowers); static const int kCachedPowersOffset = 348; // -1 * the first decimal_exponent. static const double kD_1_LOG2_10 = 0.30102999566398114; // 1 / lg(10) // Difference between the decimal exponents in the table above. const int PowersOfTenCache::kDecimalExponentDistance = 8; const int PowersOfTenCache::kMinDecimalExponent = -348; const int PowersOfTenCache::kMaxDecimalExponent = 340; void PowersOfTenCache::GetCachedPowerForBinaryExponentRange( int min_exponent, int max_exponent, DiyFp* power, int* decimal_exponent) { int kQ = DiyFp::kSignificandSize; double k = ceil((min_exponent + kQ - 1) * kD_1_LOG2_10); int foo = kCachedPowersOffset; int index = (foo + static_cast(k) - 1) / kDecimalExponentDistance + 1; ASSERT(0 <= index && index < kCachedPowersLength); CachedPower cached_power = kCachedPowers[index]; ASSERT(min_exponent <= cached_power.binary_exponent); ASSERT(cached_power.binary_exponent <= max_exponent); *decimal_exponent = cached_power.decimal_exponent; *power = DiyFp(cached_power.significand, cached_power.binary_exponent); } void PowersOfTenCache::GetCachedPowerForDecimalExponent(int requested_exponent, DiyFp* power, int* found_exponent) { ASSERT(kMinDecimalExponent <= requested_exponent); ASSERT(requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance); int index = (requested_exponent + kCachedPowersOffset) / kDecimalExponentDistance; CachedPower cached_power = kCachedPowers[index]; *power = DiyFp(cached_power.significand, cached_power.binary_exponent); *found_exponent = cached_power.decimal_exponent; ASSERT(*found_exponent <= requested_exponent); ASSERT(requested_exponent < *found_exponent + kDecimalExponentDistance); } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/cached-powers.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. 
// * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_CACHED_POWERS_H_ #define DOUBLE_CONVERSION_CACHED_POWERS_H_ #include "diy-fp.h" namespace double_conversion { class PowersOfTenCache { public: // Not all powers of ten are cached. The decimal exponent of two neighboring // cached numbers will differ by kDecimalExponentDistance. static const int kDecimalExponentDistance; static const int kMinDecimalExponent; static const int kMaxDecimalExponent; // Returns a cached power-of-ten with a binary exponent in the range // [min_exponent; max_exponent] (boundaries included). static void GetCachedPowerForBinaryExponentRange(int min_exponent, int max_exponent, DiyFp* power, int* decimal_exponent); // Returns a cached power of ten x ~= 10^k such that // k <= decimal_exponent < k + kCachedPowersDecimalDistance. // The given decimal_exponent must satisfy // kMinDecimalExponent <= requested_exponent, and // requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance. static void GetCachedPowerForDecimalExponent(int requested_exponent, DiyFp* power, int* found_exponent); }; } // namespace double_conversion #endif // DOUBLE_CONVERSION_CACHED_POWERS_H_ ================================================ FILE: src/kenlm/util/double-conversion/diy-fp.cc ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "diy-fp.h" #include "utils.h" namespace double_conversion { void DiyFp::Multiply(const DiyFp& other) { // Simply "emulates" a 128 bit multiplication. // However: the resulting number only contains 64 bits. The least // significant 64 bits are only used for rounding the most significant 64 // bits. const uint64_t kM32 = 0xFFFFFFFFU; uint64_t a = f_ >> 32; uint64_t b = f_ & kM32; uint64_t c = other.f_ >> 32; uint64_t d = other.f_ & kM32; uint64_t ac = a * c; uint64_t bc = b * c; uint64_t ad = a * d; uint64_t bd = b * d; uint64_t tmp = (bd >> 32) + (ad & kM32) + (bc & kM32); // By adding 1U << 31 to tmp we round the final result. // Halfway cases will be round up. tmp += 1U << 31; uint64_t result_f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32); e_ += other.e_ + 64; f_ = result_f; } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/diy-fp.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_DIY_FP_H_ #define DOUBLE_CONVERSION_DIY_FP_H_ #include "utils.h" namespace double_conversion { // This "Do It Yourself Floating Point" class implements a floating-point number // with a uint64 significand and an int exponent. Normalized DiyFp numbers will // have the most significant bit of the significand set. // Multiplication and Subtraction do not normalize their results. // DiyFp are not designed to contain special doubles (NaN and Infinity). 
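// A concrete illustration of this representation (an illustrative sketch; the values follow from the definitions below): the normalized DiyFp for the double 1.0 stores f = 0x8000000000000000 (that is, 1 << 63) and e = -63, so that 1.0 == f * 2^e with the most significant bit of the significand set.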
class DiyFp { public: static const int kSignificandSize = 64; DiyFp() : f_(0), e_(0) {} DiyFp(uint64_t f, int e) : f_(f), e_(e) {} // this = this - other. // The exponents of both numbers must be the same and the significand of this // must be bigger than the significand of other. // The result will not be normalized. void Subtract(const DiyFp& other) { ASSERT(e_ == other.e_); ASSERT(f_ >= other.f_); f_ -= other.f_; } // Returns a - b. // The exponents of both numbers must be the same and this must be bigger // than other. The result will not be normalized. static DiyFp Minus(const DiyFp& a, const DiyFp& b) { DiyFp result = a; result.Subtract(b); return result; } // this = this * other. void Multiply(const DiyFp& other); // returns a * b; static DiyFp Times(const DiyFp& a, const DiyFp& b) { DiyFp result = a; result.Multiply(b); return result; } void Normalize() { ASSERT(f_ != 0); uint64_t f = f_; int e = e_; // This method is mainly called for normalizing boundaries. In general // boundaries need to be shifted by 10 bits. We thus optimize for this case. const uint64_t k10MSBits = UINT64_2PART_C(0xFFC00000, 00000000); while ((f & k10MSBits) == 0) { f <<= 10; e -= 10; } while ((f & kUint64MSB) == 0) { f <<= 1; e--; } f_ = f; e_ = e; } static DiyFp Normalize(const DiyFp& a) { DiyFp result = a; result.Normalize(); return result; } uint64_t f() const { return f_; } int e() const { return e_; } void set_f(uint64_t new_value) { f_ = new_value; } void set_e(int new_value) { e_ = new_value; } private: static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000); uint64_t f_; int e_; }; } // namespace double_conversion #endif // DOUBLE_CONVERSION_DIY_FP_H_ ================================================ FILE: src/kenlm/util/double-conversion/double-conversion.cc ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
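// Minimal usage sketch for the converter implemented in this file (assumes the StringBuilder helper declared in utils.h; the buffer size here is only illustrative):
//   char buf[64];
//   StringBuilder builder(buf, sizeof(buf));
//   DoubleToStringConverter::EcmaScriptConverter().ToShortest(0.1, &builder);
//   // builder.Finalize() then yields the null-terminated string "0.1".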
#include #include #include "double-conversion.h" #include "bignum-dtoa.h" #include "fast-dtoa.h" #include "fixed-dtoa.h" #include "ieee.h" #include "strtod.h" #include "utils.h" namespace double_conversion { const DoubleToStringConverter& DoubleToStringConverter::EcmaScriptConverter() { int flags = UNIQUE_ZERO | EMIT_POSITIVE_EXPONENT_SIGN; static DoubleToStringConverter converter(flags, "Infinity", "NaN", 'e', -6, 21, 6, 0); return converter; } bool DoubleToStringConverter::HandleSpecialValues( double value, StringBuilder* result_builder) const { Double double_inspect(value); if (double_inspect.IsInfinite()) { if (infinity_symbol_ == NULL) return false; if (value < 0) { result_builder->AddCharacter('-'); } result_builder->AddString(infinity_symbol_); return true; } if (double_inspect.IsNan()) { if (nan_symbol_ == NULL) return false; result_builder->AddString(nan_symbol_); return true; } return false; } void DoubleToStringConverter::CreateExponentialRepresentation( const char* decimal_digits, int length, int exponent, StringBuilder* result_builder) const { ASSERT(length != 0); result_builder->AddCharacter(decimal_digits[0]); if (length != 1) { result_builder->AddCharacter('.'); result_builder->AddSubstring(&decimal_digits[1], length-1); } result_builder->AddCharacter(exponent_character_); if (exponent < 0) { result_builder->AddCharacter('-'); exponent = -exponent; } else { if ((flags_ & EMIT_POSITIVE_EXPONENT_SIGN) != 0) { result_builder->AddCharacter('+'); } } if (exponent == 0) { result_builder->AddCharacter('0'); return; } ASSERT(exponent < 1e4); const int kMaxExponentLength = 5; char buffer[kMaxExponentLength + 1]; buffer[kMaxExponentLength] = '\0'; int first_char_pos = kMaxExponentLength; while (exponent > 0) { buffer[--first_char_pos] = '0' + (exponent % 10); exponent /= 10; } result_builder->AddSubstring(&buffer[first_char_pos], kMaxExponentLength - first_char_pos); } void DoubleToStringConverter::CreateDecimalRepresentation( const char* decimal_digits, int length, int decimal_point, int digits_after_point, StringBuilder* result_builder) const { // Create a representation that is padded with zeros if needed. if (decimal_point <= 0) { // "0.00000decimal_rep". 
result_builder->AddCharacter('0'); if (digits_after_point > 0) { result_builder->AddCharacter('.'); result_builder->AddPadding('0', -decimal_point); ASSERT(length <= digits_after_point - (-decimal_point)); result_builder->AddSubstring(decimal_digits, length); int remaining_digits = digits_after_point - (-decimal_point) - length; result_builder->AddPadding('0', remaining_digits); } } else if (decimal_point >= length) { // "decimal_rep0000.00000" or "decimal_rep.0000" result_builder->AddSubstring(decimal_digits, length); result_builder->AddPadding('0', decimal_point - length); if (digits_after_point > 0) { result_builder->AddCharacter('.'); result_builder->AddPadding('0', digits_after_point); } } else { // "decima.l_rep000" ASSERT(digits_after_point > 0); result_builder->AddSubstring(decimal_digits, decimal_point); result_builder->AddCharacter('.'); ASSERT(length - decimal_point <= digits_after_point); result_builder->AddSubstring(&decimal_digits[decimal_point], length - decimal_point); int remaining_digits = digits_after_point - (length - decimal_point); result_builder->AddPadding('0', remaining_digits); } if (digits_after_point == 0) { if ((flags_ & EMIT_TRAILING_DECIMAL_POINT) != 0) { result_builder->AddCharacter('.'); } if ((flags_ & EMIT_TRAILING_ZERO_AFTER_POINT) != 0) { result_builder->AddCharacter('0'); } } } bool DoubleToStringConverter::ToShortestIeeeNumber( double value, StringBuilder* result_builder, DoubleToStringConverter::DtoaMode mode) const { ASSERT(mode == SHORTEST || mode == SHORTEST_SINGLE); if (Double(value).IsSpecial()) { return HandleSpecialValues(value, result_builder); } int decimal_point; bool sign; const int kDecimalRepCapacity = kBase10MaximalLength + 1; char decimal_rep[kDecimalRepCapacity]; int decimal_rep_length; DoubleToAscii(value, mode, 0, decimal_rep, kDecimalRepCapacity, &sign, &decimal_rep_length, &decimal_point); bool unique_zero = (flags_ & UNIQUE_ZERO) != 0; if (sign && (value != 0.0 || !unique_zero)) { result_builder->AddCharacter('-'); } int exponent = decimal_point - 1; if ((decimal_in_shortest_low_ <= exponent) && (exponent < decimal_in_shortest_high_)) { CreateDecimalRepresentation(decimal_rep, decimal_rep_length, decimal_point, Max(0, decimal_rep_length - decimal_point), result_builder); } else { CreateExponentialRepresentation(decimal_rep, decimal_rep_length, exponent, result_builder); } return true; } bool DoubleToStringConverter::ToFixed(double value, int requested_digits, StringBuilder* result_builder) const { ASSERT(kMaxFixedDigitsBeforePoint == 60); const double kFirstNonFixed = 1e60; if (Double(value).IsSpecial()) { return HandleSpecialValues(value, result_builder); } if (requested_digits > kMaxFixedDigitsAfterPoint) return false; if (value >= kFirstNonFixed || value <= -kFirstNonFixed) return false; // Find a sufficiently precise decimal representation of n. int decimal_point; bool sign; // Add space for the '\0' byte. 
const int kDecimalRepCapacity = kMaxFixedDigitsBeforePoint + kMaxFixedDigitsAfterPoint + 1; char decimal_rep[kDecimalRepCapacity]; int decimal_rep_length; DoubleToAscii(value, FIXED, requested_digits, decimal_rep, kDecimalRepCapacity, &sign, &decimal_rep_length, &decimal_point); bool unique_zero = ((flags_ & UNIQUE_ZERO) != 0); if (sign && (value != 0.0 || !unique_zero)) { result_builder->AddCharacter('-'); } CreateDecimalRepresentation(decimal_rep, decimal_rep_length, decimal_point, requested_digits, result_builder); return true; } bool DoubleToStringConverter::ToExponential( double value, int requested_digits, StringBuilder* result_builder) const { if (Double(value).IsSpecial()) { return HandleSpecialValues(value, result_builder); } if (requested_digits < -1) return false; if (requested_digits > kMaxExponentialDigits) return false; int decimal_point; bool sign; // Add space for digit before the decimal point and the '\0' character. const int kDecimalRepCapacity = kMaxExponentialDigits + 2; ASSERT(kDecimalRepCapacity > kBase10MaximalLength); char decimal_rep[kDecimalRepCapacity]; int decimal_rep_length; if (requested_digits == -1) { DoubleToAscii(value, SHORTEST, 0, decimal_rep, kDecimalRepCapacity, &sign, &decimal_rep_length, &decimal_point); } else { DoubleToAscii(value, PRECISION, requested_digits + 1, decimal_rep, kDecimalRepCapacity, &sign, &decimal_rep_length, &decimal_point); ASSERT(decimal_rep_length <= requested_digits + 1); for (int i = decimal_rep_length; i < requested_digits + 1; ++i) { decimal_rep[i] = '0'; } decimal_rep_length = requested_digits + 1; } bool unique_zero = ((flags_ & UNIQUE_ZERO) != 0); if (sign && (value != 0.0 || !unique_zero)) { result_builder->AddCharacter('-'); } int exponent = decimal_point - 1; CreateExponentialRepresentation(decimal_rep, decimal_rep_length, exponent, result_builder); return true; } bool DoubleToStringConverter::ToPrecision(double value, int precision, StringBuilder* result_builder) const { if (Double(value).IsSpecial()) { return HandleSpecialValues(value, result_builder); } if (precision < kMinPrecisionDigits || precision > kMaxPrecisionDigits) { return false; } // Find a sufficiently precise decimal representation of n. int decimal_point; bool sign; // Add one for the terminating null character. const int kDecimalRepCapacity = kMaxPrecisionDigits + 1; char decimal_rep[kDecimalRepCapacity]; int decimal_rep_length; DoubleToAscii(value, PRECISION, precision, decimal_rep, kDecimalRepCapacity, &sign, &decimal_rep_length, &decimal_point); ASSERT(decimal_rep_length <= precision); bool unique_zero = ((flags_ & UNIQUE_ZERO) != 0); if (sign && (value != 0.0 || !unique_zero)) { result_builder->AddCharacter('-'); } // The exponent if we print the number as x.xxeyyy. That is with the // decimal point after the first digit. int exponent = decimal_point - 1; int extra_zero = ((flags_ & EMIT_TRAILING_ZERO_AFTER_POINT) != 0) ? 1 : 0; if ((-decimal_point + 1 > max_leading_padding_zeroes_in_precision_mode_) || (decimal_point - precision + extra_zero > max_trailing_padding_zeroes_in_precision_mode_)) { // Fill buffer to contain 'precision' digits. // Usually the buffer is already at the correct length, but 'DoubleToAscii' // is allowed to return less characters. 
for (int i = decimal_rep_length; i < precision; ++i) { decimal_rep[i] = '0'; } CreateExponentialRepresentation(decimal_rep, precision, exponent, result_builder); } else { CreateDecimalRepresentation(decimal_rep, decimal_rep_length, decimal_point, Max(0, precision - decimal_point), result_builder); } return true; } static BignumDtoaMode DtoaToBignumDtoaMode( DoubleToStringConverter::DtoaMode dtoa_mode) { switch (dtoa_mode) { case DoubleToStringConverter::SHORTEST: return BIGNUM_DTOA_SHORTEST; case DoubleToStringConverter::SHORTEST_SINGLE: return BIGNUM_DTOA_SHORTEST_SINGLE; case DoubleToStringConverter::FIXED: return BIGNUM_DTOA_FIXED; case DoubleToStringConverter::PRECISION: return BIGNUM_DTOA_PRECISION; default: UNREACHABLE(); return BIGNUM_DTOA_SHORTEST; // To silence compiler. } } void DoubleToStringConverter::DoubleToAscii(double v, DtoaMode mode, int requested_digits, char* buffer, int buffer_length, bool* sign, int* length, int* point) { Vector vector(buffer, buffer_length); ASSERT(!Double(v).IsSpecial()); ASSERT(mode == SHORTEST || mode == SHORTEST_SINGLE || requested_digits >= 0); if (Double(v).Sign() < 0) { *sign = true; v = -v; } else { *sign = false; } if (mode == PRECISION && requested_digits == 0) { vector[0] = '\0'; *length = 0; return; } if (v == 0) { vector[0] = '0'; vector[1] = '\0'; *length = 1; *point = 1; return; } bool fast_worked; switch (mode) { case SHORTEST: fast_worked = FastDtoa(v, FAST_DTOA_SHORTEST, 0, vector, length, point); break; case SHORTEST_SINGLE: fast_worked = FastDtoa(v, FAST_DTOA_SHORTEST_SINGLE, 0, vector, length, point); break; case FIXED: fast_worked = FastFixedDtoa(v, requested_digits, vector, length, point); break; case PRECISION: fast_worked = FastDtoa(v, FAST_DTOA_PRECISION, requested_digits, vector, length, point); break; default: UNREACHABLE(); fast_worked = false; } if (fast_worked) return; // If the fast dtoa didn't succeed use the slower bignum version. BignumDtoaMode bignum_mode = DtoaToBignumDtoaMode(mode); BignumDtoa(v, bignum_mode, requested_digits, vector, length, point); vector[*length] = '\0'; } // Consumes the given substring from the iterator. // Returns false, if the substring does not match. static bool ConsumeSubString(const char** current, const char* end, const char* substring) { ASSERT(**current == *substring); for (substring++; *substring != '\0'; substring++) { ++*current; if (*current == end || **current != *substring) return false; } ++*current; return true; } // Maximum number of significant digits in decimal representation. // The longest possible double in decimal representation is // (2^53 - 1) * 2 ^ -1074 that is (2 ^ 53 - 1) * 5 ^ 1074 / 10 ^ 1074 // (768 digits). If we parse a number whose first digits are equal to a // mean of 2 adjacent doubles (that could have up to 769 digits) the result // must be rounded to the bigger one unless the tail consists of zeros, so // we don't need to preserve all the digits. const int kMaxSignificantDigits = 772; // Returns true if a nonspace found and false if the end has reached. static inline bool AdvanceToNonspace(const char** current, const char* end) { while (*current != end) { if (**current != ' ') return true; ++*current; } return false; } static bool isDigit(int x, int radix) { return (x >= '0' && x <= '9' && x < '0' + radix) || (radix > 10 && x >= 'a' && x < 'a' + radix - 10) || (radix > 10 && x >= 'A' && x < 'A' + radix - 10); } static double SignedZero(bool sign) { return sign ? -0.0 : 0.0; } // Parsing integers with radix 2, 4, 8, 16, 32. Assumes current != end. 
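// Illustrative trace of the function below (added example, not exhaustive): parsing the hex digits "ff" with radix_log_2 == 4 accumulates number = 15, then number = 15 * 16 + 15 = 255; this never exceeds the 53-bit significand, so no rounding is needed and the function returns 255.0.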
template <int radix_log_2> static double RadixStringToIeee(const char* current, const char* end, bool sign, bool allow_trailing_junk, double junk_string_value, bool read_as_double, const char** trailing_pointer) { ASSERT(current != end); const int kDoubleSize = Double::kSignificandSize; const int kSingleSize = Single::kSignificandSize; const int kSignificandSize = read_as_double? kDoubleSize: kSingleSize; // Skip leading 0s. while (*current == '0') { ++current; if (current == end) { *trailing_pointer = end; return SignedZero(sign); } } int64_t number = 0; int exponent = 0; const int radix = (1 << radix_log_2); do { int digit; if (*current >= '0' && *current <= '9' && *current < '0' + radix) { digit = static_cast<char>(*current) - '0'; } else if (radix > 10 && *current >= 'a' && *current < 'a' + radix - 10) { digit = static_cast<char>(*current) - 'a' + 10; } else if (radix > 10 && *current >= 'A' && *current < 'A' + radix - 10) { digit = static_cast<char>(*current) - 'A' + 10; } else { if (allow_trailing_junk || !AdvanceToNonspace(&current, end)) { break; } else { return junk_string_value; } } number = number * radix + digit; int overflow = static_cast<int>(number >> kSignificandSize); if (overflow != 0) { // Overflow occurred. Need to determine which direction to round the // result. int overflow_bits_count = 1; while (overflow > 1) { overflow_bits_count++; overflow >>= 1; } int dropped_bits_mask = ((1 << overflow_bits_count) - 1); int dropped_bits = static_cast<int>(number) & dropped_bits_mask; number >>= overflow_bits_count; exponent = overflow_bits_count; bool zero_tail = true; while (true) { ++current; if (current == end || !isDigit(*current, radix)) break; zero_tail = zero_tail && *current == '0'; exponent += radix_log_2; } if (!allow_trailing_junk && AdvanceToNonspace(&current, end)) { return junk_string_value; } int middle_value = (1 << (overflow_bits_count - 1)); if (dropped_bits > middle_value) { number++; // Rounding up. } else if (dropped_bits == middle_value) { // Rounding to even for consistency with decimals: half-way case rounds // up if significant part is odd and down otherwise. if ((number & 1) != 0 || !zero_tail) { number++; // Rounding up. } } // Rounding up may cause overflow. if ((number & ((int64_t)1 << kSignificandSize)) != 0) { exponent++; number >>= 1; } break; } ++current; } while (current != end); ASSERT(number < ((int64_t)1 << kSignificandSize)); ASSERT(static_cast<int64_t>(static_cast<double>(number)) == number); *trailing_pointer = current; if (exponent == 0) { if (sign) { if (number == 0) return -0.0; number = -number; } return static_cast<double>(number); } ASSERT(number != 0); return Double(DiyFp(number, exponent)).value(); } double StringToDoubleConverter::StringToIeee( const char* input, int length, int* processed_characters_count, bool read_as_double) const { const char* current = input; const char* end = input + length; *processed_characters_count = 0; const bool allow_trailing_junk = (flags_ & ALLOW_TRAILING_JUNK) != 0; const bool allow_leading_spaces = (flags_ & ALLOW_LEADING_SPACES) != 0; const bool allow_trailing_spaces = (flags_ & ALLOW_TRAILING_SPACES) != 0; const bool allow_spaces_after_sign = (flags_ & ALLOW_SPACES_AFTER_SIGN) != 0; // To make sure that iterator dereferencing is valid the following // convention is used: // 1. Each '++current' statement is followed by check for equality to 'end'. // 2. If AdvanceToNonspace returned false then current == end. // 3. If 'current' becomes equal to 'end' the function returns or goes to // 'parsing_done'. // 4. 'current' is not dereferenced after the 'parsing_done' label. // 5.
Code before 'parsing_done' may rely on 'current != end'. if (current == end) return empty_string_value_; if (allow_leading_spaces || allow_trailing_spaces) { if (!AdvanceToNonspace(&current, end)) { *processed_characters_count = current - input; return empty_string_value_; } if (!allow_leading_spaces && (input != current)) { // No leading spaces allowed, but AdvanceToNonspace moved forward. return junk_string_value_; } } // The longest form of simplified number is: "-.1eXXX\0". const int kBufferSize = kMaxSignificantDigits + 10; char buffer[kBufferSize]; // NOLINT: size is known at compile time. int buffer_pos = 0; // Exponent will be adjusted if insignificant digits of the integer part // or insignificant leading zeros of the fractional part are dropped. int exponent = 0; int significant_digits = 0; int insignificant_digits = 0; bool nonzero_digit_dropped = false; bool sign = false; if (*current == '+' || *current == '-') { sign = (*current == '-'); ++current; const char* next_non_space = current; // Skip following spaces (if allowed). if (!AdvanceToNonspace(&next_non_space, end)) return junk_string_value_; if (!allow_spaces_after_sign && (current != next_non_space)) { return junk_string_value_; } current = next_non_space; } if (infinity_symbol_ != NULL) { if (*current == infinity_symbol_[0]) { if (!ConsumeSubString(&current, end, infinity_symbol_)) { return junk_string_value_; } if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { return junk_string_value_; } if (!allow_trailing_junk && AdvanceToNonspace(&current, end)) { return junk_string_value_; } ASSERT(buffer_pos == 0); *processed_characters_count = current - input; return sign ? -Double::Infinity() : Double::Infinity(); } } if (nan_symbol_ != NULL) { if (*current == nan_symbol_[0]) { if (!ConsumeSubString(&current, end, nan_symbol_)) { return junk_string_value_; } if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { return junk_string_value_; } if (!allow_trailing_junk && AdvanceToNonspace(&current, end)) { return junk_string_value_; } ASSERT(buffer_pos == 0); *processed_characters_count = current - input; return sign ? -Double::NaN() : Double::NaN(); } } bool leading_zero = false; if (*current == '0') { ++current; if (current == end) { *processed_characters_count = current - input; return SignedZero(sign); } leading_zero = true; // It could be hexadecimal value. if ((flags_ & ALLOW_HEX) && (*current == 'x' || *current == 'X')) { ++current; if (current == end || !isDigit(*current, 16)) { return junk_string_value_; // "0x". } const char* tail_pointer = NULL; double result = RadixStringToIeee<4>(current, end, sign, allow_trailing_junk, junk_string_value_, read_as_double, &tail_pointer); if (tail_pointer != NULL) { if (allow_trailing_spaces) AdvanceToNonspace(&tail_pointer, end); *processed_characters_count = tail_pointer - input; } return result; } // Ignore leading zeros in the integer part. while (*current == '0') { ++current; if (current == end) { *processed_characters_count = current - input; return SignedZero(sign); } } } bool octal = leading_zero && (flags_ & ALLOW_OCTALS) != 0; // Copy significant digits of the integer part (if any) to the buffer. while (*current >= '0' && *current <= '9') { if (significant_digits < kMaxSignificantDigits) { ASSERT(buffer_pos < kBufferSize); buffer[buffer_pos++] = static_cast<char>(*current); significant_digits++; // Will later check if it's an octal in the buffer. } else { insignificant_digits++; // Move the digit into the exponential part.
nonzero_digit_dropped = nonzero_digit_dropped || *current != '0'; } octal = octal && *current < '8'; ++current; if (current == end) goto parsing_done; } if (significant_digits == 0) { octal = false; } if (*current == '.') { if (octal && !allow_trailing_junk) return junk_string_value_; if (octal) goto parsing_done; ++current; if (current == end) { if (significant_digits == 0 && !leading_zero) { return junk_string_value_; } else { goto parsing_done; } } if (significant_digits == 0) { // octal = false; // Integer part consists of 0 or is absent. Significant digits start after // leading zeros (if any). while (*current == '0') { ++current; if (current == end) { *processed_characters_count = current - input; return SignedZero(sign); } exponent--; // Move this 0 into the exponent. } } // There is a fractional part. // We don't emit a '.', but adjust the exponent instead. while (*current >= '0' && *current <= '9') { if (significant_digits < kMaxSignificantDigits) { ASSERT(buffer_pos < kBufferSize); buffer[buffer_pos++] = static_cast<char>(*current); significant_digits++; exponent--; } else { // Ignore insignificant digits in the fractional part. nonzero_digit_dropped = nonzero_digit_dropped || *current != '0'; } ++current; if (current == end) goto parsing_done; } } if (!leading_zero && exponent == 0 && significant_digits == 0) { // If leading_zeros is true then the string contains zeros. // If exponent < 0 then string was [+-]\.0*... // If significant_digits != 0 the string is not equal to 0. // Otherwise there are no digits in the string. return junk_string_value_; } // Parse exponential part. if (*current == 'e' || *current == 'E') { if (octal && !allow_trailing_junk) return junk_string_value_; if (octal) goto parsing_done; ++current; if (current == end) { if (allow_trailing_junk) { goto parsing_done; } else { return junk_string_value_; } } char sign = '+'; if (*current == '+' || *current == '-') { sign = static_cast<char>(*current); ++current; if (current == end) { if (allow_trailing_junk) { goto parsing_done; } else { return junk_string_value_; } } } if (current == end || *current < '0' || *current > '9') { if (allow_trailing_junk) { goto parsing_done; } else { return junk_string_value_; } } const int max_exponent = INT_MAX / 2; ASSERT(-max_exponent / 2 <= exponent && exponent <= max_exponent / 2); int num = 0; do { // Check overflow. int digit = *current - '0'; if (num >= max_exponent / 10 && !(num == max_exponent / 10 && digit <= max_exponent % 10)) { num = max_exponent; } else { num = num * 10 + digit; } ++current; } while (current != end && *current >= '0' && *current <= '9'); exponent += (sign == '-' ?
-num : num); } if (!(allow_trailing_spaces || allow_trailing_junk) && (current != end)) { return junk_string_value_; } if (!allow_trailing_junk && AdvanceToNonspace(&current, end)) { return junk_string_value_; } if (allow_trailing_spaces) { AdvanceToNonspace(&current, end); } parsing_done: exponent += insignificant_digits; if (octal) { double result; const char* tail_pointer = NULL; result = RadixStringToIeee<3>(buffer, buffer + buffer_pos, sign, allow_trailing_junk, junk_string_value_, read_as_double, &tail_pointer); ASSERT(tail_pointer != NULL); *processed_characters_count = current - input; return result; } if (nonzero_digit_dropped) { buffer[buffer_pos++] = '1'; exponent--; } ASSERT(buffer_pos < kBufferSize); buffer[buffer_pos] = '\0'; double converted; if (read_as_double) { converted = Strtod(Vector<const char>(buffer, buffer_pos), exponent); } else { converted = Strtof(Vector<const char>(buffer, buffer_pos), exponent); } *processed_characters_count = current - input; return sign? -converted: converted; } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/double-conversion.h ================================================ // Copyright 2012 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ #define DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ #include "utils.h" namespace double_conversion { class DoubleToStringConverter { public: // When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint // or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the // function returns false. static const int kMaxFixedDigitsBeforePoint = 60; static const int kMaxFixedDigitsAfterPoint = 60; // When calling ToExponential with a requested_digits // parameter > kMaxExponentialDigits then the function returns false. static const int kMaxExponentialDigits = 120; // When calling ToPrecision with a requested_digits // parameter < kMinPrecisionDigits or requested_digits > kMaxPrecisionDigits // then the function returns false.
static const int kMinPrecisionDigits = 1; static const int kMaxPrecisionDigits = 120; enum Flags { NO_FLAGS = 0, EMIT_POSITIVE_EXPONENT_SIGN = 1, EMIT_TRAILING_DECIMAL_POINT = 2, EMIT_TRAILING_ZERO_AFTER_POINT = 4, UNIQUE_ZERO = 8 }; // Flags should be a bit-or combination of the possible Flags-enum. // - NO_FLAGS: no special flags. // - EMIT_POSITIVE_EXPONENT_SIGN: when the number is converted into exponent // form, emits a '+' for positive exponents. Example: 1.2e+2. // - EMIT_TRAILING_DECIMAL_POINT: when the input number is an integer and is // converted into decimal format then a trailing decimal point is appended. // Example: 2345.0 is converted to "2345.". // - EMIT_TRAILING_ZERO_AFTER_POINT: in addition to a trailing decimal point // emits a trailing '0'-character. This flag requires the // EXMIT_TRAILING_DECIMAL_POINT flag. // Example: 2345.0 is converted to "2345.0". // - UNIQUE_ZERO: "-0.0" is converted to "0.0". // // Infinity symbol and nan_symbol provide the string representation for these // special values. If the string is NULL and the special value is encountered // then the conversion functions return false. // // The exponent_character is used in exponential representations. It is // usually 'e' or 'E'. // // When converting to the shortest representation the converter will // represent input numbers in decimal format if they are in the interval // [10^decimal_in_shortest_low; 10^decimal_in_shortest_high[ // (lower boundary included, greater boundary excluded). // Example: with decimal_in_shortest_low = -6 and // decimal_in_shortest_high = 21: // ToShortest(0.000001) -> "0.000001" // ToShortest(0.0000001) -> "1e-7" // ToShortest(111111111111111111111.0) -> "111111111111111110000" // ToShortest(100000000000000000000.0) -> "100000000000000000000" // ToShortest(1111111111111111111111.0) -> "1.1111111111111111e+21" // // When converting to precision mode the converter may add // max_leading_padding_zeroes before returning the number in exponential // format. // Example with max_leading_padding_zeroes_in_precision_mode = 6. // ToPrecision(0.0000012345, 2) -> "0.0000012" // ToPrecision(0.00000012345, 2) -> "1.2e-7" // Similarily the converter may add up to // max_trailing_padding_zeroes_in_precision_mode in precision mode to avoid // returning an exponential representation. A zero added by the // EMIT_TRAILING_ZERO_AFTER_POINT flag is counted for this limit. // Examples for max_trailing_padding_zeroes_in_precision_mode = 1: // ToPrecision(230.0, 2) -> "230" // ToPrecision(230.0, 2) -> "230." with EMIT_TRAILING_DECIMAL_POINT. // ToPrecision(230.0, 2) -> "2.3e2" with EMIT_TRAILING_ZERO_AFTER_POINT. DoubleToStringConverter(int flags, const char* infinity_symbol, const char* nan_symbol, char exponent_character, int decimal_in_shortest_low, int decimal_in_shortest_high, int max_leading_padding_zeroes_in_precision_mode, int max_trailing_padding_zeroes_in_precision_mode) : flags_(flags), infinity_symbol_(infinity_symbol), nan_symbol_(nan_symbol), exponent_character_(exponent_character), decimal_in_shortest_low_(decimal_in_shortest_low), decimal_in_shortest_high_(decimal_in_shortest_high), max_leading_padding_zeroes_in_precision_mode_( max_leading_padding_zeroes_in_precision_mode), max_trailing_padding_zeroes_in_precision_mode_( max_trailing_padding_zeroes_in_precision_mode) { // When 'trailing zero after the point' is set, then 'trailing point' // must be set too. 
ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) || !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0)); } // Returns a converter following the EcmaScript specification. static const DoubleToStringConverter& EcmaScriptConverter(); // Computes the shortest string of digits that correctly represent the input // number. Depending on decimal_in_shortest_low and decimal_in_shortest_high // (see constructor) it then either returns a decimal representation, or an // exponential representation. // Example with decimal_in_shortest_low = -6, // decimal_in_shortest_high = 21, // EMIT_POSITIVE_EXPONENT_SIGN activated, and // EMIT_TRAILING_DECIMAL_POINT deactived: // ToShortest(0.000001) -> "0.000001" // ToShortest(0.0000001) -> "1e-7" // ToShortest(111111111111111111111.0) -> "111111111111111110000" // ToShortest(100000000000000000000.0) -> "100000000000000000000" // ToShortest(1111111111111111111111.0) -> "1.1111111111111111e+21" // // Note: the conversion may round the output if the returned string // is accurate enough to uniquely identify the input-number. // For example the most precise representation of the double 9e59 equals // "899999999999999918767229449717619953810131273674690656206848", but // the converter will return the shorter (but still correct) "9e59". // // Returns true if the conversion succeeds. The conversion always succeeds // except when the input value is special and no infinity_symbol or // nan_symbol has been given to the constructor. bool ToShortest(double value, StringBuilder* result_builder) const { return ToShortestIeeeNumber(value, result_builder, SHORTEST); } // Same as ToShortest, but for single-precision floats. bool ToShortestSingle(float value, StringBuilder* result_builder) const { return ToShortestIeeeNumber(value, result_builder, SHORTEST_SINGLE); } // Computes a decimal representation with a fixed number of digits after the // decimal point. The last emitted digit is rounded. // // Examples: // ToFixed(3.12, 1) -> "3.1" // ToFixed(3.1415, 3) -> "3.142" // ToFixed(1234.56789, 4) -> "1234.5679" // ToFixed(1.23, 5) -> "1.23000" // ToFixed(0.1, 4) -> "0.1000" // ToFixed(1e30, 2) -> "1000000000000000019884624838656.00" // ToFixed(0.1, 30) -> "0.100000000000000005551115123126" // ToFixed(0.1, 17) -> "0.10000000000000001" // // If requested_digits equals 0, then the tail of the result depends on // the EMIT_TRAILING_DECIMAL_POINT and EMIT_TRAILING_ZERO_AFTER_POINT. // Examples, for requested_digits == 0, // let EMIT_TRAILING_DECIMAL_POINT and EMIT_TRAILING_ZERO_AFTER_POINT be // - false and false: then 123.45 -> 123 // 0.678 -> 1 // - true and false: then 123.45 -> 123. // 0.678 -> 1. // - true and true: then 123.45 -> 123.0 // 0.678 -> 1.0 // // Returns true if the conversion succeeds. The conversion always succeeds // except for the following cases: // - the input value is special and no infinity_symbol or nan_symbol has // been provided to the constructor, // - 'value' > 10^kMaxFixedDigitsBeforePoint, or // - 'requested_digits' > kMaxFixedDigitsAfterPoint. // The last two conditions imply that the result will never contain more than // 1 + kMaxFixedDigitsBeforePoint + 1 + kMaxFixedDigitsAfterPoint characters // (one additional character for the sign, and one for the decimal point). bool ToFixed(double value, int requested_digits, StringBuilder* result_builder) const; // Computes a representation in exponential format with requested_digits // after the decimal point. The last emitted digit is rounded. 
// If requested_digits equals -1, then the shortest exponential representation // is computed. // // Examples with EMIT_POSITIVE_EXPONENT_SIGN deactivated, and // exponent_character set to 'e'. // ToExponential(3.12, 1) -> "3.1e0" // ToExponential(5.0, 3) -> "5.000e0" // ToExponential(0.001, 2) -> "1.00e-3" // ToExponential(3.1415, -1) -> "3.1415e0" // ToExponential(3.1415, 4) -> "3.1415e0" // ToExponential(3.1415, 3) -> "3.142e0" // ToExponential(123456789000000, 3) -> "1.235e14" // ToExponential(1000000000000000019884624838656.0, -1) -> "1e30" // ToExponential(1000000000000000019884624838656.0, 32) -> // "1.00000000000000001988462483865600e30" // ToExponential(1234, 0) -> "1e3" // // Returns true if the conversion succeeds. The conversion always succeeds // except for the following cases: // - the input value is special and no infinity_symbol or nan_symbol has // been provided to the constructor, // - 'requested_digits' > kMaxExponentialDigits. // The last condition implies that the result will never contain more than // kMaxExponentialDigits + 8 characters (the sign, the digit before the // decimal point, the decimal point, the exponent character, the // exponent's sign, and at most 3 exponent digits). bool ToExponential(double value, int requested_digits, StringBuilder* result_builder) const; // Computes 'precision' leading digits of the given 'value' and returns them // either in exponential or decimal format, depending on // max_{leading|trailing}_padding_zeroes_in_precision_mode (given to the // constructor). // The last computed digit is rounded. // // Example with max_leading_padding_zeroes_in_precision_mode = 6. // ToPrecision(0.0000012345, 2) -> "0.0000012" // ToPrecision(0.00000012345, 2) -> "1.2e-7" // Similarly the converter may add up to // max_trailing_padding_zeroes_in_precision_mode in precision mode to avoid // returning an exponential representation. A zero added by the // EMIT_TRAILING_ZERO_AFTER_POINT flag is counted for this limit. // Examples for max_trailing_padding_zeroes_in_precision_mode = 1: // ToPrecision(230.0, 2) -> "230" // ToPrecision(230.0, 2) -> "230." with EMIT_TRAILING_DECIMAL_POINT. // ToPrecision(230.0, 2) -> "2.3e2" with EMIT_TRAILING_ZERO_AFTER_POINT. // Examples for max_trailing_padding_zeroes_in_precision_mode = 3, and no // EMIT_TRAILING_ZERO_AFTER_POINT: // ToPrecision(123450.0, 6) -> "123450" // ToPrecision(123450.0, 5) -> "123450" // ToPrecision(123450.0, 4) -> "123500" // ToPrecision(123450.0, 3) -> "123000" // ToPrecision(123450.0, 2) -> "1.2e5" // // Returns true if the conversion succeeds. The conversion always succeeds // except for the following cases: // - the input value is special and no infinity_symbol or nan_symbol has // been provided to the constructor, // - precision < kMinPrecisionDigits // - precision > kMaxPrecisionDigits // The last condition implies that the result will never contain more than // kMaxPrecisionDigits + 7 characters (the sign, the decimal point, the // exponent character, the exponent's sign, and at most 3 exponent digits). bool ToPrecision(double value, int precision, StringBuilder* result_builder) const; enum DtoaMode { // Produce the shortest correct representation. // For example the output of 0.299999999999999988897 is (the less accurate // but correct) 0.3. SHORTEST, // Same as SHORTEST, but for single-precision floats. SHORTEST_SINGLE, // Produce a fixed number of digits after the decimal point. // For instance fixed(0.1, 4) becomes 0.1000 // If the input number is big, the output will be big.
FIXED, // Fixed number of digits (independent of the decimal point). PRECISION }; // The maximal number of digits that are needed to emit a double in base 10. // A higher precision can be achieved by using more digits, but the shortest // accurate representation of any double will never use more digits than // kBase10MaximalLength. // Note that DoubleToAscii null-terminates its input. So the given buffer // should be at least kBase10MaximalLength + 1 characters long. static const int kBase10MaximalLength = 17; // Converts the given double 'v' to ascii. 'v' must not be NaN, +Infinity, or // -Infinity. In SHORTEST_SINGLE-mode this restriction also applies to 'v' // after it has been casted to a single-precision float. That is, in this // mode static_cast(v) must not be NaN, +Infinity or -Infinity. // // The result should be interpreted as buffer * 10^(point-length). // // The output depends on the given mode: // - SHORTEST: produce the least amount of digits for which the internal // identity requirement is still satisfied. If the digits are printed // (together with the correct exponent) then reading this number will give // 'v' again. The buffer will choose the representation that is closest to // 'v'. If there are two at the same distance, than the one farther away // from 0 is chosen (halfway cases - ending with 5 - are rounded up). // In this mode the 'requested_digits' parameter is ignored. // - SHORTEST_SINGLE: same as SHORTEST but with single-precision. // - FIXED: produces digits necessary to print a given number with // 'requested_digits' digits after the decimal point. The produced digits // might be too short in which case the caller has to fill the remainder // with '0's. // Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2. // Halfway cases are rounded towards +/-Infinity (away from 0). The call // toFixed(0.15, 2) thus returns buffer="2", point=0. // The returned buffer may contain digits that would be truncated from the // shortest representation of the input. // - PRECISION: produces 'requested_digits' where the first digit is not '0'. // Even though the length of produced digits usually equals // 'requested_digits', the function is allowed to return fewer digits, in // which case the caller has to fill the missing digits with '0's. // Halfway cases are again rounded away from 0. // DoubleToAscii expects the given buffer to be big enough to hold all // digits and a terminating null-character. In SHORTEST-mode it expects a // buffer of at least kBase10MaximalLength + 1. In all other modes the // requested_digits parameter and the padding-zeroes limit the size of the // output. Don't forget the decimal point, the exponent character and the // terminating null-character when computing the maximal output size. // The given length is only used in debug mode to ensure the buffer is big // enough. static void DoubleToAscii(double v, DtoaMode mode, int requested_digits, char* buffer, int buffer_length, bool* sign, int* length, int* point); private: // Implementation for ToShortest and ToShortestSingle. bool ToShortestIeeeNumber(double value, StringBuilder* result_builder, DtoaMode mode) const; // If the value is a special value (NaN or Infinity) constructs the // corresponding string using the configured infinity/nan-symbol. // If either of them is NULL or the value is not special then the // function returns false. bool HandleSpecialValues(double value, StringBuilder* result_builder) const; // Constructs an exponential representation (i.e. 1.234e56). 
// The given exponent assumes a decimal point after the first decimal digit. void CreateExponentialRepresentation(const char* decimal_digits, int length, int exponent, StringBuilder* result_builder) const; // Creates a decimal representation (i.e 1234.5678). void CreateDecimalRepresentation(const char* decimal_digits, int length, int decimal_point, int digits_after_point, StringBuilder* result_builder) const; const int flags_; const char* const infinity_symbol_; const char* const nan_symbol_; const char exponent_character_; const int decimal_in_shortest_low_; const int decimal_in_shortest_high_; const int max_leading_padding_zeroes_in_precision_mode_; const int max_trailing_padding_zeroes_in_precision_mode_; DISALLOW_IMPLICIT_CONSTRUCTORS(DoubleToStringConverter); }; class StringToDoubleConverter { public: // Enumeration for allowing octals and ignoring junk when converting // strings to numbers. enum Flags { NO_FLAGS = 0, ALLOW_HEX = 1, ALLOW_OCTALS = 2, ALLOW_TRAILING_JUNK = 4, ALLOW_LEADING_SPACES = 8, ALLOW_TRAILING_SPACES = 16, ALLOW_SPACES_AFTER_SIGN = 32 }; // Flags should be a bit-or combination of the possible Flags-enum. // - NO_FLAGS: no special flags. // - ALLOW_HEX: recognizes the prefix "0x". Hex numbers may only be integers. // Ex: StringToDouble("0x1234") -> 4660.0 // In StringToDouble("0x1234.56") the characters ".56" are trailing // junk. The result of the call is hence dependent on // the ALLOW_TRAILING_JUNK flag and/or the junk value. // With this flag "0x" is a junk-string. Even with ALLOW_TRAILING_JUNK, // the string will not be parsed as "0" followed by junk. // // - ALLOW_OCTALS: recognizes the prefix "0" for octals: // If a sequence of octal digits starts with '0', then the number is // read as octal integer. Octal numbers may only be integers. // Ex: StringToDouble("01234") -> 668.0 // StringToDouble("012349") -> 12349.0 // Not a sequence of octal // // digits. // In StringToDouble("01234.56") the characters ".56" are trailing // junk. The result of the call is hence dependent on // the ALLOW_TRAILING_JUNK flag and/or the junk value. // In StringToDouble("01234e56") the characters "e56" are trailing // junk, too. // - ALLOW_TRAILING_JUNK: ignore trailing characters that are not part of // a double literal. // - ALLOW_LEADING_SPACES: skip over leading spaces. // - ALLOW_TRAILING_SPACES: ignore trailing spaces. // - ALLOW_SPACES_AFTER_SIGN: ignore spaces after the sign. // Ex: StringToDouble("- 123.2") -> -123.2. // StringToDouble("+ 123.2") -> 123.2 // // empty_string_value is returned when an empty string is given as input. // If ALLOW_LEADING_SPACES or ALLOW_TRAILING_SPACES are set, then a string // containing only spaces is converted to the 'empty_string_value', too. // // junk_string_value is returned when // a) ALLOW_TRAILING_JUNK is not set, and a junk character (a character not // part of a double-literal) is found. // b) ALLOW_TRAILING_JUNK is set, but the string does not start with a // double literal. // // infinity_symbol and nan_symbol are strings that are used to detect // inputs that represent infinity and NaN. They can be null, in which case // they are ignored. // The conversion routine first reads any possible signs. Then it compares the // following character of the input-string with the first character of // the infinity, and nan-symbol. If either matches, the function assumes, that // a match has been found, and expects the following input characters to match // the remaining characters of the special-value symbol. 
// This means that the following restrictions apply to special-value symbols: // - they must not start with signs ('+', or '-'), // - they must not have the same first character. // - they must not start with digits. // // Examples: // flags = ALLOW_HEX | ALLOW_TRAILING_JUNK, // empty_string_value = 0.0, // junk_string_value = NaN, // infinity_symbol = "infinity", // nan_symbol = "nan": // StringToDouble("0x1234") -> 4660.0. // StringToDouble("0x1234K") -> 4660.0. // StringToDouble("") -> 0.0 // empty_string_value. // StringToDouble(" ") -> NaN // junk_string_value. // StringToDouble(" 1") -> NaN // junk_string_value. // StringToDouble("0x") -> NaN // junk_string_value. // StringToDouble("-123.45") -> -123.45. // StringToDouble("--123.45") -> NaN // junk_string_value. // StringToDouble("123e45") -> 123e45. // StringToDouble("123E45") -> 123e45. // StringToDouble("123e+45") -> 123e45. // StringToDouble("123E-45") -> 123e-45. // StringToDouble("123e") -> 123.0 // trailing junk ignored. // StringToDouble("123e-") -> 123.0 // trailing junk ignored. // StringToDouble("+NaN") -> NaN // NaN string literal. // StringToDouble("-infinity") -> -inf. // infinity literal. // StringToDouble("Infinity") -> NaN // junk_string_value. // // flags = ALLOW_OCTAL | ALLOW_LEADING_SPACES, // empty_string_value = 0.0, // junk_string_value = NaN, // infinity_symbol = NULL, // nan_symbol = NULL: // StringToDouble("0x1234") -> NaN // junk_string_value. // StringToDouble("01234") -> 668.0. // StringToDouble("") -> 0.0 // empty_string_value. // StringToDouble(" ") -> 0.0 // empty_string_value. // StringToDouble(" 1") -> 1.0 // StringToDouble("0x") -> NaN // junk_string_value. // StringToDouble("0123e45") -> NaN // junk_string_value. // StringToDouble("01239E45") -> 1239e45. // StringToDouble("-infinity") -> NaN // junk_string_value. // StringToDouble("NaN") -> NaN // junk_string_value. StringToDoubleConverter(int flags, double empty_string_value, double junk_string_value, const char* infinity_symbol, const char* nan_symbol) : flags_(flags), empty_string_value_(empty_string_value), junk_string_value_(junk_string_value), infinity_symbol_(infinity_symbol), nan_symbol_(nan_symbol) { } // Performs the conversion. // The output parameter 'processed_characters_count' is set to the number // of characters that have been processed to read the number. // Spaces than are processed with ALLOW_{LEADING|TRAILING}_SPACES are included // in the 'processed_characters_count'. Trailing junk is never included. double StringToDouble(const char* buffer, int length, int* processed_characters_count) const { return StringToIeee(buffer, length, processed_characters_count, true); } // Same as StringToDouble but reads a float. // Note that this is not equivalent to static_cast(StringToDouble(...)) // due to potential double-rounding. 
float StringToFloat(const char* buffer, int length, int* processed_characters_count) const { return static_cast(StringToIeee(buffer, length, processed_characters_count, false)); } private: const int flags_; const double empty_string_value_; const double junk_string_value_; const char* const infinity_symbol_; const char* const nan_symbol_; double StringToIeee(const char* buffer, int length, int* processed_characters_count, bool read_as_double) const; DISALLOW_IMPLICIT_CONSTRUCTORS(StringToDoubleConverter); }; } // namespace double_conversion #endif // DOUBLE_CONVERSION_DOUBLE_CONVERSION_H_ ================================================ FILE: src/kenlm/util/double-conversion/fast-dtoa.cc ================================================ // Copyright 2012 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "fast-dtoa.h" #include "cached-powers.h" #include "diy-fp.h" #include "ieee.h" namespace double_conversion { // The minimal and maximal target exponent define the range of w's binary // exponent, where 'w' is the result of multiplying the input by a cached power // of ten. // // A different range might be chosen on a different platform, to optimize digit // generation, but a smaller range requires more powers of ten to be cached. static const int kMinimalTargetExponent = -60; static const int kMaximalTargetExponent = -32; // Adjusts the last digit of the generated number, and screens out generated // solutions that may be inaccurate. A solution may be inaccurate if it is // outside the safe interval, or if we cannot prove that it is closer to the // input than a neighboring representation of the same length. 
// // Input: * buffer containing the digits of too_high / 10^kappa // * the buffer's length // * distance_too_high_w == (too_high - w).f() * unit // * unsafe_interval == (too_high - too_low).f() * unit // * rest = (too_high - buffer * 10^kappa).f() * unit // * ten_kappa = 10^kappa * unit // * unit = the common multiplier // Output: returns true if the buffer is guaranteed to contain the closest // representable number to the input. // Modifies the generated digits in the buffer to approach (round towards) w. static bool RoundWeed(Vector buffer, int length, uint64_t distance_too_high_w, uint64_t unsafe_interval, uint64_t rest, uint64_t ten_kappa, uint64_t unit) { uint64_t small_distance = distance_too_high_w - unit; uint64_t big_distance = distance_too_high_w + unit; // Let w_low = too_high - big_distance, and // w_high = too_high - small_distance. // Note: w_low < w < w_high // // The real w (* unit) must lie somewhere inside the interval // ]w_low; w_high[ (often written as "(w_low; w_high)") // Basically the buffer currently contains a number in the unsafe interval // ]too_low; too_high[ with too_low < w < too_high // // too_high - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // ^v 1 unit ^ ^ ^ ^ // boundary_high --------------------- . . . . // ^v 1 unit . . . . // - - - - - - - - - - - - - - - - - - - + - - + - - - - - - . . // . . ^ . . // . big_distance . . . // . . . . rest // small_distance . . . . // v . . . . // w_high - - - - - - - - - - - - - - - - - - . . . . // ^v 1 unit . . . . // w ---------------------------------------- . . . . // ^v 1 unit v . . . // w_low - - - - - - - - - - - - - - - - - - - - - . . . // . . v // buffer --------------------------------------------------+-------+-------- // . . // safe_interval . // v . // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - . // ^v 1 unit . // boundary_low ------------------------- unsafe_interval // ^v 1 unit v // too_low - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // // // Note that the value of buffer could lie anywhere inside the range too_low // to too_high. // // boundary_low, boundary_high and w are approximations of the real boundaries // and v (the input number). They are guaranteed to be precise up to one unit. // In fact the error is guaranteed to be strictly less than one unit. // // Anything that lies outside the unsafe interval is guaranteed not to round // to v when read again. // Anything that lies inside the safe interval is guaranteed to round to v // when read again. // If the number inside the buffer lies inside the unsafe interval but not // inside the safe interval then we simply do not know and bail out (returning // false). // // Similarly we have to take into account the imprecision of 'w' when finding // the closest representation of 'w'. If we have two potential // representations, and one is closer to both w_low and w_high, then we know // it is closer to the actual value v. // // By generating the digits of too_high we got the largest (closest to // too_high) buffer that is still in the unsafe interval. In the case where // w_high < buffer < too_high we try to decrement the buffer. // This way the buffer approaches (rounds towards) w. // There are 3 conditions that stop the decrementation process: // 1) the buffer is already below w_high // 2) decrementing the buffer would make it leave the unsafe interval // 3) decrementing the buffer would yield a number below w_high and farther // away than the current number. 
In other words: // (buffer{-1} < w_high) && w_high - buffer{-1} > buffer - w_high // Instead of using the buffer directly we use its distance to too_high. // Conceptually rest ~= too_high - buffer // We need to do the following tests in this order to avoid over- and // underflows. ASSERT(rest <= unsafe_interval); while (rest < small_distance && // Negated condition 1 unsafe_interval - rest >= ten_kappa && // Negated condition 2 (rest + ten_kappa < small_distance || // buffer{-1} > w_high small_distance - rest >= rest + ten_kappa - small_distance)) { buffer[length - 1]--; rest += ten_kappa; } // We have approached w+ as much as possible. We now test if approaching w- // would require changing the buffer. If yes, then we have two possible // representations close to w, but we cannot decide which one is closer. if (rest < big_distance && unsafe_interval - rest >= ten_kappa && (rest + ten_kappa < big_distance || big_distance - rest > rest + ten_kappa - big_distance)) { return false; } // Weeding test. // The safe interval is [too_low + 2 ulp; too_high - 2 ulp] // Since too_low = too_high - unsafe_interval this is equivalent to // [too_high - unsafe_interval + 4 ulp; too_high - 2 ulp] // Conceptually we have: rest ~= too_high - buffer return (2 * unit <= rest) && (rest <= unsafe_interval - 4 * unit); } // Rounds the buffer upwards if the result is closer to v by possibly adding // 1 to the buffer. If the precision of the calculation is not sufficient to // round correctly, return false. // The rounding might shift the whole buffer in which case the kappa is // adjusted. For example "99", kappa = 3 might become "10", kappa = 4. // // If 2*rest > ten_kappa then the buffer needs to be round up. // rest can have an error of +/- 1 unit. This function accounts for the // imprecision and returns false, if the rounding direction cannot be // unambiguously determined. // // Precondition: rest < ten_kappa. static bool RoundWeedCounted(Vector buffer, int length, uint64_t rest, uint64_t ten_kappa, uint64_t unit, int* kappa) { ASSERT(rest < ten_kappa); // The following tests are done in a specific order to avoid overflows. They // will work correctly with any uint64 values of rest < ten_kappa and unit. // // If the unit is too big, then we don't know which way to round. For example // a unit of 50 means that the real number lies within rest +/- 50. If // 10^kappa == 40 then there is no way to tell which way to round. if (unit >= ten_kappa) return false; // Even if unit is just half the size of 10^kappa we are already completely // lost. (And after the previous test we know that the expression will not // over/underflow.) if (ten_kappa - unit <= unit) return false; // If 2 * (rest + unit) <= 10^kappa we can safely round down. if ((ten_kappa - rest > rest) && (ten_kappa - 2 * rest >= 2 * unit)) { return true; } // If 2 * (rest - unit) >= 10^kappa, then we can safely round up. if ((rest > unit) && (ten_kappa - (rest - unit) <= (rest - unit))) { // Increment the last digit recursively until we find a non '9' digit. buffer[length - 1]++; for (int i = length - 1; i > 0; --i) { if (buffer[i] != '0' + 10) break; buffer[i] = '0'; buffer[i - 1]++; } // If the first digit is now '0'+ 10 we had a buffer with all '9's. With the // exception of the first digit all digits are now '0'. Simply switch the // first digit to '1' and adjust the kappa. Example: "99" becomes "10" and // the power (the kappa) is increased. 
if (buffer[0] == '0' + 10) { buffer[0] = '1'; (*kappa) += 1; } return true; } return false; } // Returns the biggest power of ten that is less than or equal to the given // number. We furthermore receive the maximum number of bits 'number' has. // // Returns power == 10^(exponent_plus_one-1) such that // power <= number < power * 10. // If number_bits == 0 then 0^(0-1) is returned. // The number of bits must be <= 32. // Precondition: number < (1 << (number_bits + 1)). // Inspired by the method for finding an integer log base 10 from here: // http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10 static unsigned int const kSmallPowersOfTen[] = {0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; static void BiggestPowerTen(uint32_t number, int number_bits, uint32_t* power, int* exponent_plus_one) { ASSERT(number < (1u << (number_bits + 1))); // 1233/4096 is approximately 1/lg(10). int exponent_plus_one_guess = ((number_bits + 1) * 1233 >> 12); // We increment to skip over the first entry in the kPowersOf10 table. // Note: kPowersOf10[i] == 10^(i-1). exponent_plus_one_guess++; // We don't have any guarantees that 2^number_bits <= number. // TODO(floitsch): can we change the 'while' into an 'if'? We definitely see // number < (2^number_bits - 1), but I haven't encountered // number < (2^number_bits - 2) yet. while (number < kSmallPowersOfTen[exponent_plus_one_guess]) { exponent_plus_one_guess--; } *power = kSmallPowersOfTen[exponent_plus_one_guess]; *exponent_plus_one = exponent_plus_one_guess; } // Generates the digits of input number w. // w is a floating-point number (DiyFp), consisting of a significand and an // exponent. Its exponent is bounded by kMinimalTargetExponent and // kMaximalTargetExponent. // Hence -60 <= w.e() <= -32. // // Returns false if it fails, in which case the generated digits in the buffer // should not be used. // Preconditions: // * low, w and high are correct up to 1 ulp (unit in the last place). That // is, their error must be less than a unit of their last digits. // * low.e() == w.e() == high.e() // * low < w < high, and taking into account their error: low~ <= high~ // * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent // Postconditions: returns false if procedure fails. // otherwise: // * buffer is not null-terminated, but len contains the number of digits. // * buffer contains the shortest possible decimal digit-sequence // such that LOW < buffer * 10^kappa < HIGH, where LOW and HIGH are the // correct values of low and high (without their error). // * if more than one decimal representation gives the minimal number of // decimal digits then the one closest to W (where W is the correct value // of w) is chosen. // Remark: this procedure takes into account the imprecision of its input // numbers. If the precision is not enough to guarantee all the postconditions // then false is returned. This usually happens rarely (~0.5%). // // Say, for the sake of example, that // w.e() == -48, and w.f() == 0x1234567890abcdef // w's value can be computed by w.f() * 2^w.e() // We can obtain w's integral digits by simply shifting w.f() by -w.e(). // -> w's integral part is 0x1234 // w's fractional part is therefore 0x567890abcdef. // Printing w's integral part is easy (simply print 0x1234 in decimal). // In order to print its fraction we repeatedly multiply the fraction by 10 and // get each digit. Example the first digit after the point would be computed by // (0x567890abcdef * 10) >> 48. 
-> 3 // The whole thing becomes slightly more complicated because we want to stop // once we have enough digits. That is, once the digits inside the buffer // represent 'w' we can stop. Everything inside the interval low - high // represents w. However we have to pay attention to low, high and w's // imprecision. static bool DigitGen(DiyFp low, DiyFp w, DiyFp high, Vector buffer, int* length, int* kappa) { ASSERT(low.e() == w.e() && w.e() == high.e()); ASSERT(low.f() + 1 <= high.f() - 1); ASSERT(kMinimalTargetExponent <= w.e() && w.e() <= kMaximalTargetExponent); // low, w and high are imprecise, but by less than one ulp (unit in the last // place). // If we remove (resp. add) 1 ulp from low (resp. high) we are certain that // the new numbers are outside of the interval we want the final // representation to lie in. // Inversely adding (resp. removing) 1 ulp from low (resp. high) would yield // numbers that are certain to lie in the interval. We will use this fact // later on. // We will now start by generating the digits within the uncertain // interval. Later we will weed out representations that lie outside the safe // interval and thus _might_ lie outside the correct interval. uint64_t unit = 1; DiyFp too_low = DiyFp(low.f() - unit, low.e()); DiyFp too_high = DiyFp(high.f() + unit, high.e()); // too_low and too_high are guaranteed to lie outside the interval we want the // generated number in. DiyFp unsafe_interval = DiyFp::Minus(too_high, too_low); // We now cut the input number into two parts: the integral digits and the // fractionals. We will not write any decimal separator though, but adapt // kappa instead. // Reminder: we are currently computing the digits (stored inside the buffer) // such that: too_low < buffer * 10^kappa < too_high // We use too_high for the digit_generation and stop as soon as possible. // If we stop early we effectively round down. DiyFp one = DiyFp(static_cast(1) << -w.e(), w.e()); // Division by one is a shift. uint32_t integrals = static_cast(too_high.f() >> -one.e()); // Modulo by one is an and. uint64_t fractionals = too_high.f() & (one.f() - 1); uint32_t divisor; int divisor_exponent_plus_one; BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), &divisor, &divisor_exponent_plus_one); *kappa = divisor_exponent_plus_one; *length = 0; // Loop invariant: buffer = too_high / 10^kappa (integer division) // The invariant holds for the first iteration: kappa has been initialized // with the divisor exponent + 1. And the divisor is the biggest power of ten // that is smaller than integrals. while (*kappa > 0) { int digit = integrals / divisor; buffer[*length] = '0' + digit; (*length)++; integrals %= divisor; (*kappa)--; // Note that kappa now equals the exponent of the divisor and that the // invariant thus holds again. uint64_t rest = (static_cast(integrals) << -one.e()) + fractionals; // Invariant: too_high = buffer * 10^kappa + DiyFp(rest, one.e()) // Reminder: unsafe_interval.e() == one.e() if (rest < unsafe_interval.f()) { // Rounding down (by not emitting the remaining digits) yields a number // that lies within the unsafe interval. return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f(), unsafe_interval.f(), rest, static_cast(divisor) << -one.e(), unit); } divisor /= 10; } // The integrals have been generated. We are at the point of the decimal // separator. In the following loop we simply multiply the remaining digits by // 10 and divide by one. 
We just need to pay attention to multiply associated // data (like the interval or 'unit'), too. // Note that the multiplication by 10 does not overflow, because w.e >= -60 // and thus one.e >= -60. ASSERT(one.e() >= -60); ASSERT(fractionals < one.f()); ASSERT(UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF) / 10 >= one.f()); while (true) { fractionals *= 10; unit *= 10; unsafe_interval.set_f(unsafe_interval.f() * 10); // Integer division by one. int digit = static_cast(fractionals >> -one.e()); buffer[*length] = '0' + digit; (*length)++; fractionals &= one.f() - 1; // Modulo by one. (*kappa)--; if (fractionals < unsafe_interval.f()) { return RoundWeed(buffer, *length, DiyFp::Minus(too_high, w).f() * unit, unsafe_interval.f(), fractionals, one.f(), unit); } } } // Generates (at most) requested_digits digits of input number w. // w is a floating-point number (DiyFp), consisting of a significand and an // exponent. Its exponent is bounded by kMinimalTargetExponent and // kMaximalTargetExponent. // Hence -60 <= w.e() <= -32. // // Returns false if it fails, in which case the generated digits in the buffer // should not be used. // Preconditions: // * w is correct up to 1 ulp (unit in the last place). That // is, its error must be strictly less than a unit of its last digit. // * kMinimalTargetExponent <= w.e() <= kMaximalTargetExponent // // Postconditions: returns false if procedure fails. // otherwise: // * buffer is not null-terminated, but length contains the number of // digits. // * the representation in buffer is the most precise representation of // requested_digits digits. // * buffer contains at most requested_digits digits of w. If there are less // than requested_digits digits then some trailing '0's have been removed. // * kappa is such that // w = buffer * 10^kappa + eps with |eps| < 10^kappa / 2. // // Remark: This procedure takes into account the imprecision of its input // numbers. If the precision is not enough to guarantee all the postconditions // then false is returned. This usually happens rarely, but the failure-rate // increases with higher requested_digits. static bool DigitGenCounted(DiyFp w, int requested_digits, Vector buffer, int* length, int* kappa) { ASSERT(kMinimalTargetExponent <= w.e() && w.e() <= kMaximalTargetExponent); ASSERT(kMinimalTargetExponent >= -60); ASSERT(kMaximalTargetExponent <= -32); // w is assumed to have an error less than 1 unit. Whenever w is scaled we // also scale its error. uint64_t w_error = 1; // We cut the input number into two parts: the integral digits and the // fractional digits. We don't emit any decimal separator, but adapt kappa // instead. Example: instead of writing "1.2" we put "12" into the buffer and // increase kappa by 1. DiyFp one = DiyFp(static_cast(1) << -w.e(), w.e()); // Division by one is a shift. uint32_t integrals = static_cast(w.f() >> -one.e()); // Modulo by one is an and. uint64_t fractionals = w.f() & (one.f() - 1); uint32_t divisor; int divisor_exponent_plus_one; BiggestPowerTen(integrals, DiyFp::kSignificandSize - (-one.e()), &divisor, &divisor_exponent_plus_one); *kappa = divisor_exponent_plus_one; *length = 0; // Loop invariant: buffer = w / 10^kappa (integer division) // The invariant holds for the first iteration: kappa has been initialized // with the divisor exponent + 1. And the divisor is the biggest power of ten // that is smaller than 'integrals'. 
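// Illustrative example (not part of the original source): if integrals == 4660
// at this point, BiggestPowerTen yields divisor == 1000 and kappa starts at 4,
// so the first iteration below emits the digit 4660 / 1000 == 4.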
while (*kappa > 0) { int digit = integrals / divisor; buffer[*length] = '0' + digit; (*length)++; requested_digits--; integrals %= divisor; (*kappa)--; // Note that kappa now equals the exponent of the divisor and that the // invariant thus holds again. if (requested_digits == 0) break; divisor /= 10; } if (requested_digits == 0) { uint64_t rest = (static_cast(integrals) << -one.e()) + fractionals; return RoundWeedCounted(buffer, *length, rest, static_cast(divisor) << -one.e(), w_error, kappa); } // The integrals have been generated. We are at the point of the decimal // separator. In the following loop we simply multiply the remaining digits by // 10 and divide by one. We just need to pay attention to multiply associated // data (the 'unit'), too. // Note that the multiplication by 10 does not overflow, because w.e >= -60 // and thus one.e >= -60. ASSERT(one.e() >= -60); ASSERT(fractionals < one.f()); ASSERT(UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF) / 10 >= one.f()); while (requested_digits > 0 && fractionals > w_error) { fractionals *= 10; w_error *= 10; // Integer division by one. int digit = static_cast(fractionals >> -one.e()); buffer[*length] = '0' + digit; (*length)++; requested_digits--; fractionals &= one.f() - 1; // Modulo by one. (*kappa)--; } if (requested_digits != 0) return false; return RoundWeedCounted(buffer, *length, fractionals, one.f(), w_error, kappa); } // Provides a decimal representation of v. // Returns true if it succeeds, otherwise the result cannot be trusted. // There will be *length digits inside the buffer (not null-terminated). // If the function returns true then // v == (double) (buffer * 10^decimal_exponent). // The digits in the buffer are the shortest representation possible: no // 0.09999999999999999 instead of 0.1. The shorter representation will even be // chosen even if the longer one would be closer to v. // The last digit will be closest to the actual v. That is, even if several // digits might correctly yield 'v' when read again, the closest will be // computed. static bool Grisu3(double v, FastDtoaMode mode, Vector buffer, int* length, int* decimal_exponent) { DiyFp w = Double(v).AsNormalizedDiyFp(); // boundary_minus and boundary_plus are the boundaries between v and its // closest floating-point neighbors. Any number strictly between // boundary_minus and boundary_plus will round to v when convert to a double. // Grisu3 will never output representations that lie exactly on a boundary. DiyFp boundary_minus, boundary_plus; if (mode == FAST_DTOA_SHORTEST) { Double(v).NormalizedBoundaries(&boundary_minus, &boundary_plus); } else { ASSERT(mode == FAST_DTOA_SHORTEST_SINGLE); float single_v = static_cast(v); Single(single_v).NormalizedBoundaries(&boundary_minus, &boundary_plus); } ASSERT(boundary_plus.e() == w.e()); DiyFp ten_mk; // Cached power of ten: 10^-k int mk; // -k int ten_mk_minimal_binary_exponent = kMinimalTargetExponent - (w.e() + DiyFp::kSignificandSize); int ten_mk_maximal_binary_exponent = kMaximalTargetExponent - (w.e() + DiyFp::kSignificandSize); PowersOfTenCache::GetCachedPowerForBinaryExponentRange( ten_mk_minimal_binary_exponent, ten_mk_maximal_binary_exponent, &ten_mk, &mk); ASSERT((kMinimalTargetExponent <= w.e() + ten_mk.e() + DiyFp::kSignificandSize) && (kMaximalTargetExponent >= w.e() + ten_mk.e() + DiyFp::kSignificandSize)); // Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a // 64 bit significand and ten_mk is thus only precise up to 64 bits. 
// The DiyFp::Times procedure rounds its result, and ten_mk is approximated // too. The variable scaled_w (as well as scaled_boundary_minus/plus) are now // off by a small amount. // In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w. // In other words: let f = scaled_w.f() and e = scaled_w.e(), then // (f-1) * 2^e < w*10^k < (f+1) * 2^e DiyFp scaled_w = DiyFp::Times(w, ten_mk); ASSERT(scaled_w.e() == boundary_plus.e() + ten_mk.e() + DiyFp::kSignificandSize); // In theory it would be possible to avoid some recomputations by computing // the difference between w and boundary_minus/plus (a power of 2) and to // compute scaled_boundary_minus/plus by subtracting/adding from // scaled_w. However the code becomes much less readable and the speed // enhancements are not terriffic. DiyFp scaled_boundary_minus = DiyFp::Times(boundary_minus, ten_mk); DiyFp scaled_boundary_plus = DiyFp::Times(boundary_plus, ten_mk); // DigitGen will generate the digits of scaled_w. Therefore we have // v == (double) (scaled_w * 10^-mk). // Set decimal_exponent == -mk and pass it to DigitGen. If scaled_w is not an // integer than it will be updated. For instance if scaled_w == 1.23 then // the buffer will be filled with "123" und the decimal_exponent will be // decreased by 2. int kappa; bool result = DigitGen(scaled_boundary_minus, scaled_w, scaled_boundary_plus, buffer, length, &kappa); *decimal_exponent = -mk + kappa; return result; } // The "counted" version of grisu3 (see above) only generates requested_digits // number of digits. This version does not generate the shortest representation, // and with enough requested digits 0.1 will at some point print as 0.9999999... // Grisu3 is too imprecise for real halfway cases (1.5 will not work) and // therefore the rounding strategy for halfway cases is irrelevant. static bool Grisu3Counted(double v, int requested_digits, Vector buffer, int* length, int* decimal_exponent) { DiyFp w = Double(v).AsNormalizedDiyFp(); DiyFp ten_mk; // Cached power of ten: 10^-k int mk; // -k int ten_mk_minimal_binary_exponent = kMinimalTargetExponent - (w.e() + DiyFp::kSignificandSize); int ten_mk_maximal_binary_exponent = kMaximalTargetExponent - (w.e() + DiyFp::kSignificandSize); PowersOfTenCache::GetCachedPowerForBinaryExponentRange( ten_mk_minimal_binary_exponent, ten_mk_maximal_binary_exponent, &ten_mk, &mk); ASSERT((kMinimalTargetExponent <= w.e() + ten_mk.e() + DiyFp::kSignificandSize) && (kMaximalTargetExponent >= w.e() + ten_mk.e() + DiyFp::kSignificandSize)); // Note that ten_mk is only an approximation of 10^-k. A DiyFp only contains a // 64 bit significand and ten_mk is thus only precise up to 64 bits. // The DiyFp::Times procedure rounds its result, and ten_mk is approximated // too. The variable scaled_w (as well as scaled_boundary_minus/plus) are now // off by a small amount. // In fact: scaled_w - w*10^k < 1ulp (unit in the last place) of scaled_w. // In other words: let f = scaled_w.f() and e = scaled_w.e(), then // (f-1) * 2^e < w*10^k < (f+1) * 2^e DiyFp scaled_w = DiyFp::Times(w, ten_mk); // We now have (double) (scaled_w * 10^-mk). // DigitGen will generate the first requested_digits digits of scaled_w and // return together with a kappa such that scaled_w ~= buffer * 10^kappa. (It // will not always be exactly the same since DigitGenCounted only produces a // limited number of digits.) 
int kappa; bool result = DigitGenCounted(scaled_w, requested_digits, buffer, length, &kappa); *decimal_exponent = -mk + kappa; return result; } bool FastDtoa(double v, FastDtoaMode mode, int requested_digits, Vector buffer, int* length, int* decimal_point) { ASSERT(v > 0); ASSERT(!Double(v).IsSpecial()); bool result = false; int decimal_exponent = 0; switch (mode) { case FAST_DTOA_SHORTEST: case FAST_DTOA_SHORTEST_SINGLE: result = Grisu3(v, mode, buffer, length, &decimal_exponent); break; case FAST_DTOA_PRECISION: result = Grisu3Counted(v, requested_digits, buffer, length, &decimal_exponent); break; default: UNREACHABLE(); } if (result) { *decimal_point = *length + decimal_exponent; buffer[*length] = '\0'; } return result; } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/fast-dtoa.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_FAST_DTOA_H_ #define DOUBLE_CONVERSION_FAST_DTOA_H_ #include "utils.h" namespace double_conversion { enum FastDtoaMode { // Computes the shortest representation of the given input. The returned // result will be the most accurate number of this length. Longer // representations might be more accurate. FAST_DTOA_SHORTEST, // Same as FAST_DTOA_SHORTEST but for single-precision floats. FAST_DTOA_SHORTEST_SINGLE, // Computes a representation where the precision (number of digits) is // given as input. The precision is independent of the decimal point. FAST_DTOA_PRECISION }; // FastDtoa will produce at most kFastDtoaMaximalLength digits. This does not // include the terminating '\0' character. static const int kFastDtoaMaximalLength = 17; // Same for single-precision numbers. static const int kFastDtoaMaximalSingleLength = 9; // Provides a decimal representation of v. // The result should be interpreted as buffer * 10^(point - length). // // Precondition: // * v must be a strictly positive finite double. 
// // Returns true if it succeeds, otherwise the result can not be trusted. // There will be *length digits inside the buffer followed by a null terminator. // If the function returns true and mode equals // - FAST_DTOA_SHORTEST, then // the parameter requested_digits is ignored. // The result satisfies // v == (double) (buffer * 10^(point - length)). // The digits in the buffer are the shortest representation possible. E.g. // if 0.099999999999 and 0.1 represent the same double then "1" is returned // with point = 0. // The last digit will be closest to the actual v. That is, even if several // digits might correctly yield 'v' when read again, the buffer will contain // the one closest to v. // - FAST_DTOA_PRECISION, then // the buffer contains requested_digits digits. // the difference v - (buffer * 10^(point-length)) is closest to zero for // all possible representations of requested_digits digits. // If there are two values that are equally close, then FastDtoa returns // false. // For both modes the buffer must be large enough to hold the result. bool FastDtoa(double d, FastDtoaMode mode, int requested_digits, Vector buffer, int* length, int* decimal_point); } // namespace double_conversion #endif // DOUBLE_CONVERSION_FAST_DTOA_H_ ================================================ FILE: src/kenlm/util/double-conversion/fixed-dtoa.cc ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "fixed-dtoa.h" #include "ieee.h" namespace double_conversion { // Represents a 128bit type. This class should be replaced by a native type on // platforms that support 128bit integers. 
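// Illustrative example (not part of the original source): the value is stored
// as high_bits_ * 2^64 + low_bits_, so 2^64 + 3 has high_bits_ == 1 and
// low_bits_ == 3, and Multiply(5) turns it into high_bits_ == 5,
// low_bits_ == 15, i.e. 5 * (2^64 + 3).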
class UInt128 { public: UInt128() : high_bits_(0), low_bits_(0) { } UInt128(uint64_t high, uint64_t low) : high_bits_(high), low_bits_(low) { } void Multiply(uint32_t multiplicand) { uint64_t accumulator; accumulator = (low_bits_ & kMask32) * multiplicand; uint32_t part = static_cast(accumulator & kMask32); accumulator >>= 32; accumulator = accumulator + (low_bits_ >> 32) * multiplicand; low_bits_ = (accumulator << 32) + part; accumulator >>= 32; accumulator = accumulator + (high_bits_ & kMask32) * multiplicand; part = static_cast(accumulator & kMask32); accumulator >>= 32; accumulator = accumulator + (high_bits_ >> 32) * multiplicand; high_bits_ = (accumulator << 32) + part; ASSERT((accumulator >> 32) == 0); } void Shift(int shift_amount) { ASSERT(-64 <= shift_amount && shift_amount <= 64); if (shift_amount == 0) { return; } else if (shift_amount == -64) { high_bits_ = low_bits_; low_bits_ = 0; } else if (shift_amount == 64) { low_bits_ = high_bits_; high_bits_ = 0; } else if (shift_amount <= 0) { high_bits_ <<= -shift_amount; high_bits_ += low_bits_ >> (64 + shift_amount); low_bits_ <<= -shift_amount; } else { low_bits_ >>= shift_amount; low_bits_ += high_bits_ << (64 - shift_amount); high_bits_ >>= shift_amount; } } // Modifies *this to *this MOD (2^power). // Returns *this DIV (2^power). int DivModPowerOf2(int power) { if (power >= 64) { int result = static_cast(high_bits_ >> (power - 64)); high_bits_ -= static_cast(result) << (power - 64); return result; } else { uint64_t part_low = low_bits_ >> power; uint64_t part_high = high_bits_ << (64 - power); int result = static_cast(part_low + part_high); high_bits_ = 0; low_bits_ -= part_low << power; return result; } } bool IsZero() const { return high_bits_ == 0 && low_bits_ == 0; } int BitAt(int position) { if (position >= 64) { return static_cast(high_bits_ >> (position - 64)) & 1; } else { return static_cast(low_bits_ >> position) & 1; } } private: static const uint64_t kMask32 = 0xFFFFFFFF; // Value == (high_bits_ << 64) + low_bits_ uint64_t high_bits_; uint64_t low_bits_; }; static const int kDoubleSignificandSize = 53; // Includes the hidden bit. static void FillDigits32FixedLength(uint32_t number, int requested_length, Vector buffer, int* length) { for (int i = requested_length - 1; i >= 0; --i) { buffer[(*length) + i] = '0' + number % 10; number /= 10; } *length += requested_length; } static void FillDigits32(uint32_t number, Vector buffer, int* length) { int number_length = 0; // We fill the digits in reverse order and exchange them afterwards. while (number != 0) { int digit = number % 10; number /= 10; buffer[(*length) + number_length] = '0' + digit; number_length++; } // Exchange the digits. int i = *length; int j = *length + number_length - 1; while (i < j) { char tmp = buffer[i]; buffer[i] = buffer[j]; buffer[j] = tmp; i++; j--; } *length += number_length; } static void FillDigits64FixedLength(uint64_t number, int requested_length, Vector buffer, int* length) { const uint32_t kTen7 = 10000000; // For efficiency cut the number into 3 uint32_t parts, and print those. 
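// Illustrative example (not part of the original source): for
// number == 12345678901234567 the split below gives part0 == 123,
// part1 == 4567890 and part2 == 1234567, so the three fixed-length prints
// reproduce the original 17 digits in order.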
uint32_t part2 = static_cast(number % kTen7); number /= kTen7; uint32_t part1 = static_cast(number % kTen7); uint32_t part0 = static_cast(number / kTen7); FillDigits32FixedLength(part0, 3, buffer, length); FillDigits32FixedLength(part1, 7, buffer, length); FillDigits32FixedLength(part2, 7, buffer, length); } static void FillDigits64(uint64_t number, Vector buffer, int* length) { const uint32_t kTen7 = 10000000; // For efficiency cut the number into 3 uint32_t parts, and print those. uint32_t part2 = static_cast(number % kTen7); number /= kTen7; uint32_t part1 = static_cast(number % kTen7); uint32_t part0 = static_cast(number / kTen7); if (part0 != 0) { FillDigits32(part0, buffer, length); FillDigits32FixedLength(part1, 7, buffer, length); FillDigits32FixedLength(part2, 7, buffer, length); } else if (part1 != 0) { FillDigits32(part1, buffer, length); FillDigits32FixedLength(part2, 7, buffer, length); } else { FillDigits32(part2, buffer, length); } } static void RoundUp(Vector buffer, int* length, int* decimal_point) { // An empty buffer represents 0. if (*length == 0) { buffer[0] = '1'; *decimal_point = 1; *length = 1; return; } // Round the last digit until we either have a digit that was not '9' or until // we reached the first digit. buffer[(*length) - 1]++; for (int i = (*length) - 1; i > 0; --i) { if (buffer[i] != '0' + 10) { return; } buffer[i] = '0'; buffer[i - 1]++; } // If the first digit is now '0' + 10, we would need to set it to '0' and add // a '1' in front. However we reach the first digit only if all following // digits had been '9' before rounding up. Now all trailing digits are '0' and // we simply switch the first digit to '1' and update the decimal-point // (indicating that the point is now one digit to the right). if (buffer[0] == '0' + 10) { buffer[0] = '1'; (*decimal_point)++; } } // The given fractionals number represents a fixed-point number with binary // point at bit (-exponent). // Preconditions: // -128 <= exponent <= 0. // 0 <= fractionals * 2^exponent < 1 // The buffer holds the result. // The function will round its result. During the rounding-process digits not // generated by this function might be updated, and the decimal-point variable // might be updated. If this function generates the digits 99 and the buffer // already contained "199" (thus yielding a buffer of "19999") then a // rounding-up will change the contents of the buffer to "20000". static void FillFractionals(uint64_t fractionals, int exponent, int fractional_count, Vector buffer, int* length, int* decimal_point) { ASSERT(-128 <= exponent && exponent <= 0); // 'fractionals' is a fixed-point number, with binary point at bit // (-exponent). Inside the function the non-converted remainder of fractionals // is a fixed-point number, with binary point at bit 'point'. if (-exponent <= 64) { // One 64 bit number is sufficient. ASSERT(fractionals >> 56 == 0); int point = -exponent; for (int i = 0; i < fractional_count; ++i) { if (fractionals == 0) break; // Instead of multiplying by 10 we multiply by 5 and adjust the point // location. This way the fractionals variable will not overflow. // Invariant at the beginning of the loop: fractionals < 2^point. // Initially we have: point <= 64 and fractionals < 2^56 // After each iteration the point is decremented by one. // Note that 5^3 = 125 < 128 = 2^7. // Therefore three iterations of this loop will not overflow fractionals // (even without the subtraction at the end of the loop body). 
At this // time point will satisfy point <= 61 and therefore fractionals < 2^point // and any further multiplication of fractionals by 5 will not overflow. fractionals *= 5; point--; int digit = static_cast(fractionals >> point); buffer[*length] = '0' + digit; (*length)++; fractionals -= static_cast(digit) << point; } // If the first bit after the point is set we have to round up. if (((fractionals >> (point - 1)) & 1) == 1) { RoundUp(buffer, length, decimal_point); } } else { // We need 128 bits. ASSERT(64 < -exponent && -exponent <= 128); UInt128 fractionals128 = UInt128(fractionals, 0); fractionals128.Shift(-exponent - 64); int point = 128; for (int i = 0; i < fractional_count; ++i) { if (fractionals128.IsZero()) break; // As before: instead of multiplying by 10 we multiply by 5 and adjust the // point location. // This multiplication will not overflow for the same reasons as before. fractionals128.Multiply(5); point--; int digit = fractionals128.DivModPowerOf2(point); buffer[*length] = '0' + digit; (*length)++; } if (fractionals128.BitAt(point - 1) == 1) { RoundUp(buffer, length, decimal_point); } } } // Removes leading and trailing zeros. // If leading zeros are removed then the decimal point position is adjusted. static void TrimZeros(Vector buffer, int* length, int* decimal_point) { while (*length > 0 && buffer[(*length) - 1] == '0') { (*length)--; } int first_non_zero = 0; while (first_non_zero < *length && buffer[first_non_zero] == '0') { first_non_zero++; } if (first_non_zero != 0) { for (int i = first_non_zero; i < *length; ++i) { buffer[i - first_non_zero] = buffer[i]; } *length -= first_non_zero; *decimal_point -= first_non_zero; } } bool FastFixedDtoa(double v, int fractional_count, Vector buffer, int* length, int* decimal_point) { const uint32_t kMaxUInt32 = 0xFFFFFFFF; uint64_t significand = Double(v).Significand(); int exponent = Double(v).Exponent(); // v = significand * 2^exponent (with significand a 53bit integer). // If the exponent is larger than 20 (i.e. we may have a 73bit number) then we // don't know how to compute the representation. 2^73 ~= 9.5*10^21. // If necessary this limit could probably be increased, but we don't need // more. if (exponent > 20) return false; if (fractional_count > 20) return false; *length = 0; // At most kDoubleSignificandSize bits of the significand are non-zero. // Given a 64 bit integer we have 11 0s followed by 53 potentially non-zero // bits: 0..11*..0xxx..53*..xx if (exponent + kDoubleSignificandSize > 64) { // The exponent must be > 11. // // We know that v = significand * 2^exponent. // And the exponent > 11. // We simplify the task by dividing v by 10^17. // The quotient delivers the first digits, and the remainder fits into a 64 // bit number. // Dividing by 10^17 is equivalent to dividing by 5^17*2^17. const uint64_t kFive17 = UINT64_2PART_C(0xB1, A2BC2EC5); // 5^17 uint64_t divisor = kFive17; int divisor_power = 17; uint64_t dividend = significand; uint32_t quotient; uint64_t remainder; // Let v = f * 2^e with f == significand and e == exponent. 
// Then need q (quotient) and r (remainder) as follows: // v = q * 10^17 + r // f * 2^e = q * 10^17 + r // f * 2^e = q * 5^17 * 2^17 + r // If e > 17 then // f * 2^(e-17) = q * 5^17 + r/2^17 // else // f = q * 5^17 * 2^(17-e) + r/2^e if (exponent > divisor_power) { // We only allow exponents of up to 20 and therefore (17 - e) <= 3 dividend <<= exponent - divisor_power; quotient = static_cast(dividend / divisor); remainder = (dividend % divisor) << divisor_power; } else { divisor <<= divisor_power - exponent; quotient = static_cast(dividend / divisor); remainder = (dividend % divisor) << exponent; } FillDigits32(quotient, buffer, length); FillDigits64FixedLength(remainder, divisor_power, buffer, length); *decimal_point = *length; } else if (exponent >= 0) { // 0 <= exponent <= 11 significand <<= exponent; FillDigits64(significand, buffer, length); *decimal_point = *length; } else if (exponent > -kDoubleSignificandSize) { // We have to cut the number. uint64_t integrals = significand >> -exponent; uint64_t fractionals = significand - (integrals << -exponent); if (integrals > kMaxUInt32) { FillDigits64(integrals, buffer, length); } else { FillDigits32(static_cast(integrals), buffer, length); } *decimal_point = *length; FillFractionals(fractionals, exponent, fractional_count, buffer, length, decimal_point); } else if (exponent < -128) { // This configuration (with at most 20 digits) means that all digits must be // 0. ASSERT(fractional_count <= 20); buffer[0] = '\0'; *length = 0; *decimal_point = -fractional_count; } else { *decimal_point = 0; FillFractionals(significand, exponent, fractional_count, buffer, length, decimal_point); } TrimZeros(buffer, length, decimal_point); buffer[*length] = '\0'; if ((*length) == 0) { // The string is empty and the decimal_point thus has no importance. Mimick // Gay's dtoa and and set it to -fractional_count. *decimal_point = -fractional_count; } return true; } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/fixed-dtoa.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_FIXED_DTOA_H_ #define DOUBLE_CONVERSION_FIXED_DTOA_H_ #include "utils.h" namespace double_conversion { // Produces digits necessary to print a given number with // 'fractional_count' digits after the decimal point. // The buffer must be big enough to hold the result plus one terminating null // character. // // The produced digits might be too short in which case the caller has to fill // the gaps with '0's. // Example: FastFixedDtoa(0.001, 5, ...) is allowed to return buffer = "1", and // decimal_point = -2. // Halfway cases are rounded towards +/-Infinity (away from 0). The call // FastFixedDtoa(0.15, 2, ...) thus returns buffer = "2", decimal_point = 0. // The returned buffer may contain digits that would be truncated from the // shortest representation of the input. // // This method only works for some parameters. If it can't handle the input it // returns false. The output is null-terminated when the function succeeds. bool FastFixedDtoa(double v, int fractional_count, Vector buffer, int* length, int* decimal_point); } // namespace double_conversion #endif // DOUBLE_CONVERSION_FIXED_DTOA_H_ ================================================ FILE: src/kenlm/util/double-conversion/ieee.h ================================================ // Copyright 2012 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_DOUBLE_H_ #define DOUBLE_CONVERSION_DOUBLE_H_ #include "diy-fp.h" namespace double_conversion { // We assume that doubles and uint64_t have the same endianness. 
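// Worked example (for illustration only; nothing below depends on it) of the
// bit layout that the Double class in this header decodes: for a normal
// binary64 value with bit pattern b,
//   biased exponent = (b & kExponentMask) >> 52
//   Exponent()      = biased exponent - 1075        (kExponentBias = 0x3FF + 52)
//   Significand()   = (b & kSignificandMask) + 2^52 (hidden bit added back)
// and the value equals Significand() * 2^Exponent(). For instance, 1.0 has
// b = 0x3FF0000000000000, so Significand() = 2^52, Exponent() = -52, and
// 2^52 * 2^-52 = 1.0.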
static uint64_t double_to_uint64(double d) { return BitCast(d); } static double uint64_to_double(uint64_t d64) { return BitCast(d64); } static uint32_t float_to_uint32(float f) { return BitCast(f); } static float uint32_to_float(uint32_t d32) { return BitCast(d32); } // Helper functions for doubles. class Double { public: static const uint64_t kSignMask = UINT64_2PART_C(0x80000000, 00000000); static const uint64_t kExponentMask = UINT64_2PART_C(0x7FF00000, 00000000); static const uint64_t kSignificandMask = UINT64_2PART_C(0x000FFFFF, FFFFFFFF); static const uint64_t kHiddenBit = UINT64_2PART_C(0x00100000, 00000000); static const int kPhysicalSignificandSize = 52; // Excludes the hidden bit. static const int kSignificandSize = 53; Double() : d64_(0) {} explicit Double(double d) : d64_(double_to_uint64(d)) {} explicit Double(uint64_t d64) : d64_(d64) {} explicit Double(DiyFp diy_fp) : d64_(DiyFpToUint64(diy_fp)) {} // The value encoded by this Double must be greater or equal to +0.0. // It must not be special (infinity, or NaN). DiyFp AsDiyFp() const { ASSERT(Sign() > 0); ASSERT(!IsSpecial()); return DiyFp(Significand(), Exponent()); } // The value encoded by this Double must be strictly greater than 0. DiyFp AsNormalizedDiyFp() const { ASSERT(value() > 0.0); uint64_t f = Significand(); int e = Exponent(); // The current double could be a denormal. while ((f & kHiddenBit) == 0) { f <<= 1; e--; } // Do the final shifts in one go. f <<= DiyFp::kSignificandSize - kSignificandSize; e -= DiyFp::kSignificandSize - kSignificandSize; return DiyFp(f, e); } // Returns the double's bit as uint64. uint64_t AsUint64() const { return d64_; } // Returns the next greater double. Returns +infinity on input +infinity. double NextDouble() const { if (d64_ == kInfinity) return Double(kInfinity).value(); if (Sign() < 0 && Significand() == 0) { // -0.0 return 0.0; } if (Sign() < 0) { return Double(d64_ - 1).value(); } else { return Double(d64_ + 1).value(); } } double PreviousDouble() const { if (d64_ == (kInfinity | kSignMask)) return -Double::Infinity(); if (Sign() < 0) { return Double(d64_ + 1).value(); } else { if (Significand() == 0) return -0.0; return Double(d64_ - 1).value(); } } int Exponent() const { if (IsDenormal()) return kDenormalExponent; uint64_t d64 = AsUint64(); int biased_e = static_cast((d64 & kExponentMask) >> kPhysicalSignificandSize); return biased_e - kExponentBias; } uint64_t Significand() const { uint64_t d64 = AsUint64(); uint64_t significand = d64 & kSignificandMask; if (!IsDenormal()) { return significand + kHiddenBit; } else { return significand; } } // Returns true if the double is a denormal. bool IsDenormal() const { uint64_t d64 = AsUint64(); return (d64 & kExponentMask) == 0; } // We consider denormals not to be special. // Hence only Infinity and NaN are special. bool IsSpecial() const { uint64_t d64 = AsUint64(); return (d64 & kExponentMask) == kExponentMask; } bool IsNan() const { uint64_t d64 = AsUint64(); return ((d64 & kExponentMask) == kExponentMask) && ((d64 & kSignificandMask) != 0); } bool IsInfinite() const { uint64_t d64 = AsUint64(); return ((d64 & kExponentMask) == kExponentMask) && ((d64 & kSignificandMask) == 0); } int Sign() const { uint64_t d64 = AsUint64(); return (d64 & kSignMask) == 0? 1: -1; } // Precondition: the value encoded by this Double must be greater or equal // than +0.0. DiyFp UpperBoundary() const { ASSERT(Sign() > 0); return DiyFp(Significand() * 2 + 1, Exponent() - 1); } // Computes the two boundaries of this. 
// The bigger boundary (m_plus) is normalized. The lower boundary has the same // exponent as m_plus. // Precondition: the value encoded by this Double must be greater than 0. void NormalizedBoundaries(DiyFp* out_m_minus, DiyFp* out_m_plus) const { ASSERT(value() > 0.0); DiyFp v = this->AsDiyFp(); DiyFp m_plus = DiyFp::Normalize(DiyFp((v.f() << 1) + 1, v.e() - 1)); DiyFp m_minus; if (LowerBoundaryIsCloser()) { m_minus = DiyFp((v.f() << 2) - 1, v.e() - 2); } else { m_minus = DiyFp((v.f() << 1) - 1, v.e() - 1); } m_minus.set_f(m_minus.f() << (m_minus.e() - m_plus.e())); m_minus.set_e(m_plus.e()); *out_m_plus = m_plus; *out_m_minus = m_minus; } bool LowerBoundaryIsCloser() const { // The boundary is closer if the significand is of the form f == 2^p-1 then // the lower boundary is closer. // Think of v = 1000e10 and v- = 9999e9. // Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but // at a distance of 1e8. // The only exception is for the smallest normal: the largest denormal is // at the same distance as its successor. // Note: denormals have the same exponent as the smallest normals. bool physical_significand_is_zero = ((AsUint64() & kSignificandMask) == 0); return physical_significand_is_zero && (Exponent() != kDenormalExponent); } double value() const { return uint64_to_double(d64_); } // Returns the significand size for a given order of magnitude. // If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude. // This function returns the number of significant binary digits v will have // once it's encoded into a double. In almost all cases this is equal to // kSignificandSize. The only exceptions are denormals. They start with // leading zeroes and their effective significand-size is hence smaller. static int SignificandSizeForOrderOfMagnitude(int order) { if (order >= (kDenormalExponent + kSignificandSize)) { return kSignificandSize; } if (order <= kDenormalExponent) return 0; return order - kDenormalExponent; } static double Infinity() { return Double(kInfinity).value(); } static double NaN() { return Double(kNaN).value(); } private: static const int kExponentBias = 0x3FF + kPhysicalSignificandSize; static const int kDenormalExponent = -kExponentBias + 1; static const int kMaxExponent = 0x7FF - kExponentBias; static const uint64_t kInfinity = UINT64_2PART_C(0x7FF00000, 00000000); static const uint64_t kNaN = UINT64_2PART_C(0x7FF80000, 00000000); const uint64_t d64_; static uint64_t DiyFpToUint64(DiyFp diy_fp) { uint64_t significand = diy_fp.f(); int exponent = diy_fp.e(); while (significand > kHiddenBit + kSignificandMask) { significand >>= 1; exponent++; } if (exponent >= kMaxExponent) { return kInfinity; } if (exponent < kDenormalExponent) { return 0; } while (exponent > kDenormalExponent && (significand & kHiddenBit) == 0) { significand <<= 1; exponent--; } uint64_t biased_exponent; if (exponent == kDenormalExponent && (significand & kHiddenBit) == 0) { biased_exponent = 0; } else { biased_exponent = static_cast(exponent + kExponentBias); } return (significand & kSignificandMask) | (biased_exponent << kPhysicalSignificandSize); } }; class Single { public: static const uint32_t kSignMask = 0x80000000; static const uint32_t kExponentMask = 0x7F800000; static const uint32_t kSignificandMask = 0x007FFFFF; static const uint32_t kHiddenBit = 0x00800000; static const int kPhysicalSignificandSize = 23; // Excludes the hidden bit. 
static const int kSignificandSize = 24; Single() : d32_(0) {} explicit Single(float f) : d32_(float_to_uint32(f)) {} explicit Single(uint32_t d32) : d32_(d32) {} // The value encoded by this Single must be greater or equal to +0.0. // It must not be special (infinity, or NaN). DiyFp AsDiyFp() const { ASSERT(Sign() > 0); ASSERT(!IsSpecial()); return DiyFp(Significand(), Exponent()); } // Returns the single's bit as uint64. uint32_t AsUint32() const { return d32_; } int Exponent() const { if (IsDenormal()) return kDenormalExponent; uint32_t d32 = AsUint32(); int biased_e = static_cast((d32 & kExponentMask) >> kPhysicalSignificandSize); return biased_e - kExponentBias; } uint32_t Significand() const { uint32_t d32 = AsUint32(); uint32_t significand = d32 & kSignificandMask; if (!IsDenormal()) { return significand + kHiddenBit; } else { return significand; } } // Returns true if the single is a denormal. bool IsDenormal() const { uint32_t d32 = AsUint32(); return (d32 & kExponentMask) == 0; } // We consider denormals not to be special. // Hence only Infinity and NaN are special. bool IsSpecial() const { uint32_t d32 = AsUint32(); return (d32 & kExponentMask) == kExponentMask; } bool IsNan() const { uint32_t d32 = AsUint32(); return ((d32 & kExponentMask) == kExponentMask) && ((d32 & kSignificandMask) != 0); } bool IsInfinite() const { uint32_t d32 = AsUint32(); return ((d32 & kExponentMask) == kExponentMask) && ((d32 & kSignificandMask) == 0); } int Sign() const { uint32_t d32 = AsUint32(); return (d32 & kSignMask) == 0? 1: -1; } // Computes the two boundaries of this. // The bigger boundary (m_plus) is normalized. The lower boundary has the same // exponent as m_plus. // Precondition: the value encoded by this Single must be greater than 0. void NormalizedBoundaries(DiyFp* out_m_minus, DiyFp* out_m_plus) const { ASSERT(value() > 0.0); DiyFp v = this->AsDiyFp(); DiyFp m_plus = DiyFp::Normalize(DiyFp((v.f() << 1) + 1, v.e() - 1)); DiyFp m_minus; if (LowerBoundaryIsCloser()) { m_minus = DiyFp((v.f() << 2) - 1, v.e() - 2); } else { m_minus = DiyFp((v.f() << 1) - 1, v.e() - 1); } m_minus.set_f(m_minus.f() << (m_minus.e() - m_plus.e())); m_minus.set_e(m_plus.e()); *out_m_plus = m_plus; *out_m_minus = m_minus; } // Precondition: the value encoded by this Single must be greater or equal // than +0.0. DiyFp UpperBoundary() const { ASSERT(Sign() > 0); return DiyFp(Significand() * 2 + 1, Exponent() - 1); } bool LowerBoundaryIsCloser() const { // The boundary is closer if the significand is of the form f == 2^p-1 then // the lower boundary is closer. // Think of v = 1000e10 and v- = 9999e9. // Then the boundary (== (v - v-)/2) is not just at a distance of 1e9 but // at a distance of 1e8. // The only exception is for the smallest normal: the largest denormal is // at the same distance as its successor. // Note: denormals have the same exponent as the smallest normals. 
bool physical_significand_is_zero = ((AsUint32() & kSignificandMask) == 0); return physical_significand_is_zero && (Exponent() != kDenormalExponent); } float value() const { return uint32_to_float(d32_); } static float Infinity() { return Single(kInfinity).value(); } static float NaN() { return Single(kNaN).value(); } private: static const int kExponentBias = 0x7F + kPhysicalSignificandSize; static const int kDenormalExponent = -kExponentBias + 1; static const int kMaxExponent = 0xFF - kExponentBias; static const uint32_t kInfinity = 0x7F800000; static const uint32_t kNaN = 0x7FC00000; const uint32_t d32_; }; } // namespace double_conversion #endif // DOUBLE_CONVERSION_DOUBLE_H_ ================================================ FILE: src/kenlm/util/double-conversion/strtod.cc ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include "strtod.h" #include "bignum.h" #include "cached-powers.h" #include "ieee.h" namespace double_conversion { // 2^53 = 9007199254740992. // Any integer with at most 15 decimal digits will hence fit into a double // (which has a 53bit significand) without loss of precision. static const int kMaxExactDoubleIntegerDecimalDigits = 15; // 2^64 = 18446744073709551616 > 10^19 static const int kMaxUint64DecimalDigits = 19; // Max double: 1.7976931348623157 x 10^308 // Min non-zero double: 4.9406564584124654 x 10^-324 // Any x >= 10^309 is interpreted as +infinity. // Any x <= 10^-324 is interpreted as 0. // Note that 2.5e-324 (despite being smaller than the min double) will be read // as non-zero (equal to the min non-zero double). 
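// For illustration (a sketch, not used by this file; it assumes a
// correctly-rounding C library strtod such as glibc's):
//   std::strtod("1e309", NULL)     // +infinity: above the max double ~1.8e308
//   std::strtod("2.5e-324", NULL)  // ~4.94e-324: rounds up to the min non-zero double
//   std::strtod("1e-325", NULL)    // 0.0: below half of the smallest denormal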
static const int kMaxDecimalPower = 309; static const int kMinDecimalPower = -324; // 2^64 = 18446744073709551616 static const uint64_t kMaxUint64 = UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF); static const double exact_powers_of_ten[] = { 1.0, // 10^0 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 10000000.0, 100000000.0, 1000000000.0, 10000000000.0, // 10^10 100000000000.0, 1000000000000.0, 10000000000000.0, 100000000000000.0, 1000000000000000.0, 10000000000000000.0, 100000000000000000.0, 1000000000000000000.0, 10000000000000000000.0, 100000000000000000000.0, // 10^20 1000000000000000000000.0, // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22 10000000000000000000000.0 }; static const int kExactPowersOfTenSize = ARRAY_SIZE(exact_powers_of_ten); // Maximum number of significant digits in the decimal representation. // In fact the value is 772 (see conversions.cc), but to give us some margin // we round up to 780. static const int kMaxSignificantDecimalDigits = 780; static Vector TrimLeadingZeros(Vector buffer) { for (int i = 0; i < buffer.length(); i++) { if (buffer[i] != '0') { return buffer.SubVector(i, buffer.length()); } } return Vector(buffer.start(), 0); } static Vector TrimTrailingZeros(Vector buffer) { for (int i = buffer.length() - 1; i >= 0; --i) { if (buffer[i] != '0') { return buffer.SubVector(0, i + 1); } } return Vector(buffer.start(), 0); } static void CutToMaxSignificantDigits(Vector buffer, int exponent, char* significant_buffer, int* significant_exponent) { for (int i = 0; i < kMaxSignificantDecimalDigits - 1; ++i) { significant_buffer[i] = buffer[i]; } // The input buffer has been trimmed. Therefore the last digit must be // different from '0'. ASSERT(buffer[buffer.length() - 1] != '0'); // Set the last digit to be non-zero. This is sufficient to guarantee // correct rounding. significant_buffer[kMaxSignificantDecimalDigits - 1] = '1'; *significant_exponent = exponent + (buffer.length() - kMaxSignificantDecimalDigits); } // Trims the buffer and cuts it to at most kMaxSignificantDecimalDigits. // If possible the input-buffer is reused, but if the buffer needs to be // modified (due to cutting), then the input needs to be copied into the // buffer_copy_space. static void TrimAndCut(Vector buffer, int exponent, char* buffer_copy_space, int space_size, Vector* trimmed, int* updated_exponent) { Vector left_trimmed = TrimLeadingZeros(buffer); Vector right_trimmed = TrimTrailingZeros(left_trimmed); exponent += left_trimmed.length() - right_trimmed.length(); if (right_trimmed.length() > kMaxSignificantDecimalDigits) { ASSERT(space_size >= kMaxSignificantDecimalDigits); CutToMaxSignificantDigits(right_trimmed, exponent, buffer_copy_space, updated_exponent); *trimmed = Vector(buffer_copy_space, kMaxSignificantDecimalDigits); } else { *trimmed = right_trimmed; *updated_exponent = exponent; } } // Reads digits from the buffer and converts them to a uint64. // Reads in as many digits as fit into a uint64. // When the string starts with "1844674407370955161" no further digit is read. // Since 2^64 = 18446744073709551616 it would still be possible read another // digit if it was less or equal than 6, but this would complicate the code. static uint64_t ReadUint64(Vector buffer, int* number_of_read_digits) { uint64_t result = 0; int i = 0; while (i < buffer.length() && result <= (kMaxUint64 / 10 - 1)) { int digit = buffer[i++] - '0'; ASSERT(0 <= digit && digit <= 9); result = 10 * result + digit; } *number_of_read_digits = i; return result; } // Reads a DiyFp from the buffer. 
// The returned DiyFp is not necessarily normalized. // If remaining_decimals is zero then the returned DiyFp is accurate. // Otherwise it has been rounded and has error of at most 1/2 ulp. static void ReadDiyFp(Vector buffer, DiyFp* result, int* remaining_decimals) { int read_digits; uint64_t significand = ReadUint64(buffer, &read_digits); if (buffer.length() == read_digits) { *result = DiyFp(significand, 0); *remaining_decimals = 0; } else { // Round the significand. if (buffer[read_digits] >= '5') { significand++; } // Compute the binary exponent. int exponent = 0; *result = DiyFp(significand, exponent); *remaining_decimals = buffer.length() - read_digits; } } static bool DoubleStrtod(Vector trimmed, int exponent, double* result) { #if !defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS) // On x86 the floating-point stack can be 64 or 80 bits wide. If it is // 80 bits wide (as is the case on Linux) then double-rounding occurs and the // result is not accurate. // We know that Windows32 uses 64 bits and is therefore accurate. // Note that the ARM simulator is compiled for 32bits. It therefore exhibits // the same problem. return false; #endif if (trimmed.length() <= kMaxExactDoubleIntegerDecimalDigits) { int read_digits; // The trimmed input fits into a double. // If the 10^exponent (resp. 10^-exponent) fits into a double too then we // can compute the result-double simply by multiplying (resp. dividing) the // two numbers. // This is possible because IEEE guarantees that floating-point operations // return the best possible approximation. if (exponent < 0 && -exponent < kExactPowersOfTenSize) { // 10^-exponent fits into a double. *result = static_cast(ReadUint64(trimmed, &read_digits)); ASSERT(read_digits == trimmed.length()); *result /= exact_powers_of_ten[-exponent]; return true; } if (0 <= exponent && exponent < kExactPowersOfTenSize) { // 10^exponent fits into a double. *result = static_cast(ReadUint64(trimmed, &read_digits)); ASSERT(read_digits == trimmed.length()); *result *= exact_powers_of_ten[exponent]; return true; } int remaining_digits = kMaxExactDoubleIntegerDecimalDigits - trimmed.length(); if ((0 <= exponent) && (exponent - remaining_digits < kExactPowersOfTenSize)) { // The trimmed string was short and we can multiply it with // 10^remaining_digits. As a result the remaining exponent now fits // into a double too. *result = static_cast(ReadUint64(trimmed, &read_digits)); ASSERT(read_digits == trimmed.length()); *result *= exact_powers_of_ten[remaining_digits]; *result *= exact_powers_of_ten[exponent - remaining_digits]; return true; } } return false; } // Returns 10^exponent as an exact DiyFp. // The given exponent must be in the range [1; kDecimalExponentDistance[. static DiyFp AdjustmentPowerOfTen(int exponent) { ASSERT(0 < exponent); ASSERT(exponent < PowersOfTenCache::kDecimalExponentDistance); // Simply hardcode the remaining powers for the given decimal exponent // distance. 
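// Worked check of two of the hardcoded entries below:
//   10^1 = 0xA000000000000000 * 2^-60, since 0xA000000000000000 = 10 * 2^60;
//   10^3 = 0xFA00000000000000 * 2^-54, since 0xFA = 250 and
//          250 * 2^56 * 2^-54 = 250 * 4 = 1000.
// Each significand has its top bit set, so the returned DiyFp is normalized.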
ASSERT(PowersOfTenCache::kDecimalExponentDistance == 8); switch (exponent) { case 1: return DiyFp(UINT64_2PART_C(0xa0000000, 00000000), -60); case 2: return DiyFp(UINT64_2PART_C(0xc8000000, 00000000), -57); case 3: return DiyFp(UINT64_2PART_C(0xfa000000, 00000000), -54); case 4: return DiyFp(UINT64_2PART_C(0x9c400000, 00000000), -50); case 5: return DiyFp(UINT64_2PART_C(0xc3500000, 00000000), -47); case 6: return DiyFp(UINT64_2PART_C(0xf4240000, 00000000), -44); case 7: return DiyFp(UINT64_2PART_C(0x98968000, 00000000), -40); default: UNREACHABLE(); return DiyFp(0, 0); } } // If the function returns true then the result is the correct double. // Otherwise it is either the correct double or the double that is just below // the correct double. static bool DiyFpStrtod(Vector buffer, int exponent, double* result) { DiyFp input; int remaining_decimals; ReadDiyFp(buffer, &input, &remaining_decimals); // Since we may have dropped some digits the input is not accurate. // If remaining_decimals is different than 0 than the error is at most // .5 ulp (unit in the last place). // We don't want to deal with fractions and therefore keep a common // denominator. const int kDenominatorLog = 3; const int kDenominator = 1 << kDenominatorLog; // Move the remaining decimals into the exponent. exponent += remaining_decimals; int error = (remaining_decimals == 0 ? 0 : kDenominator / 2); int old_e = input.e(); input.Normalize(); error <<= old_e - input.e(); ASSERT(exponent <= PowersOfTenCache::kMaxDecimalExponent); if (exponent < PowersOfTenCache::kMinDecimalExponent) { *result = 0.0; return true; } DiyFp cached_power; int cached_decimal_exponent; PowersOfTenCache::GetCachedPowerForDecimalExponent(exponent, &cached_power, &cached_decimal_exponent); if (cached_decimal_exponent != exponent) { int adjustment_exponent = exponent - cached_decimal_exponent; DiyFp adjustment_power = AdjustmentPowerOfTen(adjustment_exponent); input.Multiply(adjustment_power); if (kMaxUint64DecimalDigits - buffer.length() >= adjustment_exponent) { // The product of input with the adjustment power fits into a 64 bit // integer. ASSERT(DiyFp::kSignificandSize == 64); } else { // The adjustment power is exact. There is hence only an error of 0.5. error += kDenominator / 2; } } input.Multiply(cached_power); // The error introduced by a multiplication of a*b equals // error_a + error_b + error_a*error_b/2^64 + 0.5 // Substituting a with 'input' and b with 'cached_power' we have // error_b = 0.5 (all cached powers have an error of less than 0.5 ulp), // error_ab = 0 or 1 / kDenominator > error_a*error_b/ 2^64 int error_b = kDenominator / 2; int error_ab = (error == 0 ? 0 : 1); // We round up to 1. int fixed_error = kDenominator / 2; error += error_b + error_ab + fixed_error; old_e = input.e(); input.Normalize(); error <<= old_e - input.e(); // See if the double's significand changes if we add/subtract the error. int order_of_magnitude = DiyFp::kSignificandSize + input.e(); int effective_significand_size = Double::SignificandSizeForOrderOfMagnitude(order_of_magnitude); int precision_digits_count = DiyFp::kSignificandSize - effective_significand_size; if (precision_digits_count + kDenominatorLog >= DiyFp::kSignificandSize) { // This can only happen for very small denormals. In this case the // half-way multiplied by the denominator exceeds the range of an uint64. // Simply shift everything to the right. 
int shift_amount = (precision_digits_count + kDenominatorLog) - DiyFp::kSignificandSize + 1; input.set_f(input.f() >> shift_amount); input.set_e(input.e() + shift_amount); // We add 1 for the lost precision of error, and kDenominator for // the lost precision of input.f(). error = (error >> shift_amount) + 1 + kDenominator; precision_digits_count -= shift_amount; } // We use uint64_ts now. This only works if the DiyFp uses uint64_ts too. ASSERT(DiyFp::kSignificandSize == 64); ASSERT(precision_digits_count < 64); uint64_t one64 = 1; uint64_t precision_bits_mask = (one64 << precision_digits_count) - 1; uint64_t precision_bits = input.f() & precision_bits_mask; uint64_t half_way = one64 << (precision_digits_count - 1); precision_bits *= kDenominator; half_way *= kDenominator; DiyFp rounded_input(input.f() >> precision_digits_count, input.e() + precision_digits_count); if (precision_bits >= half_way + error) { rounded_input.set_f(rounded_input.f() + 1); } // If the last_bits are too close to the half-way case than we are too // inaccurate and round down. In this case we return false so that we can // fall back to a more precise algorithm. *result = Double(rounded_input).value(); if (half_way - error < precision_bits && precision_bits < half_way + error) { // Too imprecise. The caller will have to fall back to a slower version. // However the returned number is guaranteed to be either the correct // double, or the next-lower double. return false; } else { return true; } } // Returns // - -1 if buffer*10^exponent < diy_fp. // - 0 if buffer*10^exponent == diy_fp. // - +1 if buffer*10^exponent > diy_fp. // Preconditions: // buffer.length() + exponent <= kMaxDecimalPower + 1 // buffer.length() + exponent > kMinDecimalPower // buffer.length() <= kMaxDecimalSignificantDigits static int CompareBufferWithDiyFp(Vector buffer, int exponent, DiyFp diy_fp) { ASSERT(buffer.length() + exponent <= kMaxDecimalPower + 1); ASSERT(buffer.length() + exponent > kMinDecimalPower); ASSERT(buffer.length() <= kMaxSignificantDecimalDigits); // Make sure that the Bignum will be able to hold all our numbers. // Our Bignum implementation has a separate field for exponents. Shifts will // consume at most one bigit (< 64 bits). // ln(10) == 3.3219... ASSERT(((kMaxDecimalPower + 1) * 333 / 100) < Bignum::kMaxSignificantBits); Bignum buffer_bignum; Bignum diy_fp_bignum; buffer_bignum.AssignDecimalString(buffer); diy_fp_bignum.AssignUInt64(diy_fp.f()); if (exponent >= 0) { buffer_bignum.MultiplyByPowerOfTen(exponent); } else { diy_fp_bignum.MultiplyByPowerOfTen(-exponent); } if (diy_fp.e() > 0) { diy_fp_bignum.ShiftLeft(diy_fp.e()); } else { buffer_bignum.ShiftLeft(-diy_fp.e()); } return Bignum::Compare(buffer_bignum, diy_fp_bignum); } // Returns true if the guess is the correct double. // Returns false, when guess is either correct or the next-lower double. 
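// (For orientation: ComputeGuess first handles obvious over- and underflow,
// then tries DoubleStrtod, which is exact whenever both the digits and the
// power of ten fit into doubles (for example, buffer "5" with exponent -1
// becomes 5.0 / 10.0 and yields 0.5 exactly), and otherwise falls back to
// DiyFpStrtod. When neither can certify the result, Strtod below settles the
// remaining doubt with an exact Bignum comparison against the boundary
// between the guess and the next double.)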
static bool ComputeGuess(Vector trimmed, int exponent, double* guess) { if (trimmed.length() == 0) { *guess = 0.0; return true; } if (exponent + trimmed.length() - 1 >= kMaxDecimalPower) { *guess = Double::Infinity(); return true; } if (exponent + trimmed.length() <= kMinDecimalPower) { *guess = 0.0; return true; } if (DoubleStrtod(trimmed, exponent, guess) || DiyFpStrtod(trimmed, exponent, guess)) { return true; } if (*guess == Double::Infinity()) { return true; } return false; } double Strtod(Vector buffer, int exponent) { char copy_buffer[kMaxSignificantDecimalDigits]; Vector trimmed; int updated_exponent; TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, &trimmed, &updated_exponent); exponent = updated_exponent; double guess; bool is_correct = ComputeGuess(trimmed, exponent, &guess); if (is_correct) return guess; DiyFp upper_boundary = Double(guess).UpperBoundary(); int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); if (comparison < 0) { return guess; } else if (comparison > 0) { return Double(guess).NextDouble(); } else if ((Double(guess).Significand() & 1) == 0) { // Round towards even. return guess; } else { return Double(guess).NextDouble(); } } float Strtof(Vector buffer, int exponent) { char copy_buffer[kMaxSignificantDecimalDigits]; Vector trimmed; int updated_exponent; TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits, &trimmed, &updated_exponent); exponent = updated_exponent; double double_guess; bool is_correct = ComputeGuess(trimmed, exponent, &double_guess); float float_guess = static_cast(double_guess); if (float_guess == double_guess) { // This shortcut triggers for integer values. return float_guess; } // We must catch double-rounding. Say the double has been rounded up, and is // now a boundary of a float, and rounds up again. This is why we have to // look at previous too. // Example (in decimal numbers): // input: 12349 // high-precision (4 digits): 1235 // low-precision (3 digits): // when read from input: 123 // when rounded from high precision: 124. // To do this we simply look at the neigbors of the correct result and see // if they would round to the same float. If the guess is not correct we have // to look at four values (since two different doubles could be the correct // double). double double_next = Double(double_guess).NextDouble(); double double_previous = Double(double_guess).PreviousDouble(); float f1 = static_cast(double_previous); #ifndef NDEBUG float f2 = float_guess; #endif float f3 = static_cast(double_next); float f4; if (is_correct) { f4 = f3; } else { double double_next2 = Double(double_next).NextDouble(); f4 = static_cast(double_next2); } #ifndef NDEBUG ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4); #endif // If the guess doesn't lie near a single-precision boundary we can simply // return its float-value. if (f1 == f4) { return float_guess; } ASSERT((f1 != f2 && f2 == f3 && f3 == f4) || (f1 == f2 && f2 != f3 && f3 == f4) || (f1 == f2 && f2 == f3 && f3 != f4)); // guess and next are the two possible canditates (in the same way that // double_guess was the lower candidate for a double-precision guess). 
float guess = f1; float next = f4; DiyFp upper_boundary; if (guess == 0.0f) { float min_float = 1e-45f; upper_boundary = Double(static_cast(min_float) / 2).AsDiyFp(); } else { upper_boundary = Single(guess).UpperBoundary(); } int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary); if (comparison < 0) { return guess; } else if (comparison > 0) { return next; } else if ((Single(guess).Significand() & 1) == 0) { // Round towards even. return guess; } else { return next; } } } // namespace double_conversion ================================================ FILE: src/kenlm/util/double-conversion/strtod.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_STRTOD_H_ #define DOUBLE_CONVERSION_STRTOD_H_ #include "utils.h" namespace double_conversion { // The buffer must only contain digits in the range [0-9]. It must not // contain a dot or a sign. It must not start with '0', and must not be empty. double Strtod(Vector buffer, int exponent); // The buffer must only contain digits in the range [0-9]. It must not // contain a dot or a sign. It must not start with '0', and must not be empty. float Strtof(Vector buffer, int exponent); } // namespace double_conversion #endif // DOUBLE_CONVERSION_STRTOD_H_ ================================================ FILE: src/kenlm/util/double-conversion/utils.h ================================================ // Copyright 2010 the V8 project authors. All rights reserved. // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following // disclaimer in the documentation and/or other materials provided // with the distribution. // * Neither the name of Google Inc. 
nor the names of its // contributors may be used to endorse or promote products derived // from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef DOUBLE_CONVERSION_UTILS_H_ #define DOUBLE_CONVERSION_UTILS_H_ #include #include #include #ifndef ASSERT #define ASSERT(condition) (assert(condition)) #endif #ifndef UNIMPLEMENTED #define UNIMPLEMENTED() (abort()) #endif #ifndef UNREACHABLE #define UNREACHABLE() (abort()) #endif // Double operations detection based on target architecture. // Linux uses a 80bit wide floating point stack on x86. This induces double // rounding, which in turn leads to wrong results. // An easy way to test if the floating-point operations are correct is to // evaluate: 89255.0/1e22. If the floating-point stack is 64 bits wide then // the result is equal to 89255e-22. // The best way to test this, is to create a division-function and to compare // the output of the division with the expected result. (Inlining must be // disabled.) // On Linux,x86 89255e-22 != Div_double(89255.0/1e22) #if defined(_M_X64) || defined(__x86_64__) || \ defined(__ARMEL__) || defined(__avr32__) || \ defined(__hppa__) || defined(__ia64__) || \ defined(__mips__) || defined(__powerpc__) || \ defined(__sparc__) || defined(__sparc) || defined(__s390__) || \ defined(__SH4__) || defined(__alpha__) || \ defined(_MIPS_ARCH_MIPS32R2) #define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 #elif defined(_M_IX86) || defined(__i386__) || defined(__i386) #if defined(_WIN32) // Windows uses a 64bit wide floating point stack. #define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1 #else #undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS #endif // _WIN32 #else #error Target architecture was not detected as supported by Double-Conversion. #endif #if defined(_WIN32) && !defined(__MINGW32__) typedef signed char int8_t; typedef unsigned char uint8_t; typedef short int16_t; // NOLINT typedef unsigned short uint16_t; // NOLINT typedef int int32_t; typedef unsigned int uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; // intptr_t and friends are defined in crtdefs.h through stdio.h. #else #include #endif // The following macro works on both 32 and 64-bit platforms. // Usage: instead of writing 0x1234567890123456 // write UINT64_2PART_C(0x12345678,90123456); #define UINT64_2PART_C(a, b) (((static_cast(a) << 32) + 0x##b##u)) // The expression ARRAY_SIZE(a) is a compile-time constant of type // size_t which represents the number of elements of the given // array. You should only use ARRAY_SIZE on statically allocated // arrays. 
#ifndef ARRAY_SIZE #define ARRAY_SIZE(a) \ ((sizeof(a) / sizeof(*(a))) / \ static_cast(!(sizeof(a) % sizeof(*(a))))) #endif // A macro to disallow the evil copy constructor and operator= functions // This should be used in the private: declarations for a class #ifndef DISALLOW_COPY_AND_ASSIGN #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ TypeName(const TypeName&); \ void operator=(const TypeName&) #endif // A macro to disallow all the implicit constructors, namely the // default constructor, copy constructor and operator= functions. // // This should be used in the private: declarations for a class // that wants to prevent anyone from instantiating it. This is // especially useful for classes containing only static methods. #ifndef DISALLOW_IMPLICIT_CONSTRUCTORS #define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \ TypeName(); \ DISALLOW_COPY_AND_ASSIGN(TypeName) #endif namespace double_conversion { static const int kCharSize = sizeof(char); // Returns the maximum of the two parameters. template static T Max(T a, T b) { return a < b ? b : a; } // Returns the minimum of the two parameters. template static T Min(T a, T b) { return a < b ? a : b; } inline int StrLength(const char* string) { size_t length = strlen(string); ASSERT(length == static_cast(static_cast(length))); return static_cast(length); } // This is a simplified version of V8's Vector class. template class Vector { public: Vector() : start_(NULL), length_(0) {} Vector(T* data, int length) : start_(data), length_(length) { ASSERT(length == 0 || (length > 0 && data != NULL)); } // Returns a vector using the same backing storage as this one, // spanning from and including 'from', to but not including 'to'. Vector SubVector(int from, int to) { ASSERT(to <= length_); ASSERT(from < to); ASSERT(0 <= from); return Vector(start() + from, to - from); } // Returns the length of the vector. int length() const { return length_; } // Returns whether or not the vector is empty. bool is_empty() const { return length_ == 0; } // Returns the pointer to the start of the data in the vector. T* start() const { return start_; } // Access individual vector elements - checks bounds in debug mode. T& operator[](int index) const { ASSERT(0 <= index && index < length_); return start_[index]; } T& first() { return start_[0]; } T& last() { return start_[length_ - 1]; } private: T* start_; int length_; }; // Helper class for building result strings in a character buffer. The // purpose of the class is to use safe operations that checks the // buffer bounds on all operations in debug mode. class StringBuilder { public: StringBuilder(char* buffer, int size) : buffer_(buffer, size), position_(0) { } ~StringBuilder() { if (!is_finalized()) Finalize(); } int size() const { return buffer_.length(); } // Get the current position in the builder. int position() const { ASSERT(!is_finalized()); return position_; } // Reset the position. void Reset() { position_ = 0; } // Add a single character to the builder. It is not allowed to add // 0-characters; use the Finalize() method to terminate the string // instead. void AddCharacter(char c) { ASSERT(c != '\0'); ASSERT(!is_finalized() && position_ < buffer_.length()); buffer_[position_++] = c; } // Add an entire string to the builder. Uses strlen() internally to // compute the length of the input string. void AddString(const char* s) { AddSubstring(s, StrLength(s)); } // Add the first 'n' characters of the given string 's' to the // builder. The input string must have enough characters. 
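// For example (illustrative only):
//   char buf[16];
//   StringBuilder builder(buf, sizeof(buf));
//   builder.AddString("ngram ");
//   builder.AddSubstring("1=42 tail", 4);  // copies just "1=42"
//   char* s = builder.Finalize();          // s == "ngram 1=42", 0-terminated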
void AddSubstring(const char* s, int n) { ASSERT(!is_finalized() && position_ + n < buffer_.length()); ASSERT(static_cast(n) <= strlen(s)); memmove(&buffer_[position_], s, n * kCharSize); position_ += n; } // Add character padding to the builder. If count is non-positive, // nothing is added to the builder. void AddPadding(char c, int count) { for (int i = 0; i < count; i++) { AddCharacter(c); } } // Finalize the string by 0-terminating it and returning the buffer. char* Finalize() { ASSERT(!is_finalized() && position_ < buffer_.length()); buffer_[position_] = '\0'; // Make sure nobody managed to add a 0-character to the // buffer while building the string. ASSERT(strlen(buffer_.start()) == static_cast(position_)); position_ = -1; ASSERT(is_finalized()); return buffer_.start(); } private: Vector buffer_; int position_; bool is_finalized() const { return position_ < 0; } DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder); }; // The type-based aliasing rule allows the compiler to assume that pointers of // different types (for some definition of different) never alias each other. // Thus the following code does not work: // // float f = foo(); // int fbits = *(int*)(&f); // // The compiler 'knows' that the int pointer can't refer to f since the types // don't match, so the compiler may cache f in a register, leaving random data // in fbits. Using C++ style casts makes no difference, however a pointer to // char data is assumed to alias any other pointer. This is the 'memcpy // exception'. // // Bit_cast uses the memcpy exception to move the bits from a variable of one // type of a variable of another type. Of course the end result is likely to // be implementation dependent. Most compilers (gcc-4.2 and MSVC 2005) // will completely optimize BitCast away. // // There is an additional use for BitCast. // Recent gccs will warn when they see casts that may result in breakage due to // the type-based aliasing rule. If you have checked that there is no breakage // you can use BitCast to cast one pointer type to another. This confuses gcc // enough that it can no longer see that you have cast one pointer type to // another thus avoiding the warning. template inline Dest BitCast(const Source& source) { // Compile time assertion: sizeof(Dest) == sizeof(Source) // A compile error here means your Dest and Source have different sizes. typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 
1 : -1]; Dest dest; memmove(&dest, &source, sizeof(dest)); return dest; } template inline Dest BitCast(Source* source) { return BitCast(reinterpret_cast(source)); } } // namespace double_conversion #endif // DOUBLE_CONVERSION_UTILS_H_ ================================================ FILE: src/kenlm/util/ersatz_progress.cc ================================================ #include "util/ersatz_progress.hh" #include #include #include #include namespace util { namespace { const unsigned char kWidth = 100; } const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n"; ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits::max()), complete_(next_), out_(NULL) {} ErsatzProgress::~ErsatzProgress() { if (out_) Finished(); } ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message) : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) { if (!out_) { next_ = std::numeric_limits::max(); return; } if (!message.empty()) *out_ << message << '\n'; *out_ << kProgressBanner; } void ErsatzProgress::Milestone() { if (!out_) { current_ = 0; return; } if (!complete_) return; unsigned char stone = std::min(static_cast(kWidth), (current_ * kWidth) / complete_); for (; stones_written_ < stone; ++stones_written_) { (*out_) << '*'; } if (stone == kWidth) { (*out_) << std::endl; next_ = std::numeric_limits::max(); out_ = NULL; } else { next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth); } } } // namespace util ================================================ FILE: src/kenlm/util/ersatz_progress.hh ================================================ #ifndef UTIL_ERSATZ_PROGRESS__ #define UTIL_ERSATZ_PROGRESS__ #include #include #include // Ersatz version of boost::progress so core language model doesn't depend on // boost. Also adds option to print nothing. namespace util { extern const char kProgressBanner[]; class ErsatzProgress { public: // No output. ErsatzProgress(); // Null means no output. The null value is useful for passing along the ostream pointer from another caller. 
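// Illustrative use of the constructor below (total_lines is a hypothetical
// caller-side count):
//   util::ErsatzProgress progress(total_lines, &std::cerr, "Reading");
//   for (uint64_t i = 0; i < total_lines; ++i) { /* do work */ ++progress; }
// This prints the optional message and the 100-column banner once, then up to
// 100 '*' characters as the counter crosses each 1% milestone.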
explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); ~ErsatzProgress(); ErsatzProgress &operator++() { if (++current_ >= next_) Milestone(); return *this; } ErsatzProgress &operator+=(uint64_t amount) { if ((current_ += amount) >= next_) Milestone(); return *this; } void Set(uint64_t to) { if ((current_ = to) >= next_) Milestone(); } void Finished() { Set(complete_); } private: void Milestone(); uint64_t current_, next_, complete_; unsigned char stones_written_; std::ostream *out_; // noncopyable ErsatzProgress(const ErsatzProgress &other); ErsatzProgress &operator=(const ErsatzProgress &other); }; } // namespace util #endif // UTIL_ERSATZ_PROGRESS__ ================================================ FILE: src/kenlm/util/exception.cc ================================================ #include "util/exception.hh" #ifdef __GXX_RTTI #include #endif #include #include namespace util { Exception::Exception() throw() {} Exception::~Exception() throw() {} Exception::Exception(const Exception &from) : std::exception() { stream_ << from.stream_.str(); } Exception &Exception::operator=(const Exception &from) { stream_ << from.stream_.str(); return *this; } const char *Exception::what() const throw() { text_ = stream_.str(); return text_.c_str(); } void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) { /* The child class might have set some text, but we want this to come first. * Another option would be passing this information to the constructor, but * then child classes would have to accept constructor arguments and pass * them down. */ text_ = stream_.str(); stream_.str(""); stream_ << file << ':' << line; if (func) stream_ << " in " << func << " threw "; if (child_name) { stream_ << child_name; } else { #ifdef __GXX_RTTI stream_ << typeid(this).name(); #else stream_ << "an exception"; #endif } if (condition) stream_ << " because `" << condition; stream_ << "'.\n"; stream_ << text_; } namespace { // The XOPEN version. const char *HandleStrerror(int ret, const char *buf) { if (!ret) return buf; return NULL; } // The GNU version. const char *HandleStrerror(const char *ret, const char * /*buf*/) { return ret; } } // namespace ErrnoException::ErrnoException() throw() : errno_(errno) { char buf[200]; buf[0] = 0; #if defined(sun) || defined(_WIN32) || defined(_WIN64) const char *add = strerror(errno); #else const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); #endif if (add) { *this << add << ' '; } } ErrnoException::~ErrnoException() throw() {} OverflowException::OverflowException() throw() {} OverflowException::~OverflowException() throw() {} } // namespace util ================================================ FILE: src/kenlm/util/exception.hh ================================================ #ifndef UTIL_EXCEPTION__ #define UTIL_EXCEPTION__ #include #include #include #include #include namespace util { template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); class Exception : public std::exception { public: Exception() throw(); virtual ~Exception() throw(); Exception(const Exception &from); Exception &operator=(const Exception &from); // Not threadsafe, but probably doesn't matter. FWIW, Boost's exception guidance implies that what() isn't threadsafe. const char *what() const throw(); // For use by the UTIL_THROW macros. 
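// Typical call sites look like the ones in util/file.cc further below, e.g.
//   UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException,
//                 "while opening " << name);
// which constructs the exception, calls SetLocation() with __FILE__,
// __LINE__, the function name and the stringified condition, streams the
// message into what(), and throws.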
void SetLocation( const char *file, unsigned int line, const char *func, const char *child_name, const char *condition); private: template friend typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); // This helps restrict operator<< defined below. template struct ExceptionTag { typedef T Identity; }; std::stringstream stream_; mutable std::string text_; }; /* This implements the normal operator<< for Exception and all its children. * SFINAE means it only applies to Exception. Think of this as an ersatz * boost::enable_if. */ template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data) { e.stream_ << data; return e; } #ifdef __GNUC__ #define UTIL_FUNC_NAME __PRETTY_FUNCTION__ #else #ifdef _WIN32 #define UTIL_FUNC_NAME __FUNCTION__ #else #define UTIL_FUNC_NAME NULL #endif #endif /* Create an instance of Exception, add the message Modify, and throw it. * Modify is appended to the what() message and can contain << for ostream * operations. * * do .. while kludge to swallow trailing ; character * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html . * Arg can be a constructor argument to the exception. */ #define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \ Exception UTIL_e Arg; \ UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \ UTIL_e << Modify; \ throw UTIL_e; \ } while (0) #define UTIL_THROW_ARG(Exception, Arg, Modify) \ UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify) #define UTIL_THROW(Exception, Modify) \ UTIL_THROW_BACKEND(NULL, Exception, , Modify); #if __GNUC__ >= 3 #define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0) #else #define UTIL_UNLIKELY(x) (x) #endif #define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \ if (UTIL_UNLIKELY(Condition)) { \ UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \ } \ } while (0) #define UTIL_THROW_IF(Condition, Exception, Modify) \ UTIL_THROW_IF_ARG(Condition, Exception, , Modify) // Exception that records errno and adds it to the message. class ErrnoException : public Exception { public: ErrnoException() throw(); virtual ~ErrnoException() throw(); int Error() const throw() { return errno_; } private: int errno_; }; // Utilities for overflow checking. class OverflowException : public Exception { public: OverflowException() throw(); ~OverflowException() throw(); }; template inline std::size_t CheckOverflowInternal(uint64_t value) { UTIL_THROW_IF(value > static_cast(std::numeric_limits::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code."); return value; } template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { return value; } inline std::size_t CheckOverflow(uint64_t value) { return CheckOverflowInternal(value); } } // namespace util #endif // UTIL_EXCEPTION__ ================================================ FILE: src/kenlm/util/fake_ofstream.hh ================================================ /* Like std::ofstream but without being incredibly slow. Backed by a raw fd. * Does not support many data types. Currently, it's targeted at writing ARPA * files quickly. */ #include "util/double-conversion/double-conversion.h" #include "util/double-conversion/utils.h" #include "util/file.hh" #include "util/scoped.hh" #include "util/string_piece.hh" #define BOOST_LEXICAL_CAST_ASSUME_C_LOCALE #include namespace util { class FakeOFStream { public: static const std::size_t kOutBuf = 1048576; // Does not take ownership of out. 
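// Illustrative use of the constructor below (writing to stdout, fd 1, so the
// descriptor is known to be valid):
//   util::FakeOFStream out(1);
//   out << StringPiece("ngram 1=") << 42u << '\n';
// Everything is buffered in the 1 MB StringBuilder and written with
// WriteOrThrow() on Flush() or at destruction; the caller retains ownership
// of the fd.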
explicit FakeOFStream(int out) : buf_(util::MallocOrThrow(kOutBuf)), builder_(static_cast(buf_.get()), kOutBuf), // Mostly the default but with inf instead. And no flags. convert_(double_conversion::DoubleToStringConverter::NO_FLAGS, "inf", "NaN", 'e', -6, 21, 6, 0), fd_(out) {} ~FakeOFStream() { Flush(); } FakeOFStream &operator<<(float value) { // Odd, but this is the largest number found in the comments. EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); convert_.ToShortestSingle(value, &builder_); return *this; } FakeOFStream &operator<<(double value) { EnsureRemaining(double_conversion::DoubleToStringConverter::kMaxPrecisionDigits + 8); convert_.ToShortest(value, &builder_); return *this; } FakeOFStream &operator<<(StringPiece str) { if (str.size() > kOutBuf) { Flush(); util::WriteOrThrow(fd_, str.data(), str.size()); } else { EnsureRemaining(str.size()); builder_.AddSubstring(str.data(), str.size()); } return *this; } // Inefficient! TODO: more efficient implementation FakeOFStream &operator<<(unsigned value) { return *this << boost::lexical_cast(value); } FakeOFStream &operator<<(char c) { EnsureRemaining(1); builder_.AddCharacter(c); return *this; } // Note this does not sync. void Flush() { util::WriteOrThrow(fd_, buf_.get(), builder_.position()); builder_.Reset(); } private: void EnsureRemaining(std::size_t amount) { if (static_cast(builder_.size() - builder_.position()) < amount) { Flush(); } } util::scoped_malloc buf_; double_conversion::StringBuilder builder_; double_conversion::DoubleToStringConverter convert_; int fd_; }; } // namespace ================================================ FILE: src/kenlm/util/file.cc ================================================ #define _LARGEFILE64_SOURCE #define _FILE_OFFSET_BITS 64 #include "util/file.hh" #include "util/exception.hh" #include #include #include #include #include #include #include #include #include #include #if defined(_WIN32) || defined(_WIN64) #include #include #include #include #include #else #include #endif namespace util { scoped_fd::~scoped_fd() { if (fd_ != -1 && close(fd_)) { std::cerr << "Could not close file " << fd_ << std::endl; std::abort(); } } scoped_FILE::~scoped_FILE() { if (file_ && std::fclose(file_)) { std::cerr << "Could not close file " << std::endl; std::abort(); } } // Note that ErrnoException records errno before NameFromFD is called. FDException::FDException(int fd) throw() : fd_(fd), name_guess_(NameFromFD(fd)) { *this << "in " << name_guess_ << ' '; } FDException::~FDException() throw() {} EndOfFileException::EndOfFileException() throw() { *this << "End of file"; } EndOfFileException::~EndOfFileException() throw() {} int OpenReadOrThrow(const char *name) { int ret; #if defined(_WIN32) || defined(_WIN64) UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name); #else UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name); #endif return ret; } int CreateOrThrow(const char *name) { int ret; #if defined(_WIN32) || defined(_WIN64) UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR | _O_BINARY, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); #else UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); #endif return ret; } uint64_t SizeFile(int fd) { #if defined(_WIN32) || defined(_WIN64) __int64 ret = _filelengthi64(fd); return (ret == -1) ? 
kBadSize : ret; #else // Not windows. #ifdef OS_ANDROID struct stat64 sb; int ret = fstat64(fd, &sb); #else struct stat sb; int ret = fstat(fd, &sb); #endif if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; return sb.st_size; #endif } uint64_t SizeOrThrow(int fd) { uint64_t ret = SizeFile(fd); UTIL_THROW_IF_ARG(ret == kBadSize, FDException, (fd), "Failed to size"); return ret; } void ResizeOrThrow(int fd, uint64_t to) { #if defined(_WIN32) || defined(_WIN64) errno_t ret = _chsize_s #elif defined(OS_ANDROID) int ret = ftruncate64 #else int ret = ftruncate #endif (fd, to); UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes"); } std::size_t PartialRead(int fd, void *to, std::size_t amount) { #if defined(_WIN32) || defined(_WIN64) amount = min(static_cast(INT_MAX), amount); int ret = _read(fd, to, amount); #else errno = 0; ssize_t ret; do { ret = read(fd, to, amount); } while (ret == -1 && errno == EINTR); #endif UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes"); return static_cast(ret); } void ReadOrThrow(int fd, void *to_void, std::size_t amount) { uint8_t *to = static_cast(to_void); while (amount) { std::size_t ret = PartialRead(fd, to, amount); UTIL_THROW_IF(ret == 0, EndOfFileException, " in " << NameFromFD(fd) << " but there should be " << amount << " more bytes to read."); amount -= ret; to += ret; } } std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { uint8_t *to = static_cast(to_void); std::size_t remaining = amount; while (remaining) { std::size_t ret = PartialRead(fd, to, remaining); if (!ret) return amount - remaining; remaining -= ret; to += ret; } return amount; } void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) { uint8_t *to = static_cast(to_void); #if defined(_WIN32) || defined(_WIN64) UTIL_THROW(Exception, "This pread implementation for windows is broken. Please send me a patch that does not change the file pointer. Atomically. Or send me an implementation of pwrite that is allowed to change the file pointer but can be called concurrently with pread."); const std::size_t kMaxDWORD = static_cast(4294967295UL); #endif for (;size ;) { #if defined(_WIN32) || defined(_WIN64) /* BROKEN: changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */ // size_t might be 64-bit. DWORD is always 32. 
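// --------------------------------------------------------------------------
// Illustrative sketch, not part of KenLM: slurping a whole file with the fd
// helpers above.  ReadOrThrow keeps calling PartialRead (retrying on EINTR)
// until the requested amount arrives or EndOfFileException is thrown.
#include "util/file.hh"
#include <vector>

std::vector<char> SlurpFile(const char *name) {
  util::scoped_fd fd(util::OpenReadOrThrow(name));          // closed on return
  std::vector<char> content(util::SizeOrThrow(fd.get()));
  if (!content.empty())
    util::ReadOrThrow(fd.get(), &content[0], content.size());
  return content;
}
// --------------------------------------------------------------------------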
DWORD reading = static_cast(std::min(kMaxDWORD, size)); DWORD ret; OVERLAPPED overlapped; memset(&overlapped, 0, sizeof(OVERLAPPED)); overlapped.Offset = static_cast(off); overlapped.OffsetHigh = static_cast(off >> 32); UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), Exception, "ReadFile failed for offset " << off); #else ssize_t ret; errno = 0; do { #ifdef OS_ANDROID ret = pread64(fd, to, size, off); #else ret = pread(fd, to, size, off); #endif } while (ret == -1 && errno == EINTR); if (ret <= 0) { UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd)); UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off); } #endif size -= ret; off += ret; to += ret; } } void WriteOrThrow(int fd, const void *data_void, std::size_t size) { const uint8_t *data = static_cast(data_void); while (size) { #if defined(_WIN32) || defined(_WIN64) int ret = write(fd, data, min(static_cast(INT_MAX), size)); #else errno = 0; ssize_t ret; do { ret = write(fd, data, size); } while (ret == -1 && errno == EINTR); #endif UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes"); data += ret; size -= ret; } } void WriteOrThrow(FILE *to, const void *data, std::size_t size) { if (!size) return; UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size); } void FSyncOrThrow(int fd) { // Apparently windows doesn't have fsync? #if !defined(_WIN32) && !defined(_WIN64) UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "while syncing"); #endif } namespace { // Static assert for 64-bit off_t size. #if !defined(_WIN32) && !defined(_WIN64) && !defined(OS_ANDROID) template struct CheckOffT; template <> struct CheckOffT<8> { struct True {}; }; // If there's a compiler error on the next line, then off_t isn't 64 bit. And // that makes me a sad panda. typedef CheckOffT::True IgnoredType; #endif // Can't we all just get along? void InternalSeek(int fd, int64_t off, int whence) { if ( #if defined(_WIN32) || defined(_WIN64) (__int64)-1 == _lseeki64(fd, off, whence) #elif defined(OS_ANDROID) (off64_t)-1 == lseek64(fd, off, whence) #else (off_t)-1 == lseek(fd, off, whence) #endif ) UTIL_THROW_ARG(FDException, (fd), "while seeking to " << off << " whence " << whence); } } // namespace void SeekOrThrow(int fd, uint64_t off) { InternalSeek(fd, off, SEEK_SET); } void AdvanceOrThrow(int fd, int64_t off) { InternalSeek(fd, off, SEEK_CUR); } void SeekEnd(int fd) { InternalSeek(fd, 0, SEEK_END); } std::FILE *FDOpenOrThrow(scoped_fd &file) { std::FILE *ret = fdopen(file.get(), "r+b"); UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for write"); file.release(); return ret; } std::FILE *FDOpenReadOrThrow(scoped_fd &file) { std::FILE *ret = fdopen(file.get(), "rb"); UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for read"); file.release(); return ret; } // Sigh. Windows temporary file creation is full of race conditions. #if defined(_WIN32) || defined(_WIN64) /* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. 
*/ /* This has been modified from the original version to rename the function and * set the Windows temporary flag. */ static const char letters[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; /* Generate a temporary file name based on TMPL. TMPL must match the rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed does not exist at the time of the call to mkstemp. TMPL is overwritten with the result. */ int mkstemp_and_unlink(char *tmpl) { int len; char *XXXXXX; static unsigned long long value; unsigned long long random_time_bits; unsigned int count; int fd = -1; int save_errno = errno; /* A lower bound on the number of temporary files to attempt to generate. The maximum total number of temporary file names that can exist for a given template is 62**6. It should never be necessary to try all these combinations. Instead if a reasonable number of names is tried (we define reasonable as 62**3) fail to give the system administrator the chance to remove the problems. */ #define ATTEMPTS_MIN (62 * 62 * 62) /* The number of times to attempt to generate a temporary file. To conform to POSIX, this must be no smaller than TMP_MAX. */ #if ATTEMPTS_MIN < TMP_MAX unsigned int attempts = TMP_MAX; #else unsigned int attempts = ATTEMPTS_MIN; #endif len = strlen (tmpl); if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX")) { errno = EINVAL; return -1; } /* This is where the Xs start. */ XXXXXX = &tmpl[len - 6]; /* Get some more or less random data. */ { SYSTEMTIME stNow; FILETIME ftNow; // get system time GetSystemTime(&stNow); stNow.wMilliseconds = 500; if (!SystemTimeToFileTime(&stNow, &ftNow)) { errno = -1; return -1; } random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32) | (unsigned long long)ftNow.dwLowDateTime); } value += random_time_bits ^ (unsigned long long)GetCurrentThreadId (); for (count = 0; count < attempts; value += 7777, ++count) { unsigned long long v = value; /* Fill in the random bits. */ XXXXXX[0] = letters[v % 62]; v /= 62; XXXXXX[1] = letters[v % 62]; v /= 62; XXXXXX[2] = letters[v % 62]; v /= 62; XXXXXX[3] = letters[v % 62]; v /= 62; XXXXXX[4] = letters[v % 62]; v /= 62; XXXXXX[5] = letters[v % 62]; /* Modified for windows and to unlink */ // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); int flags = _O_RDWR | _O_CREAT | _O_EXCL | _O_BINARY; flags |= _O_TEMPORARY; fd = _open (tmpl, flags, _S_IREAD | _S_IWRITE); if (fd >= 0) { errno = save_errno; return fd; } else if (errno != EEXIST) return -1; } /* We got out of the loop because we ran out of combinations to try. */ errno = EEXIST; return -1; } #else int mkstemp_and_unlink(char *tmpl) { int ret = mkstemp(tmpl); if (ret != -1) { UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting delete " << tmpl); } return ret; } #endif // If it's a directory, add a /. This lets users say -T /tmp without creating // /tmpAAAAAA void NormalizeTempPrefix(std::string &base) { if (base.empty()) return; if (base[base.size() - 1] == '/') return; struct stat sb; // It's fine for it to not exist. 
if (-1 == stat(base.c_str(), &sb)) return; if ( #if defined(_WIN32) || defined(_WIN64) sb.st_mode & _S_IFDIR #else S_ISDIR(sb.st_mode) #endif ) base += '/'; } int MakeTemp(const std::string &base) { std::string name(base); name += "XXXXXX"; name.push_back(0); int ret; UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), ErrnoException, "while making a temporary based on " << base); return ret; } std::FILE *FMakeTemp(const std::string &base) { util::scoped_fd file(MakeTemp(base)); return FDOpenOrThrow(file); } int DupOrThrow(int fd) { int ret = dup(fd); UTIL_THROW_IF_ARG(ret == -1, FDException, (fd), "in duplicating the file descriptor"); return ret; } namespace { // Try to name things but be willing to fail too. bool TryName(int fd, std::string &out) { #if defined(_WIN32) || defined(_WIN64) return false; #else std::string name("/proc/self/fd/"); std::ostringstream convert; convert << fd; name += convert.str(); struct stat sb; if (-1 == lstat(name.c_str(), &sb)) return false; out.resize(sb.st_size + 1); ssize_t ret = readlink(name.c_str(), &out[0], sb.st_size + 1); if (-1 == ret) return false; if (ret > sb.st_size) { // Increased in size?! return false; } out.resize(ret); // Don't use the non-file names. if (!out.empty() && out[0] != '/') return false; return true; #endif } } // namespace std::string NameFromFD(int fd) { std::string ret; if (TryName(fd, ret)) return ret; switch (fd) { case 0: return "stdin"; case 1: return "stdout"; case 2: return "stderr"; } ret = "fd "; std::ostringstream convert; convert << fd; ret += convert.str(); return ret; } } // namespace util ================================================ FILE: src/kenlm/util/file.hh ================================================ #ifndef UTIL_FILE__ #define UTIL_FILE__ #include "util/exception.hh" #include #include #include #include namespace util { class scoped_fd { public: scoped_fd() : fd_(-1) {} explicit scoped_fd(int fd) : fd_(fd) {} ~scoped_fd(); void reset(int to = -1) { scoped_fd other(fd_); fd_ = to; } int get() const { return fd_; } int operator*() const { return fd_; } int release() { int ret = fd_; fd_ = -1; return ret; } private: int fd_; scoped_fd(const scoped_fd &); scoped_fd &operator=(const scoped_fd &); }; class scoped_FILE { public: explicit scoped_FILE(std::FILE *file = NULL) : file_(file) {} ~scoped_FILE(); std::FILE *get() { return file_; } const std::FILE *get() const { return file_; } void reset(std::FILE *to = NULL) { scoped_FILE other(file_); file_ = to; } std::FILE *release() { std::FILE *ret = file_; file_ = NULL; return ret; } private: std::FILE *file_; }; /* Thrown for any operation where the fd is known. */ class FDException : public ErrnoException { public: explicit FDException(int fd) throw(); virtual ~FDException() throw(); // This may no longer be valid if the exception was thrown past open. int FD() const { return fd_; } // Guess from NameFromFD. const std::string &NameGuess() const { return name_guess_; } private: int fd_; std::string name_guess_; }; // End of file reached. class EndOfFileException : public Exception { public: EndOfFileException() throw(); ~EndOfFileException() throw(); }; // Open for read only. int OpenReadOrThrow(const char *name); // Create file if it doesn't exist, truncate if it does. Opened for write. int CreateOrThrow(const char *name); // Return value for SizeFile when it can't size properly. 
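// --------------------------------------------------------------------------
// Illustrative sketch, not part of KenLM: an anonymous temporary file built
// from the helpers above.  mkstemp_and_unlink has already unlinked the name,
// so the file vanishes when the descriptor closes.  The prefix is hypothetical.
#include "util/file.hh"
#include <string>

void ScratchSpaceExample() {
  std::string prefix("/tmp");
  util::NormalizeTempPrefix(prefix);              // appends '/' for directories
  util::scoped_fd scratch(util::MakeTemp(prefix));
  const char data[] = "spill data";
  util::WriteOrThrow(scratch.get(), data, sizeof(data) - 1);
  // scratch closes (and the unlinked file disappears) when it goes out of scope.
}
// --------------------------------------------------------------------------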
const uint64_t kBadSize = (uint64_t)-1; uint64_t SizeFile(int fd); uint64_t SizeOrThrow(int fd); void ResizeOrThrow(int fd, uint64_t to); std::size_t PartialRead(int fd, void *to, std::size_t size); void ReadOrThrow(int fd, void *to, std::size_t size); std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size); // Positioned: unix only for now. void PReadOrThrow(int fd, void *to, std::size_t size, uint64_t off); void WriteOrThrow(int fd, const void *data_void, std::size_t size); void WriteOrThrow(FILE *to, const void *data, std::size_t size); void FSyncOrThrow(int fd); // Seeking void SeekOrThrow(int fd, uint64_t off); void AdvanceOrThrow(int fd, int64_t off); void SeekEnd(int fd); std::FILE *FDOpenOrThrow(scoped_fd &file); std::FILE *FDOpenReadOrThrow(scoped_fd &file); // Temporary files // Append a / if base is a directory. void NormalizeTempPrefix(std::string &base); int MakeTemp(const std::string &prefix); std::FILE *FMakeTemp(const std::string &prefix); // dup an fd. int DupOrThrow(int fd); /* Attempt get file name from fd. This won't always work (i.e. on Windows or * a pipe). The file might have been renamed. It's intended for diagnostics * and logging only. */ std::string NameFromFD(int fd); } // namespace util #endif // UTIL_FILE__ ================================================ FILE: src/kenlm/util/file_piece.cc ================================================ #include "util/file_piece.hh" #include "util/double-conversion/double-conversion.h" #include "util/exception.hh" #include "util/file.hh" #include "util/mmap.hh" #if defined(_WIN32) || defined(_WIN64) #include #else #include #endif #include #include #include #include #include #include #include #include namespace util { ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a number"; } // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) { Initialize(name, show_progress, min_buffer); } namespace { std::string NamePossiblyFind(int fd, const char *name) { if (name) return name; return NameFromFD(fd); } } // namespace FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_, total_size_ == kBadSize ? 
NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) { Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer); } FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) : total_size_(kBadSize), page_(SizePage()) { InitializeNoRead("istream", min_buffer); fallback_to_read_ = true; data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); position_ = data_.begin(); position_end_ = position_; fell_back_.Reset(stream); } FilePiece::~FilePiece() {} StringPiece FilePiece::ReadLine(char delim) { std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (*i == delim) { StringPiece ret(position_, i - position_); position_ = i + 1; return ret; } } if (at_end_) { if (position_ == position_end_) Shift(); return Consume(position_end_); } skip = position_end_ - position_; Shift(); } } float FilePiece::ReadFloat() { return ReadNumber(); } double FilePiece::ReadDouble() { return ReadNumber(); } long int FilePiece::ReadLong() { return ReadNumber(); } unsigned long int FilePiece::ReadULong() { return ReadNumber(); } // Factored out so that istream can call this. void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) { file_name_ = name; default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); position_ = NULL; position_end_ = NULL; mapped_offset_ = 0; at_end_ = false; } void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { InitializeNoRead(name, min_buffer); if (total_size_ == kBadSize) { // So the assertion passes. fallback_to_read_ = false; if (show_progress) *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl; TransitionToRead(); } else { fallback_to_read_ = false; } Shift(); // gzip detect. if ((position_end_ >= position_ + ReadCompressed::kMagicSize) && ReadCompressed::DetectCompressedMagic(position_)) { if (!fallback_to_read_) { at_end_ = false; TransitionToRead(); } } } namespace { static const double_conversion::StringToDoubleConverter kConverter( double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK | double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES, std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN(), "inf", "NaN"); void ParseNumber(const char *begin, const char *&end, float &out) { int count; out = kConverter.StringToFloat(begin, end - begin, &count); end = begin + count; } void ParseNumber(const char *begin, const char *&end, double &out) { int count; out = kConverter.StringToDouble(begin, end - begin, &count); end = begin + count; } void ParseNumber(const char *begin, const char *&end, long int &out) { char *silly_end; out = strtol(begin, &silly_end, 10); end = silly_end; } void ParseNumber(const char *begin, const char *&end, unsigned long int &out) { char *silly_end; out = strtoul(begin, &silly_end, 10); end = silly_end; } } // namespace template T FilePiece::ReadNumber() { SkipSpaces(); while (last_space_ < position_) { if (at_end_) { // Hallucinate a null off the end of the file. 
std::string buffer(position_, position_end_); const char *buf = buffer.c_str(); const char *end = buf + buffer.size(); T ret; ParseNumber(buf, end, ret); if (buf == end) throw ParseNumberException(buffer); position_ += end - buf; return ret; } Shift(); } const char *end = last_space_; T ret; ParseNumber(position_, end, ret); if (end == position_) throw ParseNumberException(ReadDelimited()); position_ = end; return ret; } const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (delim[static_cast(*i)]) return i; } if (at_end_) { if (position_ == position_end_) Shift(); return position_end_; } skip = position_end_ - position_; Shift(); } } void FilePiece::Shift() { if (at_end_) { progress_.Finished(); throw EndOfFileException(); } uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; if (!fallback_to_read_) MMapShift(desired_begin); // Notice an mmap failure might set the fallback. if (fallback_to_read_) ReadShift(); for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) { if (kSpaces[static_cast(*last_space_)]) break; } } void FilePiece::MMapShift(uint64_t desired_begin) { // Use mmap. uint64_t ignore = desired_begin % page_; // Duplicate request for Shift means give more data. if (position_ == data_.begin() + ignore && position_) { default_map_size_ *= 2; } // Local version so that in case of failure it doesn't overwrite the class variable. uint64_t mapped_offset = desired_begin - ignore; uint64_t mapped_size; if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { at_end_ = true; mapped_size = total_size_ - mapped_offset; } else { mapped_size = default_map_size_; } // Forcibly clear the existing mmap first. data_.reset(); try { MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); } catch (const util::ErrnoException &e) { if (desired_begin) { SeekOrThrow(*file_, desired_begin); } // The mmap was scheduled to end the file, but now we're going to read it. at_end_ = false; TransitionToRead(); return; } mapped_offset_ = mapped_offset; position_ = data_.begin() + ignore; position_end_ = data_.begin() + mapped_size; progress_.Set(desired_begin); } void FilePiece::TransitionToRead() { assert(!fallback_to_read_); fallback_to_read_ = true; data_.reset(); data_.reset(MallocOrThrow(default_map_size_), default_map_size_, scoped_memory::MALLOC_ALLOCATED); position_ = data_.begin(); position_end_ = position_; try { fell_back_.Reset(file_.release()); } catch (util::Exception &e) { e << " in file " << file_name_; throw; } } void FilePiece::ReadShift() { assert(fallback_to_read_); // Bytes [data_.begin(), position_) have been consumed. // Bytes [position_, position_end_) have been read into the buffer. // Start at the beginning of the buffer if there's nothing useful in it. if (position_ == position_end_) { mapped_offset_ += (position_end_ - data_.begin()); position_ = data_.begin(); position_end_ = position_; } std::size_t already_read = position_end_ - data_.begin(); if (already_read == default_map_size_) { if (position_ == data_.begin()) { // Buffer too small. 
std::size_t valid_length = position_end_ - position_; default_map_size_ *= 2; data_.call_realloc(default_map_size_); UTIL_THROW_IF(!data_.get(), ErrnoException, "realloc failed for " << default_map_size_); position_ = data_.begin(); position_end_ = position_ + valid_length; } else { std::size_t moving = position_end_ - position_; memmove(data_.get(), position_, moving); position_ = data_.begin(); position_end_ = position_ + moving; already_read = moving; } } std::size_t read_return = fell_back_.Read(static_cast(data_.get()) + already_read, default_map_size_ - already_read); progress_.Set(fell_back_.RawAmount()); if (read_return == 0) { at_end_ = true; } position_end_ += read_return; } } // namespace util ================================================ FILE: src/kenlm/util/file_piece.hh ================================================ #ifndef UTIL_FILE_PIECE__ #define UTIL_FILE_PIECE__ #include "util/ersatz_progress.hh" #include "util/exception.hh" #include "util/file.hh" #include "util/mmap.hh" #include "util/read_compressed.hh" #include "util/string_piece.hh" #include #include #include #include namespace util { class ParseNumberException : public Exception { public: explicit ParseNumberException(StringPiece value) throw(); ~ParseNumberException() throw() {} }; extern const bool kSpaces[256]; // Memory backing the returned StringPiece may vanish on the next call. class FilePiece { public: // 1 MB default. explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); // Takes ownership of fd. name is used for messages. explicit FilePiece(int fd, const char *name = NULL, std::ostream *show_progress = NULL, std::size_t min_buffer = 1048576); /* Read from an istream. Don't use this if you can avoid it. Raw fd IO is * much faster. But sometimes you just have an istream like Boost's HTTP * server and want to parse it the same way. * name is just used for messages and FileName(). */ explicit FilePiece(std::istream &stream, const char *name = NULL, std::size_t min_buffer = 1048576); ~FilePiece(); char get() { if (position_ == position_end_) { Shift(); if (at_end_) throw EndOfFileException(); } return *(position_++); } // Leaves the delimiter, if any, to be returned by get(). Delimiters defined by isspace(). StringPiece ReadDelimited(const bool *delim = kSpaces) { SkipSpaces(delim); return Consume(FindDelimiterOrEOF(delim)); } // Unlike ReadDelimited, this includes leading spaces and consumes the delimiter. // It is similar to getline in that way. StringPiece ReadLine(char delim = '\n'); float ReadFloat(); double ReadDouble(); long int ReadLong(); unsigned long int ReadULong(); // Skip spaces defined by isspace. void SkipSpaces(const bool *delim = kSpaces) { for (; ; ++position_) { if (position_ == position_end_) Shift(); if (!delim[static_cast(*position_)]) return; } } uint64_t Offset() const { return position_ - data_.begin() + mapped_offset_; } const std::string &FileName() const { return file_name_; } private: void InitializeNoRead(const char *name, std::size_t min_buffer); // Calls InitializeNoRead, so don't call both. void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); template T ReadNumber(); StringPiece Consume(const char *to) { StringPiece ret(position_, to - position_); position_ = to; return ret; } const char *FindDelimiterOrEOF(const bool *delim = kSpaces); void Shift(); // Backends to Shift(). 
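// --------------------------------------------------------------------------
// Illustrative sketch, not part of KenLM: scanning a whitespace-delimited
// file of "word count" records with the FilePiece class declared above.  The
// file name and record layout are hypothetical; EndOfFileException marks the
// end of input, and each StringPiece is invalidated by the next read.
#include "util/file_piece.hh"
#include <iostream>

void CountTokens(const char *name) {
  util::FilePiece in(name, &std::cerr);           // progress bar on stderr
  double total = 0.0;
  try {
    while (true) {
      StringPiece word = in.ReadDelimited();      // valid only until next call
      double count = in.ReadDouble();
      (void)word;
      total += count;
    }
  } catch (const util::EndOfFileException &) {}
  std::cout << "total " << total << '\n';
}
// --------------------------------------------------------------------------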
void MMapShift(uint64_t desired_begin); void TransitionToRead(); void ReadShift(); const char *position_, *last_space_, *position_end_; scoped_fd file_; const uint64_t total_size_; const uint64_t page_; std::size_t default_map_size_; uint64_t mapped_offset_; // Order matters: file_ should always be destroyed after this. scoped_memory data_; bool at_end_; bool fallback_to_read_; ErsatzProgress progress_; std::string file_name_; ReadCompressed fell_back_; }; } // namespace util #endif // UTIL_FILE_PIECE__ ================================================ FILE: src/kenlm/util/file_piece_test.cc ================================================ // Tests might fail if you have creative characters in your path. Sue me. #include "util/file_piece.hh" #include "util/file.hh" #include "util/scoped.hh" #define BOOST_TEST_MODULE FilePieceTest #include #include #include #include #include #include namespace util { namespace { std::string FileLocation() { if (boost::unit_test::framework::master_test_suite().argc < 2) { return "file_piece.cc"; } std::string ret(boost::unit_test::framework::master_test_suite().argv[1]); return ret; } /* istream */ BOOST_AUTO_TEST_CASE(IStream) { std::fstream ref(FileLocation().c_str(), std::ios::in); std::fstream backing(FileLocation().c_str(), std::ios::in); FilePiece test(backing); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); BOOST_CHECK_EQUAL(ref_line, test_line); } BOOST_CHECK_THROW(test.get(), EndOfFileException); BOOST_CHECK_THROW(test.get(), EndOfFileException); } /* mmap implementation */ BOOST_AUTO_TEST_CASE(MMapReadLine) { std::fstream ref(FileLocation().c_str(), std::ios::in); FilePiece test(FileLocation().c_str(), NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 if (!test_line.empty() || !ref_line.empty()) { BOOST_CHECK_EQUAL(ref_line, test_line); } } BOOST_CHECK_THROW(test.get(), EndOfFileException); } #if !defined(_WIN32) && !defined(_WIN64) && !defined(__APPLE__) /* Apple isn't happy with the popen, fileno, dup. And I don't want to * reimplement popen. This is an issue with the test. 
*/ /* read() implementation */ BOOST_AUTO_TEST_CASE(StreamReadLine) { std::fstream ref(FileLocation().c_str(), std::ios::in); std::string popen_args = "cat \""; popen_args += FileLocation(); popen_args += '"'; FILE *catter = popen(popen_args.c_str(), "r"); BOOST_REQUIRE(catter); FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 if (!test_line.empty() || !ref_line.empty()) { BOOST_CHECK_EQUAL(ref_line, test_line); } } BOOST_CHECK_THROW(test.get(), EndOfFileException); BOOST_REQUIRE(!pclose(catter)); } #endif #ifdef HAVE_ZLIB // gzip file BOOST_AUTO_TEST_CASE(PlainZipReadLine) { std::string location(FileLocation()); std::fstream ref(location.c_str(), std::ios::in); std::string command("gzip <\""); command += location + "\" >\"" + location + "\".gz"; BOOST_REQUIRE_EQUAL(0, system(command.c_str())); FilePiece test((location + ".gz").c_str(), NULL, 1); unlink((location + ".gz").c_str()); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 if (!test_line.empty() || !ref_line.empty()) { BOOST_CHECK_EQUAL(ref_line, test_line); } } BOOST_CHECK_THROW(test.get(), EndOfFileException); } // gzip stream. Apple doesn't like popen, fileno, dup. This is an issue with // the test. #ifndef __APPLE__ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { std::fstream ref(FileLocation().c_str(), std::ios::in); std::string command("gzip <\""); command += FileLocation() + "\""; FILE * catter = popen(command.c_str(), "r"); BOOST_REQUIRE(catter); FilePiece test(dup(fileno(catter)), "file_piece.cc.gz", NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); // I submitted a bug report to ICU: http://bugs.icu-project.org/trac/ticket/7924 if (!test_line.empty() || !ref_line.empty()) { BOOST_CHECK_EQUAL(ref_line, test_line); } } BOOST_CHECK_THROW(test.get(), EndOfFileException); BOOST_REQUIRE(!pclose(catter)); } #endif // __APPLE__ #endif // HAVE_ZLIB } // namespace } // namespace util ================================================ FILE: src/kenlm/util/getopt.c ================================================ /* POSIX getopt for Windows AT&T Public License Code given out at the 1985 UNIFORUM conference in Dallas. 
*/

#ifndef __GNUC__

#include "getopt.hh"
#include <stdio.h>
#include <string.h>

#define NULL	0
#define EOF	(-1)
#define ERR(s, c)	if(opterr){\
	char errbuf[2];\
	errbuf[0] = c; errbuf[1] = '\n';\
	fputs(argv[0], stderr);\
	fputs(s, stderr);\
	fputc(c, stderr);}
	//(void) write(2, argv[0], (unsigned)strlen(argv[0]));\
	//(void) write(2, s, (unsigned)strlen(s));\
	//(void) write(2, errbuf, 2);}

int	opterr = 1;
int	optind = 1;
int	optopt;
char	*optarg;

int
getopt(argc, argv, opts)
int	argc;
char	**argv, *opts;
{
	static int sp = 1;
	register int c;
	register char *cp;

	if(sp == 1)
		if(optind >= argc ||
		   argv[optind][0] != '-' || argv[optind][1] == '\0')
			return(EOF);
		else if(strcmp(argv[optind], "--") == NULL) {
			optind++;
			return(EOF);
		}
	optopt = c = argv[optind][sp];
	if(c == ':' || (cp=strchr(opts, c)) == NULL) {
		ERR(": illegal option -- ", c);
		if(argv[optind][++sp] == '\0') {
			optind++;
			sp = 1;
		}
		return('?');
	}
	if(*++cp == ':') {
		if(argv[optind][sp+1] != '\0')
			optarg = &argv[optind++][sp+1];
		else if(++optind >= argc) {
			ERR(": option requires an argument -- ", c);
			sp = 1;
			return('?');
		} else
			optarg = argv[optind++];
		sp = 1;
	} else {
		if(argv[optind][++sp] == '\0') {
			sp = 1;
			optind++;
		}
		optarg = NULL;
	}
	return(c);
}

#endif  /* __GNUC__ */

================================================
FILE: src/kenlm/util/getopt.hh
================================================
/*
POSIX getopt for Windows

AT&T Public License

Code given out at the 1985 UNIFORUM conference in Dallas.
*/

#ifdef __GNUC__
#include <getopt.h>
#endif
#ifndef __GNUC__

#ifndef _WINGETOPT_H_
#define _WINGETOPT_H_

#ifdef __cplusplus
extern "C" {
#endif

extern int opterr;
extern int optind;
extern int optopt;
extern char *optarg;
extern int getopt(int argc, char **argv, char *opts);

#ifdef __cplusplus
}
#endif

#endif  /* _GETOPT_H_ */
#endif  /* __GNUC__ */

================================================
FILE: src/kenlm/util/have.hh
================================================
/* Optional packages. You might want to integrate this with your build system
 * e.g. config.h from ./configure.
 */
#ifndef UTIL_HAVE__
#define UTIL_HAVE__

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifndef HAVE_ICU
//#define HAVE_ICU
#endif

#endif // UTIL_HAVE__

================================================
FILE: src/kenlm/util/joint_sort.hh
================================================
#ifndef UTIL_JOINT_SORT__
#define UTIL_JOINT_SORT__

/* A terrifying amount of C++ to coax std::sort into sorting one range while
 * also permuting another range the same way.
 */
*/ #include "util/proxy_iterator.hh" #include #include #include namespace util { namespace detail { template class JointProxy; template class JointIter { public: JointIter() {} JointIter(const KeyIter &key_iter, const ValueIter &value_iter) : key_(key_iter), value_(value_iter) {} bool operator==(const JointIter &other) const { return key_ == other.key_; } bool operator<(const JointIter &other) const { return (key_ < other.key_); } std::ptrdiff_t operator-(const JointIter &other) const { return key_ - other.key_; } JointIter &operator+=(std::ptrdiff_t amount) { key_ += amount; value_ += amount; return *this; } void swap(const JointIter &other) { std::swap(key_, other.key_); std::swap(value_, other.value_); } private: friend class JointProxy; KeyIter key_; ValueIter value_; }; template class JointProxy { private: typedef JointIter InnerIterator; public: typedef struct { typename std::iterator_traits::value_type key; typename std::iterator_traits::value_type value; const typename std::iterator_traits::value_type &GetKey() const { return key; } } value_type; JointProxy(const KeyIter &key_iter, const ValueIter &value_iter) : inner_(key_iter, value_iter) {} JointProxy(const JointProxy &other) : inner_(other.inner_) {} operator value_type() const { value_type ret; ret.key = *inner_.key_; ret.value = *inner_.value_; return ret; } JointProxy &operator=(const JointProxy &other) { *inner_.key_ = *other.inner_.key_; *inner_.value_ = *other.inner_.value_; return *this; } JointProxy &operator=(const value_type &other) { *inner_.key_ = other.key; *inner_.value_ = other.value; return *this; } typename std::iterator_traits::reference GetKey() const { return *(inner_.key_); } void swap(JointProxy &other) { std::swap(*inner_.key_, *other.inner_.key_); std::swap(*inner_.value_, *other.inner_.value_); } private: friend class ProxyIterator >; InnerIterator &Inner() { return inner_; } const InnerIterator &Inner() const { return inner_; } InnerIterator inner_; }; template class LessWrapper : public std::binary_function { public: explicit LessWrapper(const Less &less) : less_(less) {} bool operator()(const Proxy &left, const Proxy &right) const { return less_(left.GetKey(), right.GetKey()); } bool operator()(const Proxy &left, const typename Proxy::value_type &right) const { return less_(left.GetKey(), right.GetKey()); } bool operator()(const typename Proxy::value_type &left, const Proxy &right) const { return less_(left.GetKey(), right.GetKey()); } bool operator()(const typename Proxy::value_type &left, const typename Proxy::value_type &right) const { return less_(left.GetKey(), right.GetKey()); } private: const Less less_; }; } // namespace detail template class PairedIterator : public ProxyIterator > { public: PairedIterator(const KeyIter &key, const ValueIter &value) : ProxyIterator >(detail::JointProxy(key, value)) {} }; template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin, const Less &less) { ProxyIterator > full_begin(detail::JointProxy(key_begin, value_begin)); detail::LessWrapper, Less> less_wrap(less); std::sort(full_begin, full_begin + (key_end - key_begin), less_wrap); } template void JointSort(const KeyIter &key_begin, const KeyIter &key_end, const ValueIter &value_begin) { JointSort(key_begin, key_end, value_begin, std::less::value_type>()); } } // namespace util namespace std { template void swap(util::detail::JointIter &left, util::detail::JointIter &right) { left.swap(right); } template void swap(util::detail::JointProxy &left, 
util::detail::JointProxy &right) { left.swap(right); } } // namespace std #endif // UTIL_JOINT_SORT__ ================================================ FILE: src/kenlm/util/joint_sort_test.cc ================================================ #include "util/joint_sort.hh" #define BOOST_TEST_MODULE JointSortTest #include namespace util { namespace { BOOST_AUTO_TEST_CASE(just_flip) { char keys[2]; int values[2]; keys[0] = 1; values[0] = 327; keys[1] = 0; values[1] = 87897; JointSort(keys + 0, keys + 2, values + 0); BOOST_CHECK_EQUAL(0, keys[0]); BOOST_CHECK_EQUAL(87897, values[0]); BOOST_CHECK_EQUAL(1, keys[1]); BOOST_CHECK_EQUAL(327, values[1]); } BOOST_AUTO_TEST_CASE(three) { char keys[3]; int values[3]; keys[0] = 1; values[0] = 327; keys[1] = 2; values[1] = 87897; keys[2] = 0; values[2] = 10; JointSort(keys + 0, keys + 3, values + 0); BOOST_CHECK_EQUAL(0, keys[0]); BOOST_CHECK_EQUAL(1, keys[1]); BOOST_CHECK_EQUAL(2, keys[2]); } BOOST_AUTO_TEST_CASE(char_int) { char keys[4]; int values[4]; keys[0] = 3; values[0] = 327; keys[1] = 1; values[1] = 87897; keys[2] = 2; values[2] = 10; keys[3] = 0; values[3] = 24347; JointSort(keys + 0, keys + 4, values + 0); BOOST_CHECK_EQUAL(0, keys[0]); BOOST_CHECK_EQUAL(24347, values[0]); BOOST_CHECK_EQUAL(1, keys[1]); BOOST_CHECK_EQUAL(87897, values[1]); BOOST_CHECK_EQUAL(2, keys[2]); BOOST_CHECK_EQUAL(10, values[2]); BOOST_CHECK_EQUAL(3, keys[3]); BOOST_CHECK_EQUAL(327, values[3]); } }} // namespace anonymous util ================================================ FILE: src/kenlm/util/mmap.cc ================================================ /* Memory mapping wrappers. * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at * NICT. */ #include "util/mmap.hh" #include "util/exception.hh" #include "util/file.hh" #include "util/scoped.hh" #include #include #include #include #include #include #if defined(_WIN32) || defined(_WIN64) #include #include #else #include #include #endif namespace util { long SizePage() { #if defined(_WIN32) || defined(_WIN64) SYSTEM_INFO si; GetSystemInfo(&si); return si.dwAllocationGranularity; #else return sysconf(_SC_PAGE_SIZE); #endif } void SyncOrThrow(void *start, size_t length) { #if defined(_WIN32) || defined(_WIN64) UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap"); #else UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap"); #endif } void UnmapOrThrow(void *start, size_t length) { #if defined(_WIN32) || defined(_WIN64) UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file"); #else UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed"); #endif } scoped_mmap::~scoped_mmap() { if (data_ != (void*)-1) { try { // Thanks Denis Filimonov for pointing out NFS likes msync first. 
SyncOrThrow(data_, size_); UnmapOrThrow(data_, size_); } catch (const util::ErrnoException &e) { std::cerr << e.what(); abort(); } } } void scoped_memory::reset(void *data, std::size_t size, Alloc source) { switch(source_) { case MMAP_ALLOCATED: scoped_mmap(data_, size_); break; case ARRAY_ALLOCATED: delete [] reinterpret_cast(data_); break; case MALLOC_ALLOCATED: free(data_); break; case NONE_ALLOCATED: break; } data_ = data; size_ = size; source_ = source; } void scoped_memory::call_realloc(std::size_t size) { assert(source_ == MALLOC_ALLOCATED || source_ == NONE_ALLOCATED); void *new_data = realloc(data_, size); if (!new_data) { reset(); } else { reset(new_data, size, MALLOC_ALLOCATED); } } void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) { #ifdef MAP_POPULATE // Linux specific if (prefault) { flags |= MAP_POPULATE; } #endif #if defined(_WIN32) || defined(_WIN64) int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY; int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ; uint64_t total_size = size + offset; HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast(total_size), NULL); UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed"); LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size); CloseHandle(hMapping); UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed"); #else int protect = for_write ? (PROT_READ | PROT_WRITE) : PROT_READ; void *ret; UTIL_THROW_IF((ret = mmap(NULL, size, protect, flags, fd, offset)) == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset); # ifdef MADV_HUGEPAGE /* We like huge pages but it's fine if we can't have them. Note that huge * pages are not supported for file-backed mmap on linux. */ madvise(ret, size, MADV_HUGEPAGE); # endif #endif return ret; } const int kFileFlags = #if defined(_WIN32) || defined(_WIN64) 0 // MapOrThrow ignores flags on windows #elif defined(MAP_FILE) MAP_FILE | MAP_SHARED #else MAP_SHARED #endif ; void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) { switch (method) { case LAZY: out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); break; case POPULATE_OR_LAZY: #ifdef MAP_POPULATE case POPULATE_OR_READ: #endif out.reset(MapOrThrow(size, false, kFileFlags, true, fd, offset), size, scoped_memory::MMAP_ALLOCATED); break; #ifndef MAP_POPULATE case POPULATE_OR_READ: #endif case READ: out.reset(MallocOrThrow(size), size, scoped_memory::MALLOC_ALLOCATED); SeekOrThrow(fd, offset); ReadOrThrow(fd, out.get(), size); break; } } // Allocates zeroed memory in to. 
void MapAnonymous(std::size_t size, util::scoped_memory &to) { to.reset(); #if defined(_WIN32) || defined(_WIN64) to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED); #else to.reset(MapOrThrow(size, true, # if defined(MAP_ANONYMOUS) MAP_ANONYMOUS | MAP_PRIVATE // Linux # else MAP_ANON | MAP_PRIVATE // BSD # endif , false, -1, 0), size, scoped_memory::MMAP_ALLOCATED); #endif } void *MapZeroedWrite(int fd, std::size_t size) { ResizeOrThrow(fd, 0); ResizeOrThrow(fd, size); return MapOrThrow(size, true, kFileFlags, false, fd, 0); } void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { file.reset(CreateOrThrow(name)); try { return MapZeroedWrite(file.get(), size); } catch (ErrnoException &e) { e << " in file " << name; throw; } } } // namespace util ================================================ FILE: src/kenlm/util/mmap.hh ================================================ #ifndef UTIL_MMAP__ #define UTIL_MMAP__ // Utilities for mmaped files. #include #include #include namespace util { class scoped_fd; long SizePage(); // (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. class scoped_mmap { public: scoped_mmap() : data_((void*)-1), size_(0) {} scoped_mmap(void *data, std::size_t size) : data_(data), size_(size) {} ~scoped_mmap(); void *get() const { return data_; } const uint8_t *begin() const { return reinterpret_cast(data_); } const uint8_t *end() const { return reinterpret_cast(data_) + size_; } std::size_t size() const { return size_; } void reset(void *data, std::size_t size) { scoped_mmap other(data_, size_); data_ = data; size_ = size; } void reset() { reset((void*)-1, 0); } private: void *data_; std::size_t size_; scoped_mmap(const scoped_mmap &); scoped_mmap &operator=(const scoped_mmap &); }; /* For when the memory might come from mmap, new char[], or malloc. Uses NULL * and 0 for blanks even though mmap signals errors with (void*)-1). The reset * function checks that blank for mmap. */ class scoped_memory { public: typedef enum {MMAP_ALLOCATED, ARRAY_ALLOCATED, MALLOC_ALLOCATED, NONE_ALLOCATED} Alloc; scoped_memory() : data_(NULL), size_(0), source_(NONE_ALLOCATED) {} ~scoped_memory() { reset(); } void *get() const { return data_; } const char *begin() const { return reinterpret_cast(data_); } const char *end() const { return reinterpret_cast(data_) + size_; } std::size_t size() const { return size_; } Alloc source() const { return source_; } void reset() { reset(NULL, 0, NONE_ALLOCATED); } void reset(void *data, std::size_t size, Alloc from); // realloc allows the current data to escape hence the need for this call // If realloc fails, destroys the original too and get() returns NULL. void call_realloc(std::size_t to); private: void *data_; std::size_t size_; Alloc source_; scoped_memory(const scoped_memory &); scoped_memory &operator=(const scoped_memory &); }; typedef enum { // mmap with no prepopulate LAZY, // On linux, pass MAP_POPULATE to mmap. POPULATE_OR_LAZY, // Populate on Linux. malloc and read on non-Linux. POPULATE_OR_READ, // malloc and read. READ } LoadMethod; extern const int kFileFlags; // Wrapper around mmap to check it worked and hide some platform macros. void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); void MapAnonymous(std::size_t size, scoped_memory &to); // Open file name with mmap of size bytes, all of which are initially zero. 
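// --------------------------------------------------------------------------
// Illustrative sketch, not part of KenLM: mapping a whole file with the
// helpers declared above.  scoped_memory hides whether the bytes came from
// mmap or from the malloc+read fallback (READ / POPULATE_OR_READ).
#include "util/file.hh"
#include "util/mmap.hh"

void MapWholeFile(const char *name) {
  util::scoped_fd fd(util::OpenReadOrThrow(name));
  uint64_t size = util::SizeOrThrow(fd.get());
  util::scoped_memory region;
  // POPULATE_OR_READ prefaults on Linux and reads the file elsewhere.
  util::MapRead(util::POPULATE_OR_READ, fd.get(), 0,
                static_cast<std::size_t>(size), region);
  // region.begin() .. region.end() now covers the file; unmapped/freed on exit.
}
// --------------------------------------------------------------------------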
void *MapZeroedWrite(int fd, std::size_t size); void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); // msync wrapper void SyncOrThrow(void *start, size_t length); } // namespace util #endif // UTIL_MMAP__ ================================================ FILE: src/kenlm/util/multi_intersection.hh ================================================ #ifndef UTIL_MULTI_INTERSECTION__ #define UTIL_MULTI_INTERSECTION__ #include #include #include #include #include namespace util { namespace detail { template struct RangeLessBySize : public std::binary_function { bool operator()(const Range &left, const Range &right) const { return left.size() < right.size(); } }; /* Takes sets specified by their iterators and a boost::optional containing * the lowest intersection if any. Each set must be sorted in increasing * order. sets is changed to truncate the beginning of each sequence to the * location of the match or an empty set. Precondition: sets is not empty * since the intersection over null is the universe and this function does not * know the universe. */ template boost::optional::value_type> FirstIntersectionSorted(std::vector > &sets, const Less &less = std::less::value_type>()) { typedef std::vector > Sets; typedef typename std::iterator_traits::value_type Value; assert(!sets.empty()); if (sets.front().empty()) return boost::optional(); // Possibly suboptimal to copy for general Value; makes unsigned int go slightly faster. Value highest(sets.front().front()); for (typename Sets::iterator i(sets.begin()); i != sets.end(); ) { i->advance_begin(std::lower_bound(i->begin(), i->end(), highest, less) - i->begin()); if (i->empty()) return boost::optional(); if (less(highest, i->front())) { highest = i->front(); // start over i = sets.begin(); } else { ++i; } } return boost::optional(highest); } } // namespace detail template boost::optional::value_type> FirstIntersection(std::vector > &sets, const Less less) { assert(!sets.empty()); std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); return detail::FirstIntersectionSorted(sets, less); } template boost::optional::value_type> FirstIntersection(std::vector > &sets) { return FirstIntersection(sets, std::less::value_type>()); } template void AllIntersection(std::vector > &sets, Output &out, const Less less) { typedef typename std::iterator_traits::value_type Value; assert(!sets.empty()); std::sort(sets.begin(), sets.end(), detail::RangeLessBySize >()); boost::optional ret; for (boost::optional ret; ret = detail::FirstIntersectionSorted(sets, less); sets.front().advance_begin(1)) { out(*ret); } } template void AllIntersection(std::vector > &sets, Output &out) { AllIntersection(sets, out, std::less::value_type>()); } } // namespace util #endif // UTIL_MULTI_INTERSECTION__ ================================================ FILE: src/kenlm/util/multi_intersection_test.cc ================================================ #include "util/multi_intersection.hh" #define BOOST_TEST_MODULE MultiIntersectionTest #include namespace util { namespace { BOOST_AUTO_TEST_CASE(Empty) { std::vector > sets; sets.push_back(boost::iterator_range(static_cast(NULL), static_cast(NULL))); BOOST_CHECK(!FirstIntersection(sets)); } BOOST_AUTO_TEST_CASE(Single) { std::vector nums; nums.push_back(1); nums.push_back(4); nums.push_back(100); std::vector::const_iterator> > sets; sets.push_back(nums); boost::optional ret(FirstIntersection(sets)); BOOST_REQUIRE(ret); BOOST_CHECK_EQUAL(static_cast(1), *ret); } template boost::iterator_range RangeFromArray(const T 
(&arr)[len]) { return boost::iterator_range(arr, arr + len); } BOOST_AUTO_TEST_CASE(MultiNone) { unsigned int nums0[] = {1, 3, 4, 22}; unsigned int nums1[] = {2, 5, 12}; unsigned int nums2[] = {4, 17}; std::vector > sets; sets.push_back(RangeFromArray(nums0)); sets.push_back(RangeFromArray(nums1)); sets.push_back(RangeFromArray(nums2)); BOOST_CHECK(!FirstIntersection(sets)); } BOOST_AUTO_TEST_CASE(MultiOne) { unsigned int nums0[] = {1, 3, 4, 17, 22}; unsigned int nums1[] = {2, 5, 12, 17}; unsigned int nums2[] = {4, 17}; std::vector > sets; sets.push_back(RangeFromArray(nums0)); sets.push_back(RangeFromArray(nums1)); sets.push_back(RangeFromArray(nums2)); boost::optional ret(FirstIntersection(sets)); BOOST_REQUIRE(ret); BOOST_CHECK_EQUAL(static_cast(17), *ret); } } // namespace } // namespace util ================================================ FILE: src/kenlm/util/murmur_hash.cc ================================================ /* Downloaded from http://sites.google.com/site/murmurhash/ which says "All * code is released to the public domain. For business purposes, Murmurhash is * under the MIT license." * This is modified from the original: * ULL tag on 0xc6a4a7935bd1e995 so this will compile on 32-bit. * length changed to unsigned int. * placed in namespace util * add MurmurHashNative * default option = 0 for seed * ARM port from NICT */ #include "util/murmur_hash.hh" #include namespace util { //----------------------------------------------------------------------------- // MurmurHash2, 64-bit versions, by Austin Appleby // The same caveats as 32-bit MurmurHash2 apply here - beware of alignment // and endian-ness issues if used across multiple platforms. // 64-bit hash for 64-bit platforms uint64_t MurmurHash64A ( const void * key, std::size_t len, uint64_t seed ) { const uint64_t m = 0xc6a4a7935bd1e995ULL; const int r = 47; uint64_t h = seed ^ (len * m); #if defined(__arm) || defined(__arm__) const size_t ksize = sizeof(uint64_t); const unsigned char * data = (const unsigned char *)key; const unsigned char * end = data + (std::size_t)(len/8) * ksize; #else const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len/8); #endif while(data != end) { #if defined(__arm) || defined(__arm__) uint64_t k; memcpy(&k, data, ksize); data += ksize; #else uint64_t k = *data++; #endif k *= m; k ^= k >> r; k *= m; h ^= k; h *= m; } const unsigned char * data2 = (const unsigned char*)data; switch(len & 7) { case 7: h ^= uint64_t(data2[6]) << 48; case 6: h ^= uint64_t(data2[5]) << 40; case 5: h ^= uint64_t(data2[4]) << 32; case 4: h ^= uint64_t(data2[3]) << 24; case 3: h ^= uint64_t(data2[2]) << 16; case 2: h ^= uint64_t(data2[1]) << 8; case 1: h ^= uint64_t(data2[0]); h *= m; }; h ^= h >> r; h *= m; h ^= h >> r; return h; } // 64-bit hash for 32-bit platforms uint64_t MurmurHash64B ( const void * key, std::size_t len, uint64_t seed ) { const unsigned int m = 0x5bd1e995; const int r = 24; unsigned int h1 = seed ^ len; unsigned int h2 = 0; #if defined(__arm) || defined(__arm__) size_t ksize = sizeof(unsigned int); const unsigned char * data = (const unsigned char *)key; #else const unsigned int * data = (const unsigned int *)key; #endif unsigned int k1, k2; while(len >= 8) { #if defined(__arm) || defined(__arm__) memcpy(&k1, data, ksize); data += ksize; memcpy(&k2, data, ksize); data += ksize; #else k1 = *data++; k2 = *data++; #endif k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; } if(len >= 4) { #if 
defined(__arm) || defined(__arm__) memcpy(&k1, data, ksize); data += ksize; #else k1 = *data++; #endif k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; } switch(len) { case 3: h2 ^= ((unsigned char*)data)[2] << 16; case 2: h2 ^= ((unsigned char*)data)[1] << 8; case 1: h2 ^= ((unsigned char*)data)[0]; h2 *= m; }; h1 ^= h2 >> 18; h1 *= m; h2 ^= h1 >> 22; h2 *= m; h1 ^= h2 >> 17; h1 *= m; h2 ^= h1 >> 19; h2 *= m; uint64_t h = h1; h = (h << 32) | h2; return h; } // Trick to test for 64-bit architecture at compile time. namespace { template inline uint64_t MurmurHashNativeBackend(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64A(key, len, seed); } template <> inline uint64_t MurmurHashNativeBackend<4>(const void * key, std::size_t len, uint64_t seed) { return MurmurHash64B(key, len, seed); } } // namespace uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed) { return MurmurHashNativeBackend(key, len, seed); } } // namespace util ================================================ FILE: src/kenlm/util/murmur_hash.hh ================================================ #ifndef UTIL_MURMUR_HASH__ #define UTIL_MURMUR_HASH__ #include #include namespace util { uint64_t MurmurHash64A(const void * key, std::size_t len, uint64_t seed = 0); uint64_t MurmurHash64B(const void * key, std::size_t len, uint64_t seed = 0); uint64_t MurmurHashNative(const void * key, std::size_t len, uint64_t seed = 0); } // namespace util #endif // UTIL_MURMUR_HASH__ ================================================ FILE: src/kenlm/util/pcqueue.hh ================================================ #ifndef UTIL_PCQUEUE__ #define UTIL_PCQUEUE__ #include #include #include #include #include namespace util { inline void WaitSemaphore (boost::interprocess::interprocess_semaphore &on) { while (1) { try { on.wait(); break; } catch (boost::interprocess::interprocess_exception &e) { if (e.get_native_error() != EINTR) throw; } } } /* Producer consumer queue safe for multiple producers and multiple consumers. * T must be default constructable and have operator=. * The value is copied twice for Consume(T &out) or three times for Consume(), * so larger objects should be passed via pointer. * Strong exception guarantee if operator= throws. Undefined if semaphores throw. */ template class PCQueue : boost::noncopyable { public: explicit PCQueue(size_t size) : empty_(size), used_(0), storage_(new T[size]), end_(storage_.get() + size), produce_at_(storage_.get()), consume_at_(storage_.get()) {} // Add a value to the queue. void Produce(const T &val) { WaitSemaphore(empty_); { boost::unique_lock produce_lock(produce_at_mutex_); try { *produce_at_ = val; } catch (...) { empty_.post(); throw; } if (++produce_at_ == end_) produce_at_ = storage_.get(); } used_.post(); } // Consume a value, assigning it to out. T& Consume(T &out) { WaitSemaphore(used_); { boost::unique_lock consume_lock(consume_at_mutex_); try { out = *consume_at_; } catch (...) { used_.post(); throw; } if (++consume_at_ == end_) consume_at_ = storage_.get(); } empty_.post(); return out; } // Convenience version of Consume that copies the value to return. // The other version is faster. T Consume() { T ret; Consume(ret); return ret; } private: // Number of empty spaces in storage_. boost::interprocess::interprocess_semaphore empty_; // Number of occupied spaces in storage_. boost::interprocess::interprocess_semaphore used_; boost::scoped_array storage_; T *const end_; // Index for next write in storage_. 
T *produce_at_; boost::mutex produce_at_mutex_; // Index for next read from storage_. T *consume_at_; boost::mutex consume_at_mutex_; }; } // namespace util #endif // UTIL_PCQUEUE__ ================================================ FILE: src/kenlm/util/pool.cc ================================================ #include "util/pool.hh" #include "util/scoped.hh" #include namespace util { Pool::Pool() { current_ = NULL; current_end_ = NULL; } Pool::~Pool() { FreeAll(); } void Pool::FreeAll() { for (std::vector::const_iterator i(free_list_.begin()); i != free_list_.end(); ++i) { free(*i); } free_list_.clear(); current_ = NULL; current_end_ = NULL; } void *Pool::More(std::size_t size) { std::size_t amount = std::max(static_cast(32) << free_list_.size(), size); uint8_t *ret = static_cast(MallocOrThrow(amount)); free_list_.push_back(ret); current_ = ret + size; current_end_ = ret + amount; return ret; } } // namespace util ================================================ FILE: src/kenlm/util/pool.hh ================================================ // Very simple pool. It can only allocate memory. And all of the memory it // allocates must be freed at the same time. #ifndef UTIL_POOL__ #define UTIL_POOL__ #include #include namespace util { class Pool { public: Pool(); ~Pool(); void *Allocate(std::size_t size) { void *ret = current_; current_ += size; if (current_ < current_end_) { return ret; } else { return More(size); } } void FreeAll(); private: void *More(std::size_t size); std::vector free_list_; uint8_t *current_, *current_end_; // no copying Pool(const Pool &); Pool &operator=(const Pool &); }; } // namespace util #endif // UTIL_POOL__ ================================================ FILE: src/kenlm/util/probing_hash_table.hh ================================================ #ifndef UTIL_PROBING_HASH_TABLE__ #define UTIL_PROBING_HASH_TABLE__ #include "util/exception.hh" #include #include #include #include #include #include namespace util { /* Thrown when table grows too large */ class ProbingSizeException : public Exception { public: ProbingSizeException() throw() {} ~ProbingSizeException() throw() {} }; // std::identity is an SGI extension :-( struct IdentityHash { template T operator()(T arg) const { return arg; } }; /* Non-standard hash table * Buckets must be set at the beginning and must be greater than maximum number * of elements, else it throws ProbingSizeException. * Memory management and initialization is externalized to make it easier to * serialize these to disk and load them quickly. * Uses linear probing to find value. * Only insert and lookup operations. */ template > class ProbingHashTable { public: typedef EntryT Entry; typedef typename Entry::Key Key; typedef const Entry *ConstIterator; typedef Entry *MutableIterator; typedef HashT Hash; typedef EqualT Equal; public: static uint64_t Size(uint64_t entries, float multiplier) { uint64_t buckets = std::max(entries + 1, static_cast(multiplier * static_cast(entries))); return buckets * sizeof(Entry); } // Must be assigned to later. 
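// (A default-constructed table owns no buckets; callers typically allocate
// Size()-many bytes externally, construct a table over that memory, and
// assign it over this placeholder.)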
ProbingHashTable() : entries_(0) #ifdef DEBUG , initialized_(false) #endif {} ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) : begin_(reinterpret_cast(start)), buckets_(allocated / sizeof(Entry)), end_(begin_ + buckets_), invalid_(invalid), hash_(hash_func), equal_(equal_func), entries_(0) #ifdef DEBUG , initialized_(true) #endif {} template MutableIterator Insert(const T &t) { #ifdef DEBUG assert(initialized_); #endif UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); return UncheckedInsert(t); } // Return true if the value was found (and not inserted). This is consistent with Find but the opposite if hash_map! template bool FindOrInsert(const T &t, MutableIterator &out) { #ifdef DEBUG assert(initialized_); #endif for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, t.GetKey())) { out = i; return true; } if (equal_(got, invalid_)) { UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); *i = t; out = i; return false; } if (++i == end_) i = begin_; } } void FinishedInserting() {} void LoadedBinary() {} // Don't change anything related to GetKey, template bool UnsafeMutableFind(const Key key, MutableIterator &out) { #ifdef DEBUG assert(initialized_); #endif for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, key)) { out = i; return true; } if (equal_(got, invalid_)) return false; if (++i == end_) i = begin_; } } template bool Find(const Key key, ConstIterator &out) const { #ifdef DEBUG assert(initialized_); #endif for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) { Key got(i->GetKey()); if (equal_(got, key)) { out = i; return true; } if (equal_(got, invalid_)) return false; if (++i == end_) i = begin_; } } void Clear() { Entry invalid; invalid.SetKey(invalid_); std::fill(begin_, end_, invalid); entries_ = 0; } // Return number of entries assuming no serialization went on. std::size_t SizeNoSerialization() const { return entries_; } // Return memory size expected by Double. std::size_t DoubleTo() const { return buckets_ * 2 * sizeof(Entry); } // Inform the table that it has double the amount of memory. // Pass clear_new = false if you are sure the new memory is initialized // properly (to invalid_) i.e. by mremap. void Double(void *new_base, bool clear_new = true) { begin_ = static_cast(new_base); MutableIterator old_end = begin_ + buckets_; buckets_ *= 2; end_ = begin_ + buckets_; if (clear_new) { Entry invalid; invalid.SetKey(invalid_); std::fill(old_end, end_, invalid); } std::vector rolled_over; // Move roll-over entries to a buffer because they might not roll over anymore. This should be small. for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) { rolled_over.push_back(*i); i->SetKey(invalid_); } /* Re-insert everything. Entries might go backwards to take over a * recently opened gap, stay, move to new territory, or wrap around. If * an entry wraps around, it might go to a pointer greater than i (which * can happen at the beginning) and it will be revisited to possibly fill * in a gap created later. */ Entry temp; for (MutableIterator i = begin_; i != old_end; ++i) { if (!equal_(i->GetKey(), invalid_)) { temp = *i; i->SetKey(invalid_); UncheckedInsert(temp); } } // Put the roll-over entries back in. 
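// (Reinsertion order does not matter: UncheckedInsert probes from each
// entry's ideal bucket, so the buffered entries land in whatever gaps remain.)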
for (typename std::vector::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) { UncheckedInsert(*i); } } // Mostly for tests, check consistency of every entry. void CheckConsistency() { MutableIterator last; for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {} UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full"); MutableIterator i; // Beginning can be wrap-arounds. for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) { MutableIterator ideal = Ideal(*i); UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_)); } MutableIterator pre_gap = i; for (; i != end_; ++i) { if (equal_(i->GetKey(), invalid_)) { pre_gap = i; continue; } MutableIterator ideal = Ideal(*i); UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_)); } } private: template MutableIterator Ideal(const T &t) { return begin_ + (hash_(t.GetKey()) % buckets_); } template MutableIterator UncheckedInsert(const T &t) { for (MutableIterator i(Ideal(t));;) { if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } if (++i == end_) { i = begin_; } } } MutableIterator begin_; std::size_t buckets_; MutableIterator end_; Key invalid_; Hash hash_; Equal equal_; std::size_t entries_; #ifdef DEBUG bool initialized_; #endif }; } // namespace util #endif // UTIL_PROBING_HASH_TABLE__ ================================================ FILE: src/kenlm/util/probing_hash_table_test.cc ================================================ #include "util/probing_hash_table.hh" #include "util/murmur_hash.hh" #include "util/scoped.hh" #define BOOST_TEST_MODULE ProbingHashTableTest #include #include #include #include #include #include #include namespace util { namespace { struct Entry { unsigned char key; typedef unsigned char Key; unsigned char GetKey() const { return key; } void SetKey(unsigned char to) { key = to; } uint64_t GetValue() const { return value; } uint64_t value; }; typedef ProbingHashTable > Table; BOOST_AUTO_TEST_CASE(simple) { size_t size = Table::Size(10, 1.2); boost::scoped_array mem(new char[size]); memset(mem.get(), 0, size); Table table(mem.get(), size); const Entry *i = NULL; BOOST_CHECK(!table.Find(2, i)); Entry to_ins; to_ins.key = 3; to_ins.value = 328920; table.Insert(to_ins); BOOST_REQUIRE(table.Find(3, i)); BOOST_CHECK_EQUAL(3, i->GetKey()); BOOST_CHECK_EQUAL(static_cast(328920), i->GetValue()); BOOST_CHECK(!table.Find(2, i)); } struct Entry64 { uint64_t key; typedef uint64_t Key; Entry64() {} explicit Entry64(uint64_t key_in) { key = key_in; } Key GetKey() const { return key; } void SetKey(uint64_t to) { key = to; } }; struct MurmurHashEntry64 { std::size_t operator()(uint64_t value) const { return util::MurmurHash64A(&value, 8); } }; typedef ProbingHashTable Table64; BOOST_AUTO_TEST_CASE(Double) { for (std::size_t initial = 19; initial < 30; ++initial) { size_t size = Table64::Size(initial, 1.2); scoped_malloc mem(MallocOrThrow(size)); Table64 table(mem.get(), size, std::numeric_limits::max()); table.Clear(); for (uint64_t i = 0; i < 19; ++i) { table.Insert(Entry64(i)); } table.CheckConsistency(); mem.call_realloc(table.DoubleTo()); table.Double(mem.get()); table.CheckConsistency(); for (uint64_t i = 20; i < 40 ; ++i) { table.Insert(Entry64(i)); } mem.call_realloc(table.DoubleTo()); table.Double(mem.get()); table.CheckConsistency(); } } } // namespace } // namespace util 
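Below is a minimal usage sketch for ProbingHashTable, modeled on the "simple" test case just shown. The DemoEntry type, the key/value numbers, and the 1.5 load multiplier are illustrative choices rather than anything mandated by the header, which only requires a nested Key typedef plus GetKey/SetKey accessors.

#include "util/probing_hash_table.hh"

#include <cstddef>
#include <iostream>
#include <vector>

namespace {
// Entry layout is caller-defined; the table never allocates or owns memory.
struct DemoEntry {
  typedef unsigned char Key;
  unsigned char key;
  int value;
  unsigned char GetKey() const { return key; }
  void SetKey(unsigned char to) { key = to; }
};
} // namespace

int main() {
  typedef util::ProbingHashTable<DemoEntry, util::IdentityHash> Table;
  // Size() reports the bytes needed for the requested entry count and load multiplier.
  const std::size_t bytes = Table::Size(10, 1.5);
  std::vector<char> backing(bytes, 0);   // zeroed, so Key() == 0 marks empty buckets
  Table table(&backing[0], bytes);

  DemoEntry entry;
  entry.key = 3;        // must differ from the invalid key (0 here)
  entry.value = 42;
  table.Insert(entry);

  const DemoEntry *found;
  if (table.Find(3, found)) {
    std::cout << "key 3 -> " << found->value << '\n';   // prints 42
  }
  return 0;
}

Because the table never allocates, the caller can place it over mmapped or serialized memory and later grow it in place with DoubleTo()/Double(), which is the point of externalizing memory management noted in the header comment.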
================================================ FILE: src/kenlm/util/proxy_iterator.hh ================================================ #ifndef UTIL_PROXY_ITERATOR__ #define UTIL_PROXY_ITERATOR__ #include #include /* This is a RandomAccessIterator that uses a proxy to access the underlying * data. Useful for packing data at bit offsets but still using STL * algorithms. * * Normally I would use boost::iterator_facade but some people are too lazy to * install boost and still want to use my language model. It's amazing how * many operators an iterator has. * * The Proxy needs to provide: * class InnerIterator; * InnerIterator &Inner(); * const InnerIterator &Inner() const; * * InnerIterator has to implement: * operator==(InnerIterator) * operator<(InnerIterator) * operator+=(std::ptrdiff_t) * operator-(InnerIterator) * and of course whatever Proxy needs to dereference it. * * It's also a good idea to specialize std::swap for Proxy. */ namespace util { template class ProxyIterator { private: // Self. typedef ProxyIterator S; typedef typename Proxy::InnerIterator InnerIterator; public: typedef std::random_access_iterator_tag iterator_category; typedef typename Proxy::value_type value_type; typedef std::ptrdiff_t difference_type; typedef Proxy reference; typedef Proxy * pointer; ProxyIterator() {} // For cast from non const to const. template ProxyIterator(const ProxyIterator &in) : p_(*in) {} explicit ProxyIterator(const Proxy &p) : p_(p) {} // p_'s operator= does value copying, but here we want iterator copying. S &operator=(const S &other) { I() = other.I(); return *this; } bool operator==(const S &other) const { return I() == other.I(); } bool operator!=(const S &other) const { return !(*this == other); } bool operator<(const S &other) const { return I() < other.I(); } bool operator>(const S &other) const { return other < *this; } bool operator<=(const S &other) const { return !(*this > other); } bool operator>=(const S &other) const { return !(*this < other); } S &operator++() { return *this += 1; } S operator++(int) { S ret(*this); ++*this; return ret; } S &operator+=(std::ptrdiff_t amount) { I() += amount; return *this; } S operator+(std::ptrdiff_t amount) const { S ret(*this); ret += amount; return ret; } S &operator--() { return *this -= 1; } S operator--(int) { S ret(*this); --*this; return ret; } S &operator-=(std::ptrdiff_t amount) { I() += (-amount); return *this; } S operator-(std::ptrdiff_t amount) const { S ret(*this); ret -= amount; return ret; } std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); } Proxy operator*() { return p_; } const Proxy operator*() const { return p_; } Proxy *operator->() { return &p_; } const Proxy *operator->() const { return &p_; } Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); } const InnerIterator &Inner() { return p_.Inner(); } private: InnerIterator &I() { return p_.Inner(); } const InnerIterator &I() const { return p_.Inner(); } Proxy p_; }; template ProxyIterator operator+(std::ptrdiff_t amount, const ProxyIterator &it) { return it + amount; } } // namespace util #endif // UTIL_PROXY_ITERATOR__ ================================================ FILE: src/kenlm/util/read_compressed.cc ================================================ #include "util/read_compressed.hh" #include "util/file.hh" #include "util/have.hh" #include "util/scoped.hh" #include #include #include #include #include #include #ifdef HAVE_ZLIB #include #endif #ifdef HAVE_BZLIB #include #endif #ifdef HAVE_XZLIB #include #endif namespace 
util { CompressedException::CompressedException() throw() {} CompressedException::~CompressedException() throw() {} GZException::GZException() throw() {} GZException::~GZException() throw() {} BZException::BZException() throw() {} BZException::~BZException() throw() {} XZException::XZException() throw() {} XZException::~XZException() throw() {} class ReadBase { public: virtual ~ReadBase() {} virtual std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) = 0; protected: static void ReplaceThis(ReadBase *with, ReadCompressed &thunk) { thunk.internal_.reset(with); } static uint64_t &ReadCount(ReadCompressed &thunk) { return thunk.raw_amount_; } }; namespace { // Completed file that other classes can thunk to. class Complete : public ReadBase { public: std::size_t Read(void *, std::size_t, ReadCompressed &) { return 0; } }; class Uncompressed : public ReadBase { public: explicit Uncompressed(int fd) : fd_(fd) {} std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { std::size_t got = PartialRead(fd_.get(), to, amount); ReadCount(thunk) += got; return got; } private: scoped_fd fd_; }; class UncompressedWithHeader : public ReadBase { public: UncompressedWithHeader(int fd, void *already_data, std::size_t already_size) : fd_(fd) { assert(already_size); buf_.reset(malloc(already_size)); if (!buf_.get()) throw std::bad_alloc(); memcpy(buf_.get(), already_data, already_size); remain_ = static_cast(buf_.get()); end_ = remain_ + already_size; } std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { assert(buf_.get()); std::size_t sending = std::min(amount, end_ - remain_); memcpy(to, remain_, sending); remain_ += sending; if (remain_ == end_) { ReplaceThis(new Uncompressed(fd_.release()), thunk); } return sending; } private: scoped_malloc buf_; uint8_t *remain_; uint8_t *end_; scoped_fd fd_; }; #ifdef HAVE_ZLIB class GZip : public ReadBase { private: static const std::size_t kInputBuffer = 16384; public: GZip(int fd, void *already_data, std::size_t already_size) : file_(fd), in_buffer_(malloc(kInputBuffer)) { if (!in_buffer_.get()) throw std::bad_alloc(); assert(already_size < kInputBuffer); if (already_size) { memcpy(in_buffer_.get(), already_data, already_size); stream_.next_in = static_cast(in_buffer_.get()); stream_.avail_in = already_size; stream_.avail_in += ReadOrEOF(file_.get(), static_cast(in_buffer_.get()) + already_size, kInputBuffer - already_size); } else { stream_.avail_in = 0; } stream_.zalloc = Z_NULL; stream_.zfree = Z_NULL; stream_.opaque = Z_NULL; stream_.msg = NULL; // 32 for zlib and gzip decoding with automatic header detection. // 15 for maximum window size. UTIL_THROW_IF(Z_OK != inflateInit2(&stream_, 32 + 15), GZException, "Failed to initialize zlib."); } ~GZip() { if (Z_OK != inflateEnd(&stream_)) { std::cerr << "zlib could not close properly." << std::endl; abort(); } } std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { if (amount == 0) return 0; stream_.next_out = static_cast(to); stream_.avail_out = std::min(std::numeric_limits::max(), amount); do { if (!stream_.avail_in) ReadInput(thunk); int result = inflate(&stream_, 0); switch (result) { case Z_OK: break; case Z_STREAM_END: { std::size_t ret = static_cast(stream_.next_out) - static_cast(to); ReplaceThis(new Complete(), thunk); return ret; } case Z_ERRNO: UTIL_THROW(ErrnoException, "zlib error"); default: UTIL_THROW(GZException, "zlib encountered " << (stream_.msg ? 
stream_.msg : "an error ") << " code " << result); } } while (stream_.next_out == to); return static_cast(stream_.next_out) - static_cast(to); } private: void ReadInput(ReadCompressed &thunk) { assert(!stream_.avail_in); stream_.next_in = static_cast(in_buffer_.get()); stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); ReadCount(thunk) += stream_.avail_in; } scoped_fd file_; scoped_malloc in_buffer_; z_stream stream_; }; #endif // HAVE_ZLIB #ifdef HAVE_BZLIB class BZip : public ReadBase { public: explicit BZip(int fd, void *already_data, std::size_t already_size) { scoped_fd hold(fd); closer_.reset(FDOpenReadOrThrow(hold)); int bzerror = BZ_OK; file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size); switch (bzerror) { case BZ_OK: return; case BZ_CONFIG_ERROR: UTIL_THROW(BZException, "Looks like bzip2 was miscompiled."); case BZ_PARAM_ERROR: UTIL_THROW(BZException, "Parameter error"); case BZ_IO_ERROR: UTIL_THROW(BZException, "IO error reading file"); case BZ_MEM_ERROR: throw std::bad_alloc(); } } ~BZip() { int bzerror = BZ_OK; BZ2_bzReadClose(&bzerror, file_); if (bzerror != BZ_OK) { std::cerr << "bz2 readclose error" << std::endl; abort(); } } std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { int bzerror = BZ_OK; int ret = BZ2_bzRead(&bzerror, file_, to, std::min(static_cast(INT_MAX), amount)); long pos; switch (bzerror) { case BZ_STREAM_END: pos = ftell(closer_.get()); if (pos != -1) ReadCount(thunk) = pos; ReplaceThis(new Complete(), thunk); return ret; case BZ_OK: pos = ftell(closer_.get()); if (pos != -1) ReadCount(thunk) = pos; return ret; default: UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror); } } private: scoped_FILE closer_; BZFILE *file_; }; #endif // HAVE_BZLIB #ifdef HAVE_XZLIB class XZip : public ReadBase { private: static const std::size_t kInputBuffer = 16384; public: XZip(int fd, void *already_data, std::size_t already_size) : file_(fd), in_buffer_(malloc(kInputBuffer)), stream_(), action_(LZMA_RUN) { if (!in_buffer_.get()) throw std::bad_alloc(); assert(already_size < kInputBuffer); if (already_size) { memcpy(in_buffer_.get(), already_data, already_size); stream_.next_in = static_cast(in_buffer_.get()); stream_.avail_in = already_size; stream_.avail_in += ReadOrEOF(file_.get(), static_cast(in_buffer_.get()) + already_size, kInputBuffer - already_size); } else { stream_.avail_in = 0; } stream_.allocator = NULL; lzma_ret ret = lzma_stream_decoder(&stream_, UINT64_MAX, LZMA_CONCATENATED); switch (ret) { case LZMA_OK: break; case LZMA_MEM_ERROR: UTIL_THROW(ErrnoException, "xz open error"); default: UTIL_THROW(XZException, "xz error code " << ret); } } ~XZip() { lzma_end(&stream_); } std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { if (amount == 0) return 0; stream_.next_out = static_cast(to); stream_.avail_out = amount; do { if (!stream_.avail_in) ReadInput(thunk); lzma_ret status = lzma_code(&stream_, action_); switch (status) { case LZMA_OK: break; case LZMA_STREAM_END: UTIL_THROW_IF(action_ != LZMA_FINISH, XZException, "Input not finished yet."); { std::size_t ret = static_cast(stream_.next_out) - static_cast(to); ReplaceThis(new Complete(), thunk); return ret; } case LZMA_MEM_ERROR: throw std::bad_alloc(); case LZMA_FORMAT_ERROR: UTIL_THROW(XZException, "xzlib says file format not recognized"); case LZMA_OPTIONS_ERROR: UTIL_THROW(XZException, "xzlib says unsupported compression options"); case LZMA_DATA_ERROR: 
UTIL_THROW(XZException, "xzlib says this file is corrupt"); case LZMA_BUF_ERROR: UTIL_THROW(XZException, "xzlib says unexpected end of input"); default: UTIL_THROW(XZException, "unrecognized xzlib error " << status); } } while (stream_.next_out == to); return static_cast(stream_.next_out) - static_cast(to); } private: void ReadInput(ReadCompressed &thunk) { assert(!stream_.avail_in); stream_.next_in = static_cast(in_buffer_.get()); stream_.avail_in = ReadOrEOF(file_.get(), in_buffer_.get(), kInputBuffer); if (!stream_.avail_in) action_ = LZMA_FINISH; ReadCount(thunk) += stream_.avail_in; } scoped_fd file_; scoped_malloc in_buffer_; lzma_stream stream_; lzma_action action_; }; #endif // HAVE_XZLIB class IStreamReader : public ReadBase { public: explicit IStreamReader(std::istream &stream) : stream_(stream) {} std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) { if (!stream_.read(static_cast(to), amount)) { UTIL_THROW_IF(!stream_.eof(), ErrnoException, "istream error"); amount = stream_.gcount(); } ReadCount(thunk) += amount; return amount; } private: std::istream &stream_; }; enum MagicResult { UNKNOWN, GZIP, BZIP, XZIP }; MagicResult DetectMagic(const void *from_void) { const uint8_t *header = static_cast(from_void); if (header[0] == 0x1f && header[1] == 0x8b) { return GZIP; } if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') { return BZIP; } const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; if (!memcmp(header, xzmagic, 6)) { return XZIP; } return UNKNOWN; } ReadBase *ReadFactory(int fd, uint64_t &raw_amount) { scoped_fd hold(fd); unsigned char header[ReadCompressed::kMagicSize]; raw_amount = ReadOrEOF(fd, header, ReadCompressed::kMagicSize); if (!raw_amount) return new Uncompressed(hold.release()); if (raw_amount != ReadCompressed::kMagicSize) return new UncompressedWithHeader(hold.release(), header, raw_amount); switch (DetectMagic(header)) { case GZIP: #ifdef HAVE_ZLIB return new GZip(hold.release(), header, ReadCompressed::kMagicSize); #else UTIL_THROW(CompressedException, "This looks like a gzip file but gzip support was not compiled in."); #endif case BZIP: #ifdef HAVE_BZLIB return new BZip(hold.release(), header, ReadCompressed::kMagicSize); #else UTIL_THROW(CompressedException, "This looks like a bzip file (it begins with BZ), but bzip support was not compiled in."); #endif case XZIP: #ifdef HAVE_XZLIB return new XZip(hold.release(), header, ReadCompressed::kMagicSize); #else UTIL_THROW(CompressedException, "This looks like an xz file, but xz support was not compiled in."); #endif case UNKNOWN: break; } try { SeekOrThrow(fd, 0); } catch (const util::ErrnoException &e) { return new UncompressedWithHeader(hold.release(), header, ReadCompressed::kMagicSize); } return new Uncompressed(hold.release()); } } // namespace bool ReadCompressed::DetectCompressedMagic(const void *from_void) { return DetectMagic(from_void) != UNKNOWN; } ReadCompressed::ReadCompressed(int fd) { Reset(fd); } ReadCompressed::ReadCompressed(std::istream &in) { Reset(in); } ReadCompressed::ReadCompressed() {} ReadCompressed::~ReadCompressed() {} void ReadCompressed::Reset(int fd) { internal_.reset(); internal_.reset(ReadFactory(fd, raw_amount_)); } void ReadCompressed::Reset(std::istream &in) { internal_.reset(); internal_.reset(new IStreamReader(in)); } std::size_t ReadCompressed::Read(void *to, std::size_t amount) { return internal_->Read(to, amount, *this); } } // namespace util ================================================ FILE: src/kenlm/util/read_compressed.hh 
================================================ #ifndef UTIL_READ_COMPRESSED__ #define UTIL_READ_COMPRESSED__ #include "util/exception.hh" #include "util/scoped.hh" #include #include namespace util { class CompressedException : public Exception { public: CompressedException() throw(); virtual ~CompressedException() throw(); }; class GZException : public CompressedException { public: GZException() throw(); ~GZException() throw(); }; class BZException : public CompressedException { public: BZException() throw(); ~BZException() throw(); }; class XZException : public CompressedException { public: XZException() throw(); ~XZException() throw(); }; class ReadBase; class ReadCompressed { public: static const std::size_t kMagicSize = 6; // Must have at least kMagicSize bytes. static bool DetectCompressedMagic(const void *from); // Takes ownership of fd. explicit ReadCompressed(int fd); // Try to avoid using this. Use the fd instead. // There is no decompression support for istreams. explicit ReadCompressed(std::istream &in); // Must call Reset later. ReadCompressed(); ~ReadCompressed(); // Takes ownership of fd. void Reset(int fd); // Same advice as the constructor. void Reset(std::istream &in); std::size_t Read(void *to, std::size_t amount); uint64_t RawAmount() const { return raw_amount_; } private: friend class ReadBase; scoped_ptr internal_; uint64_t raw_amount_; // No copying. ReadCompressed(const ReadCompressed &); void operator=(const ReadCompressed &); }; } // namespace util #endif // UTIL_READ_COMPRESSED__ ================================================ FILE: src/kenlm/util/read_compressed_test.cc ================================================ #include "util/read_compressed.hh" #include "util/file.hh" #include "util/have.hh" #define BOOST_TEST_MODULE ReadCompressedTest #include #include #include #include #include namespace util { namespace { void ReadLoop(ReadCompressed &reader, void *to_void, std::size_t amount) { uint8_t *to = static_cast(to_void); while (amount) { std::size_t ret = reader.Read(to, amount); BOOST_REQUIRE(ret); to += ret; amount -= ret; } } const uint32_t kSize4 = 100000 / 4; std::string WriteRandom() { char name[] = "tempXXXXXX"; scoped_fd original(mkstemp(name)); BOOST_REQUIRE(original.get() > 0); for (uint32_t i = 0; i < kSize4; ++i) { WriteOrThrow(original.get(), &i, sizeof(uint32_t)); } return name; } void VerifyRead(ReadCompressed &reader) { for (uint32_t i = 0; i < kSize4; ++i) { uint32_t got; ReadLoop(reader, &got, sizeof(uint32_t)); BOOST_CHECK_EQUAL(i, got); } char ignored; BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); // Test double EOF call. 
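// (For the compressed readers this works because the Complete handler has been
// swapped in at stream end; for plain file descriptors the underlying read
// simply keeps returning 0.)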
BOOST_CHECK_EQUAL((std::size_t)0, reader.Read(&ignored, 1)); } void TestRandom(const char *compressor) { std::string name(WriteRandom()); char gzname[] = "tempXXXXXX"; scoped_fd gzipped(mkstemp(gzname)); std::string command(compressor); #ifdef __CYGWIN__ command += ".exe"; #endif command += " <\""; command += name; command += "\" >\""; command += gzname; command += "\""; BOOST_REQUIRE_EQUAL(0, system(command.c_str())); BOOST_CHECK_EQUAL(0, unlink(name.c_str())); BOOST_CHECK_EQUAL(0, unlink(gzname)); ReadCompressed reader(gzipped.release()); VerifyRead(reader); } BOOST_AUTO_TEST_CASE(Uncompressed) { TestRandom("cat"); } #ifdef HAVE_ZLIB BOOST_AUTO_TEST_CASE(ReadGZ) { TestRandom("gzip"); } #endif // HAVE_ZLIB #ifdef HAVE_BZLIB BOOST_AUTO_TEST_CASE(ReadBZ) { TestRandom("bzip2"); } #endif // HAVE_BZLIB #ifdef HAVE_XZLIB BOOST_AUTO_TEST_CASE(ReadXZ) { TestRandom("xz"); } #endif BOOST_AUTO_TEST_CASE(IStream) { std::string name(WriteRandom()); std::fstream stream(name.c_str(), std::ios::in); BOOST_CHECK_EQUAL(0, unlink(name.c_str())); ReadCompressed reader; reader.Reset(stream); VerifyRead(reader); } } // namespace } // namespace util ================================================ FILE: src/kenlm/util/scoped.cc ================================================ #include "util/scoped.hh" #include #if !defined(_WIN32) && !defined(_WIN64) #include #endif namespace util { MallocException::MallocException(std::size_t requested) throw() { *this << "for " << requested << " bytes "; } MallocException::~MallocException() throw() {} namespace { void *InspectAddr(void *addr, std::size_t requested, const char *func_name) { UTIL_THROW_IF_ARG(!addr && requested, MallocException, (requested), "in " << func_name); // These routines are often used for large chunks of memory where huge pages help. #if MADV_HUGEPAGE madvise(addr, requested, MADV_HUGEPAGE); #endif return addr; } } // namespace void *MallocOrThrow(std::size_t requested) { return InspectAddr(std::malloc(requested), requested, "malloc"); } void *CallocOrThrow(std::size_t requested) { return InspectAddr(std::calloc(1, requested), requested, "calloc"); } scoped_malloc::~scoped_malloc() { std::free(p_); } void scoped_malloc::call_realloc(std::size_t requested) { p_ = InspectAddr(std::realloc(p_, requested), requested, "realloc"); } } // namespace util ================================================ FILE: src/kenlm/util/scoped.hh ================================================ #ifndef UTIL_SCOPED__ #define UTIL_SCOPED__ /* Other scoped objects in the style of scoped_ptr. */ #include "util/exception.hh" #include namespace util { class MallocException : public ErrnoException { public: explicit MallocException(std::size_t requested) throw(); ~MallocException() throw(); }; void *MallocOrThrow(std::size_t requested); void *CallocOrThrow(std::size_t requested); class scoped_malloc { public: scoped_malloc() : p_(NULL) {} scoped_malloc(void *p) : p_(p) {} ~scoped_malloc(); void reset(void *p = NULL) { scoped_malloc other(p_); p_ = p; } void call_realloc(std::size_t to); void *get() { return p_; } const void *get() const { return p_; } private: void *p_; scoped_malloc(const scoped_malloc &); scoped_malloc &operator=(const scoped_malloc &); }; // Hat tip to boost. 
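// The two templates below mirror boost::scoped_array and boost::scoped_ptr:
// single-owner, non-copyable RAII holders whose destructors call delete[] and
// delete respectively.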
template class scoped_array { public: explicit scoped_array(T *content = NULL) : c_(content) {} ~scoped_array() { delete [] c_; } T *get() { return c_; } const T* get() const { return c_; } T &operator*() { return *c_; } const T&operator*() const { return *c_; } T &operator[](std::size_t idx) { return c_[idx]; } const T &operator[](std::size_t idx) const { return c_[idx]; } void reset(T *to = NULL) { scoped_array other(c_); c_ = to; } private: T *c_; scoped_array(const scoped_array &); void operator=(const scoped_array &); }; template class scoped_ptr { public: explicit scoped_ptr(T *content = NULL) : c_(content) {} ~scoped_ptr() { delete c_; } T *get() { return c_; } const T* get() const { return c_; } T &operator*() { return *c_; } const T&operator*() const { return *c_; } T *operator->() { return c_; } const T*operator->() const { return c_; } T &operator[](std::size_t idx) { return c_[idx]; } const T &operator[](std::size_t idx) const { return c_[idx]; } void reset(T *to = NULL) { scoped_ptr other(c_); c_ = to; } private: T *c_; scoped_ptr(const scoped_ptr &); void operator=(const scoped_ptr &); }; } // namespace util #endif // UTIL_SCOPED__ ================================================ FILE: src/kenlm/util/sized_iterator.hh ================================================ #ifndef UTIL_SIZED_ITERATOR__ #define UTIL_SIZED_ITERATOR__ #include "util/proxy_iterator.hh" #include #include #include #include namespace util { class SizedInnerIterator { public: SizedInnerIterator() {} SizedInnerIterator(void *ptr, std::size_t size) : ptr_(static_cast(ptr)), size_(size) {} bool operator==(const SizedInnerIterator &other) const { return ptr_ == other.ptr_; } bool operator<(const SizedInnerIterator &other) const { return ptr_ < other.ptr_; } SizedInnerIterator &operator+=(std::ptrdiff_t amount) { ptr_ += amount * size_; return *this; } std::ptrdiff_t operator-(const SizedInnerIterator &other) const { return (ptr_ - other.ptr_) / size_; } const void *Data() const { return ptr_; } void *Data() { return ptr_; } std::size_t EntrySize() const { return size_; } private: uint8_t *ptr_; std::size_t size_; }; class SizedProxy { public: SizedProxy() {} SizedProxy(void *ptr, std::size_t size) : inner_(ptr, size) {} operator std::string() const { return std::string(reinterpret_cast(inner_.Data()), inner_.EntrySize()); } SizedProxy &operator=(const SizedProxy &from) { memcpy(inner_.Data(), from.inner_.Data(), inner_.EntrySize()); return *this; } SizedProxy &operator=(const std::string &from) { memcpy(inner_.Data(), from.data(), inner_.EntrySize()); return *this; } const void *Data() const { return inner_.Data(); } void *Data() { return inner_.Data(); } private: friend class util::ProxyIterator; typedef std::string value_type; typedef SizedInnerIterator InnerIterator; InnerIterator &Inner() { return inner_; } const InnerIterator &Inner() const { return inner_; } InnerIterator inner_; }; typedef ProxyIterator SizedIterator; inline SizedIterator SizedIt(void *ptr, std::size_t size) { return SizedIterator(SizedProxy(ptr, size)); } // Useful wrapper for a comparison function i.e. sort. 
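// (SizedCompare forwards to a delegate that compares the records via their raw
// data pointers, and accepts both SizedProxy and std::string arguments so it
// can be handed directly to std::sort over SizedIterator ranges.)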
template class SizedCompare : public std::binary_function { public: explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {} bool operator()(const Proxy &first, const Proxy &second) const { return delegate_(first.Data(), second.Data()); } bool operator()(const Proxy &first, const std::string &second) const { return delegate_(first.Data(), second.data()); } bool operator()(const std::string &first, const Proxy &second) const { return delegate_(first.data(), second.Data()); } bool operator()(const std::string &first, const std::string &second) const { return delegate_(first.data(), second.data()); } const Delegate &GetDelegate() const { return delegate_; } private: const Delegate delegate_; }; } // namespace util #endif // UTIL_SIZED_ITERATOR__ ================================================ FILE: src/kenlm/util/sorted_uniform.hh ================================================ #ifndef UTIL_SORTED_UNIFORM__ #define UTIL_SORTED_UNIFORM__ #include #include #include #include namespace util { template class IdentityAccessor { public: typedef T Key; T operator()(const T *in) const { return *in; } }; struct Pivot64 { static inline std::size_t Calc(uint64_t off, uint64_t range, std::size_t width) { std::size_t ret = static_cast(static_cast(off) / static_cast(range) * static_cast(width)); // Cap for floating point rounding return (ret < width) ? ret : width - 1; } }; // Use when off * width is <2^64. This is guaranteed when each of them is actually a 32-bit value. struct Pivot32 { static inline std::size_t Calc(uint64_t off, uint64_t range, uint64_t width) { return static_cast((off * width) / (range + 1)); } }; // Usage: PivotSelect::T template struct PivotSelect; template <> struct PivotSelect<8> { typedef Pivot64 T; }; template <> struct PivotSelect<4> { typedef Pivot32 T; }; template <> struct PivotSelect<2> { typedef Pivot32 T; }; /* Binary search. */ template bool BinaryFind( const Accessor &accessor, Iterator begin, Iterator end, const typename Accessor::Key key, Iterator &out) { while (end > begin) { Iterator pivot(begin + (end - begin) / 2); typename Accessor::Key mid(accessor(pivot)); if (mid < key) { begin = pivot + 1; } else if (mid > key) { end = pivot; } else { out = pivot; return true; } } return false; } // Search the range [before_it + 1, after_it - 1] for key. // Preconditions: // before_v <= key <= after_v // before_v <= all values in the range [before_it + 1, after_it - 1] <= after_v // range is sorted. template bool BoundedSortedUniformFind( const Accessor &accessor, Iterator before_it, typename Accessor::Key before_v, Iterator after_it, typename Accessor::Key after_v, const typename Accessor::Key key, Iterator &out) { while (after_it - before_it > 1) { Iterator pivot(before_it + (1 + Pivot::Calc(key - before_v, after_v - before_v, after_it - before_it - 1))); typename Accessor::Key mid(accessor(pivot)); if (mid < key) { before_it = pivot; before_v = mid; } else if (mid > key) { after_it = pivot; after_v = mid; } else { out = pivot; return true; } } return false; } template bool SortedUniformFind(const Accessor &accessor, Iterator begin, Iterator end, const typename Accessor::Key key, Iterator &out) { if (begin == end) return false; typename Accessor::Key below(accessor(begin)); if (key <= below) { if (key == below) { out = begin; return true; } return false; } // Make the range [begin, end]. 
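// (Switching to inclusive bounds lets the last element's key serve as the
// upper anchor for the interpolation pivot in BoundedSortedUniformFind.)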
--end; typename Accessor::Key above(accessor(end)); if (key >= above) { if (key == above) { out = end; return true; } return false; } return BoundedSortedUniformFind(accessor, begin, below, end, above, key, out); } // May return begin - 1. template Iterator BinaryBelow( const Accessor &accessor, Iterator begin, Iterator end, const typename Accessor::Key key) { while (end > begin) { Iterator pivot(begin + (end - begin) / 2); typename Accessor::Key mid(accessor(pivot)); if (mid < key) { begin = pivot + 1; } else if (mid > key) { end = pivot; } else { for (++pivot; (pivot < end) && accessor(pivot) == mid; ++pivot) {} return pivot - 1; } } return begin - 1; } } // namespace util #endif // UTIL_SORTED_UNIFORM__ ================================================ FILE: src/kenlm/util/sorted_uniform_test.cc ================================================ #include "util/sorted_uniform.hh" #include #include #include #include #include #define BOOST_TEST_MODULE SortedUniformTest #include #include #include #include namespace util { namespace { template struct Entry { typedef KeyT Key; typedef ValueT Value; Key key; Value value; Key GetKey() const { return key; } Value GetValue() const { return value; } bool operator<(const Entry &other) const { return key < other.key; } }; template struct Accessor { typedef KeyT Key; template Key operator()(const Entry *entry) const { return entry->GetKey(); } }; template void Check(const Entry *begin, const Entry *end, const boost::unordered_map &reference, const Key key) { typename boost::unordered_map::const_iterator ref = reference.find(key); typedef const Entry *It; // g++ can't tell that require will crash and burn. It i = NULL; bool ret = SortedUniformFind, Pivot64>(Accessor(), begin, end, key, i); if (ref == reference.end()) { BOOST_CHECK(!ret); } else { BOOST_REQUIRE(ret); BOOST_CHECK_EQUAL(ref->second, i->GetValue()); } } BOOST_AUTO_TEST_CASE(empty) { typedef const Entry T; const T *i; bool ret = SortedUniformFind, Pivot64>(Accessor(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i); BOOST_CHECK(!ret); } template void RandomTest(Key upper, size_t entries, size_t queries) { typedef unsigned char Value; boost::mt19937 rng; boost::uniform_int range_key(0, upper); boost::uniform_int range_value(0, 255); boost::variate_generator > gen_key(rng, range_key); boost::variate_generator > gen_value(rng, range_value); typedef Entry Ent; std::vector backing; boost::unordered_map reference; Ent ent; for (size_t i = 0; i < entries; ++i) { Key key = gen_key(); unsigned char value = gen_value(); if (reference.insert(std::make_pair(key, value)).second) { ent.key = key; ent.value = value; backing.push_back(ent); } } std::sort(backing.begin(), backing.end()); // Random queries. 
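// (Each random key is looked up with SortedUniformFind and checked against the
// unordered_map reference built above, so both hits and misses are exercised.)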
for (size_t i = 0; i < queries; ++i) { const Key key = gen_key(); Check(&*backing.begin(), &*backing.end(), reference, key); } typename boost::unordered_map::const_iterator it = reference.begin(); for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { Check(&*backing.begin(), &*backing.end(), reference, it->second); } } BOOST_AUTO_TEST_CASE(basic) { RandomTest(11, 10, 200); } BOOST_AUTO_TEST_CASE(tiny_dense_random) { RandomTest(11, 50, 200); } BOOST_AUTO_TEST_CASE(small_dense_random) { RandomTest(100, 100, 200); } BOOST_AUTO_TEST_CASE(small_sparse_random) { RandomTest(200, 15, 200); } BOOST_AUTO_TEST_CASE(medium_sparse_random) { RandomTest(32000, 1000, 2000); } BOOST_AUTO_TEST_CASE(sparse_random) { RandomTest(std::numeric_limits::max(), 100000, 2000); } } // namespace } // namespace util ================================================ FILE: src/kenlm/util/string_piece.cc ================================================ // Copyright 2004 The RE2 Authors. All Rights Reserved. // Use of this source code is governed by a BSD-style // license that can be found in string_piece.hh. #include "util/string_piece.hh" #include #include #ifndef HAVE_ICU typedef StringPiece::size_type size_type; void StringPiece::CopyToString(std::string* target) const { target->assign(ptr_, length_); } size_type StringPiece::find(const StringPiece& s, size_type pos) const { // Not sure why length_ < 0 was here since it's std::size_t. if (/*length_ < 0 || */pos > static_cast(length_)) return npos; const char* result = std::search(ptr_ + pos, ptr_ + length_, s.ptr_, s.ptr_ + s.length_); const size_type xpos = result - ptr_; return xpos + s.length_ <= length_ ? xpos : npos; } size_type StringPiece::find(char c, size_type pos) const { if (length_ <= 0 || pos >= static_cast(length_)) { return npos; } const char* result = std::find(ptr_ + pos, ptr_ + length_, c); return result != ptr_ + length_ ? result - ptr_ : npos; } size_type StringPiece::rfind(const StringPiece& s, size_type pos) const { if (length_ < s.length_) return npos; const size_t ulen = length_; if (s.length_ == 0) return std::min(ulen, pos); const char* last = ptr_ + std::min(ulen - s.length_, pos) + s.length_; const char* result = std::find_end(ptr_, last, s.ptr_, s.ptr_ + s.length_); return result != last ? result - ptr_ : npos; } size_type StringPiece::rfind(char c, size_type pos) const { if (length_ <= 0) return npos; for (int i = std::min(pos, static_cast(length_ - 1)); i >= 0; --i) { if (ptr_[i] == c) { return i; } } return npos; } // For each character in characters_wanted, sets the index corresponding // to the ASCII code of that character to 1 in table. This is used by // the find_.*_of methods below to tell whether or not a character is in // the lookup table in constant time. // The argument `table' must be an array that is large enough to hold all // the possible values of an unsigned char. Thus it should be be declared // as follows: // bool table[UCHAR_MAX + 1] static inline void BuildLookupTable(const StringPiece& characters_wanted, bool* table) { const size_type length = characters_wanted.length(); const char* const data = characters_wanted.data(); for (size_type i = 0; i < length; ++i) { table[static_cast(data[i])] = true; } } size_type StringPiece::find_first_of(const StringPiece& s, size_type pos) const { if (length_ == 0 || s.length_ == 0) return npos; // Avoid the cost of BuildLookupTable() for a single-character search. 
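// (For a one-character set, the overload below simply forwards to find(),
// avoiding the bool[UCHAR_MAX + 1] table that BuildLookupTable would fill.)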
if (s.length_ == 1) return find_first_of(s.ptr_[0], pos); bool lookup[UCHAR_MAX + 1] = { false }; BuildLookupTable(s, lookup); for (size_type i = pos; i < length_; ++i) { if (lookup[static_cast(ptr_[i])]) { return i; } } return npos; } size_type StringPiece::find_first_not_of(const StringPiece& s, size_type pos) const { if (length_ == 0) return npos; if (s.length_ == 0) return 0; // Avoid the cost of BuildLookupTable() for a single-character search. if (s.length_ == 1) return find_first_not_of(s.ptr_[0], pos); bool lookup[UCHAR_MAX + 1] = { false }; BuildLookupTable(s, lookup); for (size_type i = pos; i < length_; ++i) { if (!lookup[static_cast(ptr_[i])]) { return i; } } return npos; } size_type StringPiece::find_first_not_of(char c, size_type pos) const { if (length_ == 0) return npos; for (; pos < length_; ++pos) { if (ptr_[pos] != c) { return pos; } } return npos; } size_type StringPiece::find_last_of(const StringPiece& s, size_type pos) const { if (length_ == 0 || s.length_ == 0) return npos; // Avoid the cost of BuildLookupTable() for a single-character search. if (s.length_ == 1) return find_last_of(s.ptr_[0], pos); bool lookup[UCHAR_MAX + 1] = { false }; BuildLookupTable(s, lookup); for (size_type i = std::min(pos, length_ - 1); ; --i) { if (lookup[static_cast(ptr_[i])]) return i; if (i == 0) break; } return npos; } size_type StringPiece::find_last_not_of(const StringPiece& s, size_type pos) const { if (length_ == 0) return npos; size_type i = std::min(pos, length_ - 1); if (s.length_ == 0) return i; // Avoid the cost of BuildLookupTable() for a single-character search. if (s.length_ == 1) return find_last_not_of(s.ptr_[0], pos); bool lookup[UCHAR_MAX + 1] = { false }; BuildLookupTable(s, lookup); for (; ; --i) { if (!lookup[static_cast(ptr_[i])]) return i; if (i == 0) break; } return npos; } size_type StringPiece::find_last_not_of(char c, size_type pos) const { if (length_ == 0) return npos; for (size_type i = std::min(pos, length_ - 1); ; --i) { if (ptr_[i] != c) return i; if (i == 0) break; } return npos; } StringPiece StringPiece::substr(size_type pos, size_type n) const { if (pos > length_) pos = length_; if (n > length_ - pos) n = length_ - pos; return StringPiece(ptr_ + pos, n); } const size_type StringPiece::npos = size_type(-1); #endif // !HAVE_ICU ================================================ FILE: src/kenlm/util/string_piece.hh ================================================ /* If you use ICU in your program, then compile with -DHAVE_ICU -licui18n. If * you don't use ICU, then this will use the Google implementation from Chrome. * This has been modified from the original version to let you choose. */ // Copyright 2008, Google Inc. // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // * Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above // copyright notice, this list of conditions and the following disclaimer // in the documentation and/or other materials provided with the // distribution. // * Neither the name of Google Inc. nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // Copied from strings/stringpiece.h with modifications // // A string-like object that points to a sized piece of memory. // // Functions or methods may use const StringPiece& parameters to accept either // a "const char*" or a "string" value that will be implicitly converted to // a StringPiece. The implicit conversion means that it is often appropriate // to include this .h file in other files rather than forward-declaring // StringPiece as would be appropriate for most other Google classes. // // Systematic usage of StringPiece is encouraged as it will reduce unnecessary // conversions from "const char*" to "string" and back again. // #ifndef BASE_STRING_PIECE_H__ #define BASE_STRING_PIECE_H__ #include "util/have.hh" #include #include #include #ifdef HAVE_ICU #include #include // Old versions of ICU don't define operator== and operator!=. #if (U_ICU_VERSION_MAJOR_NUM < 4) || ((U_ICU_VERSION_MAJOR_NUM == 4) && (U_ICU_VERSION_MINOR_NUM < 4)) #warning You are using an old version of ICU. Consider upgrading to ICU >= 4.6. inline bool operator==(const StringPiece& x, const StringPiece& y) { if (x.size() != y.size()) return false; return std::memcmp(x.data(), y.data(), x.size()) == 0; } inline bool operator!=(const StringPiece& x, const StringPiece& y) { return !(x == y); } #endif // old version of ICU U_NAMESPACE_BEGIN #else #include #include #include #include #ifdef WIN32 #undef max #undef min #endif class StringPiece { public: typedef size_t size_type; private: const char* ptr_; size_type length_; public: // We provide non-explicit singleton constructors so users can pass // in a "const char*" or a "string" wherever a "StringPiece" is // expected. StringPiece() : ptr_(NULL), length_(0) { } StringPiece(const char* str) : ptr_(str), length_((str == NULL) ? 0 : strlen(str)) { } StringPiece(const std::string& str) : ptr_(str.data()), length_(str.size()) { } StringPiece(const char* offset, size_type len) : ptr_(offset), length_(len) { } // data() may return a pointer to a buffer with embedded NULs, and the // returned buffer may or may not be null terminated. Therefore it is // typically a mistake to pass data() to a routine that expects a NUL // terminated string. const char* data() const { return ptr_; } size_type size() const { return length_; } size_type length() const { return length_; } bool empty() const { return length_ == 0; } void clear() { ptr_ = NULL; length_ = 0; } void set(const char* data, size_type len) { ptr_ = data; length_ = len; } void set(const char* str) { ptr_ = str; length_ = str ? 
strlen(str) : 0; } void set(const void* data, size_type len) { ptr_ = reinterpret_cast(data); length_ = len; } char operator[](size_type i) const { return ptr_[i]; } void remove_prefix(size_type n) { ptr_ += n; length_ -= n; } void remove_suffix(size_type n) { length_ -= n; } int compare(const StringPiece& x) const { int r = wordmemcmp(ptr_, x.ptr_, std::min(length_, x.length_)); if (r == 0) { if (length_ < x.length_) r = -1; else if (length_ > x.length_) r = +1; } return r; } std::string as_string() const { // std::string doesn't like to take a NULL pointer even with a 0 size. return std::string(!empty() ? data() : "", size()); } void CopyToString(std::string* target) const; void AppendToString(std::string* target) const; // Does "this" start with "x" bool starts_with(const StringPiece& x) const { return ((length_ >= x.length_) && (wordmemcmp(ptr_, x.ptr_, x.length_) == 0)); } // Does "this" end with "x" bool ends_with(const StringPiece& x) const { return ((length_ >= x.length_) && (wordmemcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); } // standard STL container boilerplate typedef char value_type; typedef const char* pointer; typedef const char& reference; typedef const char& const_reference; typedef ptrdiff_t difference_type; static const size_type npos; typedef const char* const_iterator; typedef const char* iterator; typedef std::reverse_iterator const_reverse_iterator; typedef std::reverse_iterator reverse_iterator; iterator begin() const { return ptr_; } iterator end() const { return ptr_ + length_; } const_reverse_iterator rbegin() const { return const_reverse_iterator(ptr_ + length_); } const_reverse_iterator rend() const { return const_reverse_iterator(ptr_); } size_type max_size() const { return length_; } size_type capacity() const { return length_; } size_type copy(char* buf, size_type n, size_type pos = 0) const; size_type find(const StringPiece& s, size_type pos = 0) const; size_type find(char c, size_type pos = 0) const; size_type rfind(const StringPiece& s, size_type pos = npos) const; size_type rfind(char c, size_type pos = npos) const; size_type find_first_of(const StringPiece& s, size_type pos = 0) const; size_type find_first_of(char c, size_type pos = 0) const { return find(c, pos); } size_type find_first_not_of(const StringPiece& s, size_type pos = 0) const; size_type find_first_not_of(char c, size_type pos = 0) const; size_type find_last_of(const StringPiece& s, size_type pos = npos) const; size_type find_last_of(char c, size_type pos = npos) const { return rfind(c, pos); } size_type find_last_not_of(const StringPiece& s, size_type pos = npos) const; size_type find_last_not_of(char c, size_type pos = npos) const; StringPiece substr(size_type pos, size_type n = npos) const; static int wordmemcmp(const char* p, const char* p2, size_type N) { return memcmp(p, p2, N); } }; inline bool operator==(const StringPiece& x, const StringPiece& y) { if (x.size() != y.size()) return false; return std::memcmp(x.data(), y.data(), x.size()) == 0; } inline bool operator!=(const StringPiece& x, const StringPiece& y) { return !(x == y); } #endif // HAVE_ICU undefined inline bool operator<(const StringPiece& x, const StringPiece& y) { const int r = std::memcmp(x.data(), y.data(), std::min(x.size(), y.size())); return ((r < 0) || ((r == 0) && (x.size() < y.size()))); } inline bool operator>(const StringPiece& x, const StringPiece& y) { return y < x; } inline bool operator<=(const StringPiece& x, const StringPiece& y) { return !(x > y); } inline bool operator>=(const 
StringPiece& x, const StringPiece& y) { return !(x < y); } // allow StringPiece to be logged (needed for unit testing). inline std::ostream& operator<<(std::ostream& o, const StringPiece& piece) { return o.write(piece.data(), static_cast(piece.size())); } #ifdef HAVE_ICU U_NAMESPACE_END using U_NAMESPACE_QUALIFIER StringPiece; #endif #endif // BASE_STRING_PIECE_H__ ================================================ FILE: src/kenlm/util/string_piece_hash.hh ================================================ #ifndef UTIL_STRING_PIECE_HASH__ #define UTIL_STRING_PIECE_HASH__ #include "util/string_piece.hh" #include #include inline size_t hash_value(const StringPiece &str) { return boost::hash_range(str.data(), str.data() + str.length()); } /* Support for lookup of StringPiece in boost::unordered_map */ struct StringPieceCompatibleHash : public std::unary_function { size_t operator()(const StringPiece &str) const { return hash_value(str); } }; struct StringPieceCompatibleEquals : public std::binary_function { bool operator()(const StringPiece &first, const StringPiece &second) const { return first == second; } }; template typename T::const_iterator FindStringPiece(const T &t, const StringPiece &key) { #if BOOST_VERSION < 104200 std::string temp(key.data(), key.size()); return t.find(temp); #else return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); #endif } template typename T::iterator FindStringPiece(T &t, const StringPiece &key) { #if BOOST_VERSION < 104200 std::string temp(key.data(), key.size()); return t.find(temp); #else return t.find(key, StringPieceCompatibleHash(), StringPieceCompatibleEquals()); #endif } #endif // UTIL_STRING_PIECE_HASH__ ================================================ FILE: src/kenlm/util/thread_pool.hh ================================================ #ifndef UTIL_THREAD_POOL__ #define UTIL_THREAD_POOL__ #include "util/pcqueue.hh" #include #include #include #include #include namespace util { template class Worker : boost::noncopyable { public: typedef HandlerT Handler; typedef typename Handler::Request Request; template Worker(PCQueue &in, Construct &construct, Request &poison) : in_(in), handler_(construct), thread_(boost::ref(*this)), poison_(poison) {} // Only call from thread. void operator()() { Request request; while (1) { in_.Consume(request); if (request == poison_) return; try { (*handler_)(request); } catch(std::exception &e) { std::cerr << "Handler threw " << e.what() << std::endl; abort(); } catch(...) { std::cerr << "Handler threw an exception, dropping request" << std::endl; abort(); } } } void Join() { thread_.join(); } private: PCQueue &in_; boost::optional handler_; boost::thread thread_; Request poison_; }; template class ThreadPool : boost::noncopyable { public: typedef HandlerT Handler; typedef typename Handler::Request Request; template ThreadPool(size_t queue_length, size_t workers, Construct handler_construct, Request poison) : in_(queue_length), poison_(poison) { for (size_t i = 0; i < workers; ++i) { workers_.push_back(new Worker(in_, handler_construct, poison)); } } ~ThreadPool() { for (size_t i = 0; i < workers_.size(); ++i) { Produce(poison_); } for (typename boost::ptr_vector >::iterator i = workers_.begin(); i != workers_.end(); ++i) { i->Join(); } } void Produce(const Request &request) { in_.Produce(request); } // For adding to the queue. 
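// (Produce() above and In().Produce() are equivalent; on destruction the pool
// enqueues one poison request per worker and then joins each worker thread.)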
PCQueue &In() { return in_; } private: PCQueue in_; boost::ptr_vector > workers_; Request poison_; }; } // namespace util #endif // UTIL_THREAD_POOL__ ================================================ FILE: src/kenlm/util/tokenize_piece.hh ================================================ #ifndef UTIL_TOKENIZE_PIECE__ #define UTIL_TOKENIZE_PIECE__ #include "util/exception.hh" #include "util/string_piece.hh" #include #include #include namespace util { // Thrown on dereference when out of tokens to parse class OutOfTokens : public Exception { public: OutOfTokens() throw() {} ~OutOfTokens() throw() {} }; class SingleCharacter { public: SingleCharacter() {} explicit SingleCharacter(char delim) : delim_(delim) {} StringPiece Find(const StringPiece &in) const { return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); } private: char delim_; }; class MultiCharacter { public: MultiCharacter() {} explicit MultiCharacter(const StringPiece &delimiter) : delimiter_(delimiter) {} StringPiece Find(const StringPiece &in) const { return StringPiece(std::search(in.data(), in.data() + in.size(), delimiter_.data(), delimiter_.data() + delimiter_.size()), delimiter_.size()); } private: StringPiece delimiter_; }; class AnyCharacter { public: AnyCharacter() {} explicit AnyCharacter(const StringPiece &chars) : chars_(chars) {} StringPiece Find(const StringPiece &in) const { return StringPiece(std::find_first_of(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); } private: StringPiece chars_; }; class AnyCharacterLast { public: AnyCharacterLast() {} explicit AnyCharacterLast(const StringPiece &chars) : chars_(chars) {} StringPiece Find(const StringPiece &in) const { return StringPiece(std::find_end(in.data(), in.data() + in.size(), chars_.data(), chars_.data() + chars_.size()), 1); } private: StringPiece chars_; }; template class TokenIter : public boost::iterator_facade, const StringPiece, boost::forward_traversal_tag> { public: TokenIter() {} template TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { increment(); } bool operator!() const { return current_.data() == 0; } operator bool() const { return current_.data() != 0; } static TokenIter end() { return TokenIter(); } private: friend class boost::iterator_core_access; void increment() { do { StringPiece found(finder_.Find(after_)); current_ = StringPiece(after_.data(), found.data() - after_.data()); if (found.data() == after_.data() + after_.size()) { after_ = StringPiece(NULL, 0); } else { after_ = StringPiece(found.data() + found.size(), after_.data() - found.data() + after_.size() - found.size()); } } while (SkipEmpty && current_.data() && current_.empty()); // Compiler should optimize this away if SkipEmpty is false. 
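// (With SkipEmpty set, runs of adjacent delimiters yield no empty tokens, as
// the remove_empty test case below illustrates.)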
} bool equal(const TokenIter &other) const { return current_.data() == other.current_.data(); } const StringPiece &dereference() const { UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); return current_; } StringPiece current_; StringPiece after_; Find finder_; }; } // namespace util #endif // UTIL_TOKENIZE_PIECE__ ================================================ FILE: src/kenlm/util/tokenize_piece_test.cc ================================================ #include "util/tokenize_piece.hh" #include "util/string_piece.hh" #define BOOST_TEST_MODULE TokenIteratorTest #include #include namespace util { namespace { BOOST_AUTO_TEST_CASE(pipe_pipe_none) { const char str[] = "nodelimit at all"; TokenIter it(str, MultiCharacter("|||")); BOOST_REQUIRE(it); BOOST_CHECK_EQUAL(StringPiece(str), *it); ++it; BOOST_CHECK(!it); } BOOST_AUTO_TEST_CASE(pipe_pipe_two) { const char str[] = "|||"; TokenIter it(str, MultiCharacter("|||")); BOOST_REQUIRE(it); BOOST_CHECK_EQUAL(StringPiece(), *it); ++it; BOOST_REQUIRE(it); BOOST_CHECK_EQUAL(StringPiece(), *it); ++it; BOOST_CHECK(!it); } BOOST_AUTO_TEST_CASE(remove_empty) { const char str[] = "|||"; TokenIter it(str, MultiCharacter("|||")); BOOST_CHECK(!it); } BOOST_AUTO_TEST_CASE(remove_empty_keep) { const char str[] = " |||"; TokenIter it(str, MultiCharacter("|||")); BOOST_REQUIRE(it); BOOST_CHECK_EQUAL(StringPiece(" "), *it); ++it; BOOST_CHECK(!it); } } // namespace } // namespace util ================================================ FILE: src/kenlm/util/usage.cc ================================================ #include "util/usage.hh" #include "util/exception.hh" #include #include #include #include #include #if !defined(_WIN32) && !defined(_WIN64) #include #include #include #endif namespace util { namespace { #if !defined(_WIN32) && !defined(_WIN64) float FloatSec(const struct timeval &tv) { return static_cast(tv.tv_sec) + (static_cast(tv.tv_usec) / 1000000.0); } #endif } // namespace void PrintUsage(std::ostream &out) { #if !defined(_WIN32) && !defined(_WIN64) struct rusage usage; if (getrusage(RUSAGE_SELF, &usage)) { perror("getrusage"); return; } out << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n'; // Linux doesn't set memory usage :-(. 
std::ifstream status("/proc/self/status", std::ios::in); std::string line; while (getline(status, line)) { if (!strncmp(line.c_str(), "VmRSS:\t", 7)) { out << "VmRSS: " << (line.c_str() + 7) << '\n'; break; } else if (!strncmp(line.c_str(), "VmPeak:\t", 8)) { out << "VmPeak: " << (line.c_str() + 8) << '\n'; } } #endif } uint64_t GuessPhysicalMemory() { #if defined(_WIN32) || defined(_WIN64) return 0; #elif defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE) long pages = sysconf(_SC_PHYS_PAGES); if (pages == -1) return 0; long page_size = sysconf(_SC_PAGESIZE); if (page_size == -1) return 0; return static_cast(pages) * static_cast(page_size); #else return 0; #endif } namespace { class SizeParseError : public Exception { public: explicit SizeParseError(const std::string &str) throw() { *this << "Failed to parse " << str << " into a memory size "; } }; template uint64_t ParseNum(const std::string &arg) { std::stringstream stream(arg); Num value; stream >> value; UTIL_THROW_IF_ARG(!stream, SizeParseError, (arg), "for the leading number."); std::string after; stream >> after; UTIL_THROW_IF_ARG(after.size() > 1, SizeParseError, (arg), "because there are more than two characters after the number."); std::string throwaway; UTIL_THROW_IF_ARG(stream >> throwaway, SizeParseError, (arg), "because there was more cruft " << throwaway << " after the number."); // Silly sort, using kilobytes as your default unit. if (after.empty()) after = "K"; if (after == "%") { uint64_t mem = GuessPhysicalMemory(); UTIL_THROW_IF_ARG(!mem, SizeParseError, (arg), "because % was specified but the physical memory size could not be determined."); return static_cast(value) * static_cast(mem) / 100.0; } std::string units("bKMGTPEZY"); std::string::size_type index = units.find(after[0]); UTIL_THROW_IF_ARG(index == std::string::npos, SizeParseError, (arg), "the allowed suffixes are " << units << "%."); for (std::string::size_type i = 0; i < index; ++i) { value *= 1024; } return value; } } // namespace uint64_t ParseSize(const std::string &arg) { return arg.find('.') == std::string::npos ? ParseNum(arg) : ParseNum(arg); } } // namespace util ================================================ FILE: src/kenlm/util/usage.hh ================================================ #ifndef UTIL_USAGE__ #define UTIL_USAGE__ #include #include #include #include namespace util { void PrintUsage(std::ostream &to); // Determine how much physical memory there is. Return 0 on failure. uint64_t GuessPhysicalMemory(); // Parse a size like unix sort. Sadly, this means the default multiplier is K. uint64_t ParseSize(const std::string &arg); } // namespace util #endif // UTIL_USAGE__ ================================================ FILE: src/opennlp/ccg/Parse.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg; import java.io.*; import java.net.URL; import java.util.HashMap; import java.util.Map; import org.jdom.Document; import org.jdom.Element; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.hylo.HyloHelper; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.lexicon.Tokenizer; import opennlp.ccg.parse.ParseException; import opennlp.ccg.parse.Parser; import opennlp.ccg.parse.Supertagger; import opennlp.ccg.parse.supertagger.WordAndPOSDictionaryLabellingStrategy; import opennlp.ccg.synsem.Category; import opennlp.ccg.synsem.LF; import opennlp.ccg.synsem.Sign; import opennlp.ccg.synsem.SignScorer; import opennlp.ccg.test.RegressionInfo; import opennlp.ccg.test.DerivMaker; import opennlp.ccgbank.extract.Testbed; /** * Creates a testbed file by parsing a text file. * Text is assumed to be tokenized, with one sentence per line. * * @author Michael White * @version $Revision: 1.2 $, $Date: 2010/10/28 02:46:32 $ */ public class Parse { public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.Parse \n" + " (-g ) \n" + " -parsescorer \n" + " -supertagger | -stconfig \n" + " (-nbestListSize ) \n" + " (-includederivs) \n" + " (-includescores) \n" + " "; if (args.length == 0 || args[0].equals("-h")) { System.out.println(usage); System.exit(0); } // args String grammarfile = "grammar.xml"; String inputfile = null; String outputfile = null; String parseScorerClass = null; String supertaggerClass = null, stconfig = null; boolean includederivs = false; boolean includescores = false; int nbestListSize = 1; for (int i = 0; i < args.length; i++) { if (args[i].equals("-g")) { grammarfile = args[++i]; continue; } if (args[i].equals("-parsescorer")) { parseScorerClass = args[++i]; continue; } if (args[i].equals("-supertagger")) { supertaggerClass = args[++i]; continue; } if (args[i].equals("-stconfig")) { stconfig = args[++i]; continue; } if (args[i].equals("-nbestListSize")) { nbestListSize = Integer.parseInt(args[++i]); continue; } if (args[i].equals("-includederivs")) { includederivs = true; continue; } if (args[i].equals("-includescores")) { includescores = true; continue; } if (inputfile == null) { inputfile = args[i]; continue; } outputfile = args[i]; } if (nbestListSize < 1) nbestListSize = 1; if (inputfile == null || outputfile == null || parseScorerClass == null || (supertaggerClass == null && stconfig == null)) { System.out.println(usage); System.exit(0); } // make test doc, sign map Document outDoc = new Document(); Element outRoot = new Element("regression"); outDoc.setRootElement(outRoot); Map signMap = new HashMap(); // load grammar URL grammarURL = new File(grammarfile).toURI().toURL(); System.out.println("Loading grammar from URL: " + grammarURL); Grammar grammar = new Grammar(grammarURL); Tokenizer tokenizer = grammar.lexicon.tokenizer; System.out.println(); // set up parser Parser parser = new Parser(grammar); // instantiate scorer try { System.out.println("Instantiating parsing sign scorer from class: " + parseScorerClass); SignScorer parseScorer = (SignScorer) Class.forName(parseScorerClass).newInstance(); parser.setSignScorer(parseScorer); System.out.println(); } 
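/*
 * The -parsescorer argument names a class that is loaded reflectively, so it
 * needs a public no-arg constructor and must implement
 * opennlp.ccg.synsem.SignScorer (as the sample scorers under ccgbank/plugins
 * do).  A minimal sketch, assuming SignScorer declares a single
 * score(Sign, boolean) method; the bias toward signs covering fewer words is
 * purely illustrative, and the package and class name are hypothetical:
 *
 *   package myplugins;
 *   import opennlp.ccg.synsem.Sign;
 *   import opennlp.ccg.synsem.SignScorer;
 *
 *   public class ShortSpanScorer implements SignScorer {
 *       public double score(Sign sign, boolean complete) {
 *           // higher score for signs spanning fewer words
 *           return 1.0 / (1 + sign.getWords().size());
 *       }
 *   }
 *
 * which would then be supplied as: -parsescorer myplugins.ShortSpanScorer
 */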
catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } // instantiate supertagger try { Supertagger supertagger; if (supertaggerClass != null) { System.out.println("Instantiating supertagger from class: " + supertaggerClass); supertagger = (Supertagger) Class.forName(supertaggerClass).newInstance(); } else { System.out.println("Instantiating supertagger from config file: " + stconfig); supertagger = WordAndPOSDictionaryLabellingStrategy.supertaggerFactory(stconfig); } parser.setSupertagger(supertagger); System.out.println(); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } // loop through input BufferedReader in = new BufferedReader(new FileReader(inputfile)); String line; Map predInfoMap = new HashMap(); System.out.println("Parsing " + inputfile); System.out.println(); int count = 1; while ((line = in.readLine()) != null) { String id = "s" + count; try { // parse it System.out.println(line); parser.parse(line); int numParses = Math.min(nbestListSize, parser.getResult().size()); for (int i=0; i < numParses; i++) { Sign thisParse = parser.getResult().get(i); // convert lf Category cat = thisParse.getCategory(); LF convertedLF = null; String predInfo = null; if (cat.getLF() != null) { // convert LF LF flatLF = cat.getLF(); cat = cat.copy(); Nominal index = cat.getIndexNominal(); convertedLF = HyloHelper.compactAndConvertNominals(flatLF, index, thisParse); // get pred info predInfoMap.clear(); Testbed.extractPredInfo(flatLF, predInfoMap); predInfo = Testbed.getPredInfo(predInfoMap); } // add test item, sign Element item = RegressionInfo.makeTestItem(grammar, line, 1, convertedLF); String actualID = (nbestListSize == 1) ? id : id + "-" + (i+1); item.setAttribute("info", actualID); item.setAttribute("test","true"); outRoot.addContent(item); signMap.put(actualID, thisParse); // Add parsed words as a separate LF element Element fullWordsElt = new Element("full-words"); fullWordsElt.addContent(tokenizer.format(thisParse.getWords())); item.addContent(fullWordsElt); if (predInfo != null) { Element predInfoElt = new Element("pred-info"); predInfoElt.setAttribute("data", predInfo); item.addContent(predInfoElt); } if (includederivs) { Element derivElt = new Element("deriv"); derivElt.addContent(DerivMaker.makeDeriv(thisParse)); item.addContent(derivElt); } if (includescores) { String score = parser.getScores().get(i).toString(); item.setAttribute("score", score); } } } catch (ParseException e) { System.out.println("Unable to parse!"); // add test item with zero parses Element item = RegressionInfo.makeTestItem(grammar, line, 0, null); item.setAttribute("info", id); outRoot.addContent(item); } count++; } System.out.println(); // write test doc, saved signs System.out.println("Writing parses to " + outputfile); XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat()); File regressionFile = new File(outputfile); outputter.output(outDoc, new FileOutputStream(regressionFile)); RegressionInfo.writeSerFile(signMap, regressionFile); System.out.println(); // done in.close(); System.out.println("Done."); } } ================================================ FILE: src/opennlp/ccg/Realize.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-7 University of Edinburgh, Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as 
published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg; import opennlp.ccg.realize.*; import opennlp.ccg.realize.hypertagger.ZLMaxentHypertagger; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import opennlp.ccg.ngrams.*; import org.jdom.*; import org.jdom.output.*; import java.io.*; import java.net.*; import java.util.*; import java.util.prefs.*; /** * Sample front-end to the realizer, showing the intermediate steps of realization. * * @author Michael White * @version $Revision: 1.38 $, $Date: 2011/08/10 17:58:45 $ */ public class Realize { private static PrintWriter out; @SuppressWarnings("unchecked") public static void main(String[] args) throws Exception { String usage = "Usage: java opennlp.ccg.Realize (-g ) (-exactmatches) (-ngramorder N) ()"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } // args String grammarfile = "grammar.xml"; String inputfile = null; String outputfile = null; boolean exactMatches = false; int ngramOrder = 0; for (int i = 0; i < args.length; i++) { if (args[i].startsWith("-D")) { String prop = args[i].substring(2); int equalpos = prop.indexOf("="); String key = prop.substring(0, equalpos); String val = prop.substring(equalpos+1); System.setProperty(key, val); continue; } if (args[i].equals("-g")) { grammarfile = args[++i]; continue; } if (args[i].equals("-exactmatches")) { exactMatches = true; continue; } if (args[i].equals("-ngramorder")) { ngramOrder = Integer.parseInt(args[++i]); continue; } if (inputfile == null) { inputfile = args[i]; continue; } outputfile = args[i]; } if (inputfile == null) { System.out.println(usage); System.exit(0); } // set out accordingly if (outputfile != null) { out = new PrintWriter(new BufferedWriter(new FileWriter(outputfile))); } else { out = new PrintWriter(System.out); } // remember, modify prefs Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); boolean oldShowCompleteness = prefs.getBoolean(Edge.SHOW_COMPLETENESS, false); boolean oldShowBitset = prefs.getBoolean(Edge.SHOW_BITSET, false); prefs.putBoolean(Edge.SHOW_COMPLETENESS, true); prefs.putBoolean(Edge.SHOW_BITSET, true); // load grammar URL grammarURL = new File(grammarfile).toURI().toURL(); out.println("Loading grammar from URL: " + grammarURL); Grammar grammar = new Grammar(grammarURL); // instantiate realizer Realizer realizer = new Realizer(grammar); // get request out.println(); out.println("Request:"); out.println(); Document doc = grammar.loadFromXml(inputfile); org.jdom.output.XMLOutputter outputter = new org.jdom.output.XMLOutputter(Format.getPrettyFormat()); out.flush(); outputter.output(doc, out); out.flush(); LF lf = Realizer.getLfFromDoc(doc); out.println(); out.println("** Initial run"); out.println(); out.println("Input LF: " + lf); // set up n-gram scorer SignScorer ngramScorer; Element root = doc.getRootElement(); Element ngramModelElt = 
root.getChild("ngram-model"); if (ngramModelElt == null) { // just use targets List targetElts = root.getChildren("target"); String[] targets = new String[targetElts.size()]; out.println(); out.println("Targets:"); for (int i=0; i < targetElts.size(); i++) { Element ex = (Element) targetElts.get(i); String target = ex.getText(); out.println(target); targets[i] = target; } if (ngramOrder > 0) { out.println(); out.println("Using order " + ngramOrder + " in n-gram precision."); ngramScorer = new NgramPrecisionModel(targets, ngramOrder); } else ngramScorer = new NgramPrecisionModel(targets); if (exactMatches) { out.println(); out.println("Only counting exact matches in n-gram precision."); ((NgramPrecisionModel)ngramScorer).setExactMatches(exactMatches); } } else if (ngramModelElt.getAttributeValue("class") != null) { // load scorer from class String scorerClass = ngramModelElt.getAttributeValue("class"); out.println(); out.println("Instantiating scorer from class: " + scorerClass); ngramScorer = (SignScorer) Class.forName(scorerClass).newInstance(); } else { // load n-gram model String filename = ngramModelElt.getAttributeValue("file"); String reverseStr = ngramModelElt.getAttributeValue("reverse"); boolean reverse = (reverseStr != null) ? reverseStr.equals("true") : false; String factoredStr = ngramModelElt.getAttributeValue("factored"); boolean factored = (factoredStr != null) ? factoredStr.equals("true") : false; String semClassesStr = ngramModelElt.getAttributeValue("sem-classes"); boolean useSemClasses = (semClassesStr != null) ? semClassesStr.equals("true") : true; int order = 3; // order can only be changed for standard n-gram models String orderStr = ngramModelElt.getAttributeValue("order"); if (orderStr != null) { order = Integer.parseInt(orderStr); } if (ngramOrder > 0) order = ngramOrder; // preference given to command-line value out.println(); String msg = "Loading "; if (reverse) msg += "reversed "; if (factored) msg += "factored "; msg += "n-gram model "; if (!factored) msg += "of order " + order + " "; if (useSemClasses) msg += "with semantic class replacement "; msg += "from: " + filename; out.println(msg); if (factored) ngramScorer = new FactoredNgramModelFamily(filename, useSemClasses); else ngramScorer = new StandardNgramModel(order, filename, useSemClasses); if (reverse) ((NgramScorer)ngramScorer).setReverse(true); } // set pruning strategy (if any) Element pruningStrategyElt = root.getChild("pruning-strategy"); if (pruningStrategyElt != null) { // load pruning strategy from class String pruningStrategyClass = pruningStrategyElt.getAttributeValue("class"); out.println(); out.println("Instantiating pruning strategy from class: " + pruningStrategyClass); realizer.pruningStrategy = (PruningStrategy) Class.forName(pruningStrategyClass).newInstance(); } // set hypertagger (if any) Element htModelElt = root.getChild("ht-model"); if (htModelElt != null) { String htconfig = htModelElt.getAttributeValue("config"); if (htconfig != null) { out.println(); out.println("Instantiating hypertagger from: " + htconfig); realizer.hypertagger = ZLMaxentHypertagger.ZLMaxentHypertaggerFactory(htconfig); } else { String htModelClass = htModelElt.getAttributeValue("class"); out.println(); out.println("Instantiating hypertagger from class: " + htModelClass); realizer.hypertagger = (Hypertagger) Class.forName(htModelClass).newInstance(); } } // run request realizer.realize(lf, ngramScorer); Chart chart = realizer.getChart(); chart.out = out; out.println(); out.println("Preds:"); chart.printEPs(); 
out.println(); out.println("LF chunks:"); chart.printLfChunks(); out.println(); out.println("LF alts:"); chart.printLfAlts(); out.println(); out.println("LF optional parts:"); chart.printLfOpts(); out.println(); out.println("Initial Edges:"); chart.printInitialEdges(); out.println(); out.println("Marked Edges:"); chart.printMarkedEdges(); out.println(); out.println("Instantiated Semantically Null Edges:"); chart.printInstantiatedNoSemEdges(); out.println(); out.println("Uninstantiated Semantically Null Edges:"); chart.printNoSemEdges(); out.println(); out.println("Rule Instances:"); chart.printRuleInstances(); out.println(); out.println("All Edges:"); chart.printEdges(); out.println(); out.println("Complete Edges (unsorted):"); chart.printEdges(true); out.println(); out.println("Complete Edges (sorted):"); chart.printEdges(true, true); out.println(); out.println("Best Edge:"); chart.printBestEdge(); out.println(); out.println("Best Edge Derivation:"); out.println(chart.bestEdge.getSign().getDerivationHistory()); out.flush(); if (chart.bestJoinedEdge != null) { out.println(); out.println("Best Joined Edge:"); chart.printBestJoinedEdge(); out.println(); out.println("Best Joined Edge Derivation:"); out.println(chart.bestJoinedEdge.getSign().getDerivationHistory()); out.flush(); } // reset prefs prefs.putBoolean(Edge.SHOW_COMPLETENESS, oldShowCompleteness); prefs.putBoolean(Edge.SHOW_BITSET, oldShowBitset); } } ================================================ FILE: src/opennlp/ccg/TextCCG.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge, Gann Bierner, // University of Edinburgh (Michael White), // Alexandros Triantafyllidis and David Reitter // Copyright (C) 2006 Ben Wing // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg; import opennlp.ccg.lexicon.*; import opennlp.ccg.grammar.*; import opennlp.ccg.parse.*; import opennlp.ccg.util.*; import opennlp.ccg.synsem.*; import opennlp.ccg.realize.*; import opennlp.ccg.hylo.*; import opennlp.ccg.ngrams.*; import opennlp.ccg.test.*; import opennlp.ccg.realize.Edge; // only realization edges referenced (for preferences) import org.jdom.*; import java.io.*; import java.net.*; import java.util.*; import java.util.prefs.*; /** * A text interface for testing grammars. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @author Alexandros Triantafyllidis * @author David Reitter * @version $Revision: 1.67 $, $Date: 2011/12/13 04:00:54 $ */ public class TextCCG { /** Preference key for showing all results. */ public static final String SHOW_ALL_RESULTS = "Show All Results"; /** Preference key for showing derivations. 
*/ public static final String SHOW_DERIVATIONS = "Show Derivations"; /** Preference key for showing features. */ public static final String SHOW_FEATURES = "Show Features"; /** Preference key for showing semantics. */ public static final String SHOW_SEMANTICS = "Show Semantics"; /** Preference key for showing features. */ public static final String FEATURES_TO_SHOW = "Features to Show"; /** Preference key for showing realizer timing. */ public static final String SHOW_TIMING = "Show Timing"; /** Preference key for showing incomplete edges during realization. */ public static final String SHOW_INCOMPLETE_EDGES = "Show Incomplete Edges"; /** Preference key for visualizing a derivation. */ public static final String VISUALIZE = "Visualize"; /** Preference key for command line history. */ public static final String HISTORY = "Command Line History"; /** Main method for tccg. */ @SuppressWarnings("unchecked") public static void main(String[] args) throws IOException, LexException { String usage = "java opennlp.ccg.TextCCG " + "() | (-exportprefs ) | (-importprefs )"; if (args.length > 0 && args[0].equals("-h")) { System.out.println("Usage: " + usage); System.exit(0); } // args String grammarfile = "grammar.xml"; String prefsfile = null; boolean exportPrefs = false; boolean importPrefs = false; for (int i = 0; i < args.length; i++) { if (args[i].equals("-exportprefs")) { exportPrefs = true; prefsfile = args[++i]; continue; } if (args[i].equals("-importprefs")) { importPrefs = true; prefsfile = args[++i]; continue; } grammarfile = args[i]; } // prefs Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); try { if (exportPrefs) { System.out.println("Exporting preferences to prefsfile: " + prefsfile); prefs.exportNode(new FileOutputStream(prefsfile)); return; } if (importPrefs) { System.out.println("Importing preferences from prefsfile: " + prefsfile); Preferences.importPreferences(new FileInputStream(prefsfile)); return; } } catch (Exception exc) { throw (IOException) new IOException().initCause(exc); } // load grammar URL grammarURL = new File(grammarfile).toURI().toURL(); System.out.println("Loading grammar from URL: " + grammarURL); Grammar grammar = new Grammar(grammarURL); if (grammar.getName() != null) System.out.println("Grammar '" + grammar.getName() + "' loaded."); System.out.println(); // create parser and realizer Parser parser = new Parser(grammar); Realizer realizer = new Realizer(grammar); // stuff to remember during loop Sign[] lastResults = null; LF[] lastLFs = null; String lastSentence = ""; int lastReading = 0; // prepare to accept input from user String[] completions = { ":sh", ":v", ":reset", ":feats", ":nofeats", ":foff", ":sem", ":nosem", ":all", ":notall", ":derivs", ":noderivs", ":doff", ":vison", ":visoff", ":wordpos", ":nowordpos", ":eisner", ":noeisner", ":ptl", ":noptl", ":pel", ":nopel", ":ppv", ":noppv", ":pcpv", ":nopcpv", ":plazy", ":noplazy", ":r", ":sel", ":2xml", ":2tb", ":2apml", ":tl", ":notl", ":el", ":noel", ":nbtl", ":nonbtl", ":pv", ":nopv", ":cpv", ":nocpv", ":upon", ":upoff", ":t", ":toff", ":inc", ":noinc", ":ion", ":ioff", ":mion", ":mioff", ":con", ":coff", ":flon", ":floff", ":ccon", ":ccoff", ":pon", ":poff", ":q", ":h"}; LineReader lineReader = LineReader.createLineReader(completions); // initialize history, per grammar, from prefs String historyKey = HISTORY + "_" + grammar.getName(); String histStr = prefs.get(historyKey, ""); lineReader.setCommandHistory(histStr); // welcome msg System.out.println("Enter strings to parse."); 
System.out.println("Type ':r' to realize selected reading of previous parse."); System.out.println("Type ':h' for help on display options and ':q' to quit."); System.out.println("You can use the tab key for command completion, "); System.out.println("Ctrl-P (prev) and Ctrl-N (next) to access the command history, "); System.out.println("and emacs-style control keys to edit the line."); System.out.println(); while (true) { String input = lineReader.readLine("tccg> "); if (input == null) break; // control-D or the like input = input.trim(); if (input.equals(":show settings") || input.equals(":sh")) { showSettings(prefs); } else if (input.equals(":v")) { prefs.putBoolean(SHOW_ALL_RESULTS, true); prefs.putBoolean(SHOW_DERIVATIONS, true); prefs.putBoolean(SHOW_FEATURES, true); prefs.putBoolean(SHOW_SEMANTICS, true); prefs.put(FEATURES_TO_SHOW, ""); } else if (input.equals(":q")) { break; // end of while loop } else if (input.equals(":h")) { showHelp(); } else if (input.equals(":reset")) { prefs.putBoolean(SHOW_ALL_RESULTS, false); prefs.putBoolean(SHOW_DERIVATIONS, false); prefs.putBoolean(SHOW_TIMING, false); prefs.putBoolean(SHOW_INCOMPLETE_EDGES, false); prefs.putBoolean(Edge.SHOW_COMPLETENESS, false); prefs.putBoolean(Edge.SHOW_BITSET, false); prefs.putBoolean(SHOW_FEATURES, false); prefs.putBoolean(SHOW_SEMANTICS, false); prefs.put(FEATURES_TO_SHOW, ""); prefs.putBoolean(VISUALIZE, false); prefs.put("VISFNAME", ""); prefs.putBoolean(Converter.USE_WORD_POSITIONS_FOR_ATOM_CONVERSION, true); prefs.putBoolean(AbstractCompositionRule.EISNER_CONSTRAINTS, true); AbstractCompositionRule.useEisnerConstraints = true; prefs.putInt(Parser.PARSE_TIME_LIMIT, Parser.NO_TIME_LIMIT); prefs.putInt(Parser.PARSE_EDGE_LIMIT, Parser.NO_EDGE_LIMIT); prefs.putInt(Parser.PARSE_PRUNING_VALUE, Parser.NO_PRUNING); prefs.putInt(Parser.PARSE_CELL_PRUNING_VALUE, Parser.NO_PRUNING); prefs.putBoolean(Parser.PARSE_LAZY_UNPACKING, true); prefs.putBoolean(EdgeFactory.USE_INDEXING, true); prefs.putBoolean(EdgeFactory.ALLOW_MISSING_INDEX_COMBOS, false); prefs.putBoolean(EdgeFactory.USE_CHUNKS, true); prefs.putBoolean(EdgeFactory.USE_FEATURE_LICENSING, true); prefs.putBoolean(opennlp.ccg.realize.Chart.USE_COMBOS, true); prefs.putBoolean(opennlp.ccg.realize.Chart.USE_PACKING, false); prefs.putInt(opennlp.ccg.realize.Chart.TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); prefs.putDouble(opennlp.ccg.realize.Chart.NEW_BEST_TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); prefs.putInt(opennlp.ccg.realize.Chart.EDGE_LIMIT, opennlp.ccg.realize.Chart.NO_EDGE_LIMIT); prefs.putInt(opennlp.ccg.realize.Chart.PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); prefs.putInt(opennlp.ccg.realize.Chart.CELL_PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); prefs.putBoolean(opennlp.ccg.realize.Chart.DO_UNPACKING, true); } else if (input.equals(":show feats") || input.equals(":feats") || input.equals(":f")) { prefs.putBoolean(SHOW_FEATURES, true); prefs.put(FEATURES_TO_SHOW, ""); Grammar.theGrammar.prefs.showFeats = true; Grammar.theGrammar.prefs.featsToShow = ""; } else if (input.startsWith(":show feats ") || input.startsWith(":feats ") || input.startsWith(":f ")) { prefs.putBoolean(SHOW_FEATURES, true); String s = input.substring(input.indexOf(' ') + 1); if (s.startsWith("feats ")) { s = s.substring(6); } prefs.put(FEATURES_TO_SHOW, s); Grammar.theGrammar.prefs.showFeats = true; Grammar.theGrammar.prefs.featsToShow = s; } else if (input.equals(":nofeats") || input.equals(":foff")) { prefs.putBoolean(SHOW_FEATURES, false); 
prefs.put(FEATURES_TO_SHOW, ""); Grammar.theGrammar.prefs.showFeats = false; Grammar.theGrammar.prefs.featsToShow = ""; } else if (input.equals(":show semantics") || input.equals(":sem") || input.equals(":s")) { prefs.putBoolean(SHOW_SEMANTICS, true); Grammar.theGrammar.prefs.showSem = true; } else if (input.equals(":nosem") || input.equals(":soff")) { prefs.putBoolean(SHOW_SEMANTICS, false); Grammar.theGrammar.prefs.showSem = false; } else if (input.equals(":show all") || input.equals(":all") || input.equals(":a")) { prefs.putBoolean(SHOW_ALL_RESULTS, true); } else if (input.equals(":notall") || input.equals(":aoff")) { prefs.putBoolean(SHOW_ALL_RESULTS, false); } else if (input.equals(":show derivs") || input.equals(":derivs") || input.equals(":d")) { prefs.putBoolean(SHOW_DERIVATIONS, true); } else if (input.equals(":noderivs") || input.equals(":doff")) { prefs.putBoolean(SHOW_DERIVATIONS, false); } else if (input.startsWith(":time limit") || input.startsWith(":tl")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int limit = Integer.parseInt(last); prefs.putInt(opennlp.ccg.realize.Chart.TIME_LIMIT, limit); } catch (NumberFormatException exc) { System.out.println("Expecting a time limit in ms, rather than: " + last); } } else if (input.startsWith(":no time limit") || input.startsWith(":notl")) { prefs.putInt(opennlp.ccg.realize.Chart.TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); } else if (input.startsWith(":nbtl")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { double limit = Double.parseDouble(last); prefs.putDouble(opennlp.ccg.realize.Chart.NEW_BEST_TIME_LIMIT, limit); } catch (NumberFormatException exc) { System.out.println("Expecting a time limit in ms, rather than: " + last); } } else if (input.startsWith(":nonbtl")) { prefs.putDouble(opennlp.ccg.realize.Chart.NEW_BEST_TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); } else if (input.startsWith(":edge limit") || input.startsWith(":el")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int limit = Integer.parseInt(last); prefs.putInt(opennlp.ccg.realize.Chart.EDGE_LIMIT, limit); } catch (NumberFormatException exc) { System.out.println("Expecting an edge limit, rather than: " + last); } } else if (input.startsWith(":no edge limit") || input.startsWith(":noel")) { prefs.putInt(opennlp.ccg.realize.Chart.EDGE_LIMIT, opennlp.ccg.realize.Chart.NO_EDGE_LIMIT); } else if (input.startsWith(":pruning value") || input.startsWith(":pv")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int val = Integer.parseInt(last); prefs.putInt(opennlp.ccg.realize.Chart.PRUNING_VALUE, val); } catch (NumberFormatException exc) { System.out.println("Expecting an integer pruning value, rather than: " + last); } } else if (input.startsWith(":no pruning value") || input.startsWith(":nopv")) { prefs.putInt(opennlp.ccg.realize.Chart.PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); } else if (input.startsWith(":cell pruning value") || input.startsWith(":cpv")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int val = Integer.parseInt(last); prefs.putInt(opennlp.ccg.realize.Chart.CELL_PRUNING_VALUE, val); } catch (NumberFormatException exc) { System.out.println("Expecting an integer cell pruning value, rather than: " + last); } } else if (input.startsWith(":no cell pruning value") || input.startsWith(":nocpv")) { 
prefs.putInt(opennlp.ccg.realize.Chart.CELL_PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); } else if (input.equals(":upon")) { prefs.putBoolean(opennlp.ccg.realize.Chart.DO_UNPACKING, true); } else if (input.equals(":upoff")) { prefs.putBoolean(opennlp.ccg.realize.Chart.DO_UNPACKING, false); } else if (input.startsWith(":select reading") || input.startsWith(":sel")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int reading = Integer.parseInt(last); if (reading > lastResults.length) { System.out.println("Only " + lastResults.length + " parses found."); } else if (lastLFs[reading-1] == null) { System.out.println("LF not available, use :all command and reparse."); } else { lastReading = reading-1; } } catch (NumberFormatException exc) { System.out.println("Expecting a reading number, rather than: " + last); } } else if (input.equals(":timing on") || input.equals(":ton") || input.equals(":t")) { prefs.putBoolean(SHOW_TIMING, true); } else if (input.equals(":timing off") || input.equals(":toff")) { prefs.putBoolean(SHOW_TIMING, false); } else if (input.equals(":show incomplete") || input.equals(":inc")) { prefs.putBoolean(SHOW_INCOMPLETE_EDGES, true); prefs.putBoolean(Edge.SHOW_COMPLETENESS, true); } else if (input.equals(":noinc")) { prefs.putBoolean(SHOW_INCOMPLETE_EDGES, false); prefs.putBoolean(Edge.SHOW_COMPLETENESS, false); } else if (input.equals(":indexing on") || input.equals(":ion")) { prefs.putBoolean(EdgeFactory.USE_INDEXING, true); } else if (input.equals(":indexing off") || input.equals(":ioff")) { prefs.putBoolean(EdgeFactory.USE_INDEXING, false); } else if (input.equals(":missing index combos on") || input.equals(":mion")) { prefs.putBoolean(EdgeFactory.ALLOW_MISSING_INDEX_COMBOS, true); } else if (input.equals(":missing index combos off") || input.equals(":mioff")) { prefs.putBoolean(EdgeFactory.ALLOW_MISSING_INDEX_COMBOS, false); } else if (input.equals(":chunks on") || input.equals(":con")) { prefs.putBoolean(EdgeFactory.USE_CHUNKS, true); } else if (input.equals(":chunks off") || input.equals(":coff")) { prefs.putBoolean(EdgeFactory.USE_CHUNKS, false); } else if (input.equals(":feature licensing on") || input.equals(":flon")) { prefs.putBoolean(EdgeFactory.USE_FEATURE_LICENSING, true); } else if (input.equals(":feature licensing off") || input.equals(":floff")) { prefs.putBoolean(EdgeFactory.USE_FEATURE_LICENSING, false); } else if (input.equals(":combos on") || input.equals(":ccon")) { prefs.putBoolean(opennlp.ccg.realize.Chart.USE_COMBOS, true); } else if (input.equals(":combos off") || input.equals(":ccoff")) { prefs.putBoolean(opennlp.ccg.realize.Chart.USE_COMBOS, false); } else if (input.equals(":pon")) { prefs.putBoolean(opennlp.ccg.realize.Chart.USE_PACKING, true); } else if (input.equals(":poff")) { prefs.putBoolean(opennlp.ccg.realize.Chart.USE_PACKING, false); } else if (input.startsWith(":realize") || input.startsWith(":r")) { LF lf; NgramScorer ngramScorer; // nb: need to upgrade, consolidate :r FN option with Realize.java ... int space = input.indexOf(" "); if (space != -1) { // check for filename String filename = readFilename(input.substring(space)); if (filename == null) { System.out.println("Expecting a filename to read from."); continue; } try { Document doc = grammar.loadFromXml(filename); lf = Realizer.getLfFromDoc(doc); // nb: just handling explicit targets for now ... 
List targetElts = doc.getRootElement().getChildren("target"); String[] targets = new String[targetElts.size()]; for (int i=0; i < targetElts.size(); i++) { Element ex = (Element) targetElts.get(i); String target = ex.getText(); targets[i] = target; } ngramScorer = new NgramPrecisionModel(targets); } catch (IOException exc) { System.out.println("Unable to read: " + filename); System.out.println(exc.toString()); continue; } } else { // otherwise use last reading of last LF if (lastLFs == null || lastLFs[lastReading] == null) { System.out.println("Nothing to realize!"); continue; } lf = grammar.transformLF(lastLFs[lastReading]); String[] targets = new String[1]; targets[0] = lastSentence; ngramScorer = new NgramPrecisionModel(targets); } realizer.realize(lf, ngramScorer); opennlp.ccg.realize.Chart chart = realizer.getChart(); boolean showIncompleteEdges = prefs.getBoolean(SHOW_INCOMPLETE_EDGES, false); boolean showTiming = prefs.getBoolean(SHOW_TIMING, false); if (showIncompleteEdges) chart.printEdges(); else chart.printEdges(true, true); if (showTiming) { chart.printTiming(); } } else if (input.startsWith(":2xml")) { if (lastLFs == null || lastLFs[lastReading] == null) { System.out.println("Nothing to save!"); continue; } String filename = readFilename(input.substring(5)); if (filename == null) { System.out.println("Expecting a filename to save to."); continue; } grammar.saveToXml(lastLFs[lastReading], lastSentence, filename); System.out.println("Wrote LF to \"" + filename + "\""); } else if (input.startsWith(":2tb")) { if (lastLFs == null || lastLFs[lastReading] == null) { System.out.println("Nothing to save!"); continue; } String filename = readFilename(input.substring(4)); if (filename == null) { filename = "testbed.xml"; } RegressionInfo.addToTestbed(grammar, lastResults[lastReading], lastResults.length, lastLFs[lastReading], filename); System.out.println("Added test item to \"" + filename + "\""); } else if (input.startsWith(":2apml")) { if (lastSentence.length() == 0) { System.out.println("Nothing to save!"); continue; } String filename = readFilename(input.substring(6)); if (filename == null) { System.out.println("Expecting a filename to save to."); continue; } grammar.saveToApml(lastResults[lastReading], filename); System.out.println("Wrote \"" + lastSentence + "\" to \"" + filename + "\" as APML"); } else if (input.startsWith(":vison")) { prefs.putBoolean(VISUALIZE, true); if ((input.startsWith(":vison ")) && (input.length( )>= 8)) { String fname = input.substring(7); if (fname.lastIndexOf('.')!=-1) { System.out.println("Filename should not contain a suffix. 
Suffixes .tex and .dvi are assumed."); prefs.put("VISFNAME", ""); } else prefs.put("VISFNAME", fname); } else prefs.put("VISFNAME", ""); } else if (input.equals(":visoff")) { prefs.putBoolean(VISUALIZE, false); prefs.put("VISFNAME", ""); } else if (input.equals(":wordpos")) { prefs.putBoolean(Converter.USE_WORD_POSITIONS_FOR_ATOM_CONVERSION, true); } else if (input.equals(":nowordpos")) { prefs.putBoolean(Converter.USE_WORD_POSITIONS_FOR_ATOM_CONVERSION, false); } else if (input.equals(":eisner")) { prefs.putBoolean(AbstractCompositionRule.EISNER_CONSTRAINTS, true); AbstractCompositionRule.useEisnerConstraints = true; } else if (input.equals(":noeisner")) { prefs.putBoolean(AbstractCompositionRule.EISNER_CONSTRAINTS, false); AbstractCompositionRule.useEisnerConstraints = false; } else if (input.startsWith(":parse time limit") || input.startsWith(":ptl")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int limit = Integer.parseInt(last); prefs.putInt(Parser.PARSE_TIME_LIMIT, limit); } catch (NumberFormatException exc) { System.out.println("Expecting a time limit in ms, rather than: " + last); } } else if (input.startsWith(":no parse time limit") || input.startsWith(":noptl")) { prefs.putInt(Parser.PARSE_TIME_LIMIT, Parser.NO_TIME_LIMIT); } else if (input.startsWith(":parse edge limit") || input.startsWith(":pel")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int limit = Integer.parseInt(last); prefs.putInt(Parser.PARSE_EDGE_LIMIT, limit); } catch (NumberFormatException exc) { System.out.println("Expecting an edge limit, rather than: " + last); } } else if (input.startsWith(":no parse edge limit") || input.startsWith(":nopel")) { prefs.putInt(Parser.PARSE_EDGE_LIMIT, Parser.NO_EDGE_LIMIT); } else if (input.startsWith(":parse pruning value") || input.startsWith(":ppv")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int val = Integer.parseInt(last); prefs.putInt(Parser.PARSE_PRUNING_VALUE, val); } catch (NumberFormatException exc) { System.out.println("Expecting an integer pruning value, rather than: " + last); } } else if (input.startsWith(":no parse pruning value") || input.startsWith(":noppv")) { prefs.putInt(Parser.PARSE_PRUNING_VALUE, Parser.NO_PRUNING); } else if (input.startsWith(":parse cell pruning value") || input.startsWith(":pcpv")) { String[] tokens = input.split("\\s+"); String last = tokens[tokens.length-1]; try { int val = Integer.parseInt(last); prefs.putInt(Parser.PARSE_CELL_PRUNING_VALUE, val); } catch (NumberFormatException exc) { System.out.println("Expecting an integer cell pruning value, rather than: " + last); } } else if (input.startsWith(":no parse cell pruning value") || input.startsWith(":nopcpv")) { prefs.putInt(Parser.PARSE_CELL_PRUNING_VALUE, Parser.NO_PRUNING); } else if (input.equals(":plazy")) { prefs.putBoolean(Parser.PARSE_LAZY_UNPACKING, true); } else if (input.equals(":noplazy")) { prefs.putBoolean(Parser.PARSE_LAZY_UNPACKING, false); } else { try { if (input.length() == 0) { if (lastSentence.length() > 0) { input = lastSentence; } else { System.out.println("Nothing to parse!"); continue; } } parser.parse(input); List parses = parser.getResult(); Sign[] results = new Sign[parses.size()]; parses.toArray(results); int resLength = results.length; switch (resLength) { case 0: break; case 1: System.out.println(resLength + " parse found.\n"); break; default: System.out.println(resLength + " parses found.\n"); } lastResults = results; lastLFs = 
new LF[resLength]; if (input.length() > 0) { lastSentence = input; } lastReading = 0; boolean showall = prefs.getBoolean(SHOW_ALL_RESULTS, false); boolean showderivs = prefs.getBoolean(SHOW_DERIVATIONS, false); boolean showsem = prefs.getBoolean(SHOW_SEMANTICS, true); boolean visualize = prefs.getBoolean(VISUALIZE, false); boolean showfeats = prefs.getBoolean(SHOW_FEATURES, false); String feats_to_show = prefs.get(FEATURES_TO_SHOW, ""); Visualizer vis = null; String baseFileName = null; grammar.prefs.showSem = showsem; grammar.prefs.showFeats = showfeats; grammar.prefs.featsToShow = feats_to_show; if (visualize) { vis = new Visualizer(); if (prefs.get("VISFNAME", "").equals("")) baseFileName = vis.getTempFileName(); else baseFileName = prefs.get("VISFNAME", ""); vis.writeHeader(baseFileName+".tex"); } int numToShow = (showall) ? resLength : 1; for (int i=0; i < numToShow; i++) { Category cat = results[i].getCategory(); LF convertedLF = null; if (cat.getLF() != null) { cat = cat.copy(); Nominal index = cat.getIndexNominal(); Sign rootSign = results[i]; // could add a switch here for naming convention convertedLF = HyloHelper.compactAndConvertNominals(cat.getLF(), index, rootSign); lastLFs[i] = convertedLF; cat.setLF(null); } String parseNum = (resLength == 1) ? "Parse: " : ("Parse "+(i+1)+": "); System.out.print(parseNum + cat.toString()); if (showsem && convertedLF != null) { System.out.println(" : "); System.out.println(" " + convertedLF.prettyPrint(" ")); } else System.out.println(); if (showderivs) { System.out.println("------------------------------"); System.out.println(results[i].getDerivationHistory()); } if (visualize) vis.saveTeXFile(results[i], baseFileName + ".tex" ); } if (visualize) { vis.writeFooter(baseFileName + ".tex"); vis.show(baseFileName); if (prefs.get("VISFNAME","").equals("")) // If temporary file, vis.cleanFiles(baseFileName); // clean it else { vis.cleanAuxFiles(baseFileName); System.out.println("Saved to files " + baseFileName + ".tex and " + baseFileName + ".dvi"); } vis = null; } } catch(ParseException pe) { System.out.println(pe); } } } // store command input history in preferences prefs.put(historyKey, lineReader.getCommandHistory()); // done System.out.println("Exiting tccg."); System.exit(0); } // reads the next token in the string as a filename private static String readFilename(String s) throws IOException { StreamTokenizer st = new StreamTokenizer(new StringReader(s)); st.wordChars('/','/'); st.wordChars('\\','\\'); st.wordChars(':',':'); st.nextToken(); return st.sval; } /** Shows help for the command-line tool. 
*/ public static void showHelp() { System.out.println(); System.out.println("Commands for tccg (otherwise input is parsed):"); System.out.println(); System.out.println(" :sh\t\t\tshow current preference settings"); System.out.println(" :v\t\t\tverbose output"); System.out.println(" :reset\t\treset options to defaults"); System.out.println(" :feats (L)\t\tshow features (or just show features in list L)"); System.out.println(" :nofeats\t\tdon't show features"); System.out.println(" :sem\t\t\tshow semantics"); System.out.println(" :nosem\t\tdon't show semantics"); System.out.println(" :all\t\t\tshow all parse results"); System.out.println(" :notall\t\tdon't show all parse results"); System.out.println(" :derivs\t\tshow derivations"); System.out.println(" :noderivs\t\tdon't show derivations"); System.out.println(" :vison (FN)\t\tturn visualization on (saving to file with name FN)"); System.out.println(" :visoff\t\tturn visualization off"); System.out.println(" :wordpos\t\tuse word positions to name converted nominals"); System.out.println(" :nowordpos\t\tdon't use word positions to name converted nominals"); System.out.println(" :eisner\t\tuse Eisner constraints on composition"); System.out.println(" :noeisner\t\tturn off Eisner constraints on composition"); System.out.println(); System.out.println(" :ptl N\t\tset parse time limit to N ms"); System.out.println(" :noptl\t\tset parse time limit to none"); System.out.println(" :pel N\t\tset parse edge limit to N"); System.out.println(" :nopel\t\tset parse edge limit to none"); System.out.println(" :ppv N\t\tset parse pruning value to N"); System.out.println(" :noppv\t\tset parse pruning value to none"); System.out.println(" :pcpv N\t\tset parse cell pruning value to N"); System.out.println(" :nopcpv\t\tset parse cell pruning value to none"); System.out.println(" :plazy\t\tturn lazy unpacking on in parser"); System.out.println(" :noplazy\t\tturn lazy unpacking off in parser"); System.out.println(); System.out.println(" :r (FN)\t\trealize selected reading (or from XML file with name FN)"); System.out.println(" :sel N\t\tselect reading N for realization or saving"); System.out.println(" :2xml FN\t\tsave last input and LF to XML file with name FN"); System.out.println(" :2tb (FN)\t\tadd last input and LF as a test item (to file with name FN)"); System.out.println(" :2apml FN\t\tsave last input to APML file with name FN"); System.out.println(); System.out.println(" :tl N\t\t\tset realization time limit to N ms"); System.out.println(" :notl\t\t\tset realization time limit to none"); System.out.println(" :nbtl N\t\tset realization new best time limit to N ms | N < 1 of first"); System.out.println(" :nonbtl\t\tset realization new best time limit to none"); System.out.println(" :el N\t\t\tset realization edge limit to N"); System.out.println(" :noel\t\t\tset realization edge limit to none"); System.out.println(" :pv N\t\t\tset realization pruning value to N"); System.out.println(" :nopv\t\t\tset realization pruning value to none"); System.out.println(" :cpv N\t\tset realization cell pruning value to N"); System.out.println(" :nocpv\t\tset realization cell pruning value to none"); System.out.println(" :upon\t\t\tturn unpacking on"); System.out.println(" :upoff\t\tturn unpacking off"); System.out.println(" :t\t\t\tturn realization timing on"); System.out.println(" :toff\t\t\tturn realization timing off"); System.out.println(" :inc\t\t\tshow incomplete realization edges"); System.out.println(" :noinc\t\tdon't show incomplete realization edges"); 
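/*
 * A few concrete invocations of the options listed above (feature names and
 * file names are grammar-specific and purely illustrative):
 *
 *   :feats num pers      show only the num and pers features
 *   :sel 2               select the second parse for :r, :2xml, :2tb, :2apml
 *   :tl 10000            stop realization after 10 seconds
 *   :2tb mytests.xml     append the current reading to mytests.xml as a test item
 */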
System.out.println(); System.out.println(" :ion\t\t\tturn index filtering on"); System.out.println(" :ioff\t\t\tturn index filtering off"); System.out.println(" :mion\t\t\tturn missing index combos on"); System.out.println(" :mioff\t\tturn missing index combos off"); System.out.println(" :con\t\t\tturn LF chunks on"); System.out.println(" :coff\t\t\tturn LF chunks off"); System.out.println(" :flon\t\t\tturn feature licensing on"); System.out.println(" :floff\t\tturn feature licensing off"); System.out.println(" :ccon\t\t\tturn collected combos on"); System.out.println(" :ccoff\t\tturn collected combos off"); System.out.println(" :pon\t\t\tturn packing on"); System.out.println(" :poff\t\t\tturn packing off"); System.out.println(); System.out.println(" :q\t\t\tquit tccg"); System.out.println(" :h\t\t\tshow this message"); System.out.println(); } /** Shows current settings. */ public static void showSettings(Preferences prefs) { System.out.println(); System.out.println("Current preference settings:"); System.out.println(); boolean showfeats = prefs.getBoolean(SHOW_FEATURES, false); boolean showsem = prefs.getBoolean(SHOW_SEMANTICS, true); String feats = prefs.get(FEATURES_TO_SHOW, ""); System.out.println(" show feats:\t\t" + showfeats); System.out.println(" show semantics:\t" + showsem); if (showfeats) { System.out.println(" feats to show:\t" + ((feats.length() > 0) ? feats : "all")); } boolean showall = prefs.getBoolean(SHOW_ALL_RESULTS, false); boolean showderivs = prefs.getBoolean(SHOW_DERIVATIONS, false); System.out.println(" show all:\t\t" + showall); System.out.println(" show derivs:\t\t" + showderivs); boolean visualize = prefs.getBoolean(VISUALIZE, false); String visfname = prefs.get("VISFNAME", ""); System.out.println(" visualize:\t\t" + ((visualize) ? "on" : "off")); if (visfname.length() > 0) { System.out.println(" vis file name:\t" + visfname); } boolean wordpos = prefs.getBoolean(Converter.USE_WORD_POSITIONS_FOR_ATOM_CONVERSION, true); System.out.println(" word pos:\t\t" + ((wordpos) ? "on" : "off")); boolean eisner = prefs.getBoolean(AbstractCompositionRule.EISNER_CONSTRAINTS, true); System.out.println(" Eisner constraints:\t" + ((eisner) ? "on" : "off")); System.out.println(); int ptl = prefs.getInt(Parser.PARSE_TIME_LIMIT, Parser.NO_TIME_LIMIT); System.out.println(" parse time limit:\t" + ((ptl == Parser.NO_TIME_LIMIT) ? "none" : "" + ptl + " ms")); int pel = prefs.getInt(Parser.PARSE_EDGE_LIMIT, Parser.NO_EDGE_LIMIT); System.out.println(" parse edge limit:\t" + ((pel == Parser.NO_EDGE_LIMIT) ? "none" : "" + pel)); int ppv = prefs.getInt(Parser.PARSE_PRUNING_VALUE, Parser.NO_PRUNING); System.out.println(" parse pruning value:\t" + ((ppv == Parser.NO_PRUNING) ? "none" : "" + ppv)); int pcpv = prefs.getInt(Parser.PARSE_CELL_PRUNING_VALUE, Parser.NO_PRUNING); System.out.println(" parse cell prune val:\t" + ((pcpv == Parser.NO_PRUNING) ? "none" : "" + pcpv)); boolean plazy = prefs.getBoolean(Parser.PARSE_LAZY_UNPACKING, true); System.out.println(" lazy unpacking:\t" + ((plazy) ? "on" : "off")); System.out.println(); int tl = prefs.getInt(opennlp.ccg.realize.Chart.TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); System.out.println(" time limit:\t\t" + ((tl == opennlp.ccg.realize.Chart.NO_TIME_LIMIT) ? "none" : "" + tl + " ms")); double nbtl = prefs.getDouble(opennlp.ccg.realize.Chart.NEW_BEST_TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); String nbtlStr = (nbtl >= 1) ? 
(((int)nbtl) + " ms") : (nbtl + " of first"); System.out.println(" new best time limit:\t" + ((nbtl == opennlp.ccg.realize.Chart.NO_TIME_LIMIT) ? "none" : nbtlStr)); int el = prefs.getInt(opennlp.ccg.realize.Chart.EDGE_LIMIT, opennlp.ccg.realize.Chart.NO_EDGE_LIMIT); System.out.println(" edge limit:\t\t" + ((el == opennlp.ccg.realize.Chart.NO_EDGE_LIMIT) ? "none" : "" + el)); int pv = prefs.getInt(opennlp.ccg.realize.Chart.PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); System.out.println(" pruning value:\t" + ((pv == opennlp.ccg.realize.Chart.NO_PRUNING) ? "none" : "" + pv)); int cpv = prefs.getInt(opennlp.ccg.realize.Chart.CELL_PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); System.out.println(" cell pruning value:\t" + ((cpv == opennlp.ccg.realize.Chart.NO_PRUNING) ? "none" : "" + cpv)); boolean unpacking = prefs.getBoolean(opennlp.ccg.realize.Chart.DO_UNPACKING, true); System.out.println(" unpacking:\t\t" + ((unpacking) ? "on" : "off")); boolean showtiming = prefs.getBoolean(SHOW_TIMING, false); System.out.println(" timing:\t\t" + ((showtiming) ? "on" : "off")); boolean showinc = prefs.getBoolean(SHOW_INCOMPLETE_EDGES, false); System.out.println(" show incomplete:\t" + ((showinc) ? "on" : "off")); System.out.println(); boolean indexing = prefs.getBoolean(EdgeFactory.USE_INDEXING, true); boolean missingIndexCombos = prefs.getBoolean(EdgeFactory.ALLOW_MISSING_INDEX_COMBOS, false); boolean chunks = prefs.getBoolean(EdgeFactory.USE_CHUNKS, true); boolean licensing = prefs.getBoolean(EdgeFactory.USE_FEATURE_LICENSING, true); boolean combos = prefs.getBoolean(opennlp.ccg.realize.Chart.USE_COMBOS, true); boolean packing = prefs.getBoolean(opennlp.ccg.realize.Chart.USE_PACKING, false); System.out.println(" index filtering:\t" + ((indexing) ? "on" : "off")); System.out.println(" missing index combos:\t" + ((missingIndexCombos) ? "on" : "off")); System.out.println(" chunks:\t\t" + ((chunks) ? "on" : "off")); System.out.println(" licensing:\t\t" + ((licensing) ? "on" : "off")); System.out.println(" combos:\t\t" + ((combos) ? "on" : "off")); System.out.println(" packing:\t\t" + ((packing) ? "on" : "off")); System.out.println(); } } ================================================ FILE: src/opennlp/ccg/WebCCG.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2006 Ben Wing. // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg; import opennlp.ccg.lexicon.*; import opennlp.ccg.grammar.*; import opennlp.ccg.parse.*; import opennlp.ccg.util.*; import opennlp.ccg.synsem.*; //import opennlp.ccg.realize.*; import opennlp.ccg.hylo.*; //import opennlp.ccg.ngrams.*; //import opennlp.ccg.test.*; //import org.jdom.*; import java.io.*; import java.net.*; import java.util.*; /** * An interface for use with a higher-level web interface. This should * provide as simple an interface onto parsing as possible, with its output * in a format that can be easily handled by a CGI program or similar. * Called as * * webccg [-showall] [-showderivs] [-showsem] [-visualize FILE] GRAMMARDIR -showall shows all parses rather than just the first one. -showderivs shows the derivation history of each parse. -showsem shows the logical form of each parse. -visualize output a visualization of the parses into FILE (in PNG format). * * @author Ben Wing * @version $Revision: 1.4 $, $Date: 2009/12/21 03:27:18 $ */ public class WebCCG { /** Main method for tccg. */ public static void main(String[] args) throws IOException, LexException { String usage = "java opennlp.ccg.WebCCG " + "[-showall] [-showderivs] [-showsem] [-showfeats] [-visualize FILE] GRAMMARDIR\n" + "\n" + "-showall shows all parses rather than just the first one.\n" + "-showderivs shows the derivation history of each parse.\n" + "-showsem shows the logical form of each parse.\n" + "-showfeats shows the features associated with each nonterminal.\n" + "-visualize output a visualization of the parses into FILE (in TEX format).\n"+ " to convert to an image, try this:\n" + " latex foo.tex; dvips foo.dvi | pstopnm | pnmtopng > foo.png\n" + "\n" + "Sentences to parse are read from standard input.\n"; if (args.length > 0 && (args[0].equals("-h") || args[0].equals("-help"))) { System.out.println("Usage: " + usage); System.exit(0); } // args //String prefsfile = null; boolean showall = false; boolean showderivs = false; boolean showsem = false; boolean showfeats = false; String visfile = null; int i; for (i = 0; i < args.length; i++) { if (args[i].equals("-showall")) showall = true; else if (args[i].equals("-showderivs")) showderivs = true; else if (args[i].equals("-showsem")) showsem = true; else if (args[i].equals("-showfeats")) showfeats = true; else if (args[i].equals("-visualize")) visfile = args[++i]; else break; } if (i != args.length - 1) { System.out.println("Usage: " + usage); System.exit(0); } String grammarfile = args[i] + "/grammar.xml"; // load grammar URL grammarURL = new File(grammarfile).toURI().toURL(); //System.out.println("Loading grammar from URL: " + grammarURL); Grammar grammar = new Grammar(grammarURL); //if (grammar.getName() != null) // System.out.println("Grammar '" + grammar.getName() + "' loaded."); // create parser and realizer Parser parser = new Parser(grammar); //Realizer realizer = new Realizer(grammar); BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); while (true) { String input = br.readLine(); if (input == null) break; // EOF input = input.trim(); if (input.equals("")) continue; try { parser.parse(input); List parses = parser.getResult(); Sign[] results = new Sign[parses.size()]; parses.toArray(results); int resLength = results.length; System.out.print("\"" + input + "\": "); switch (resLength) { case 0: break; case 1: System.out.println(resLength + " parse found.\n"); break; default: System.out.println(resLength + " parses found.\n"); } 
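/*
 * Since sentences are read from standard input and results are echoed in the
 * fixed format printed here, a CGI script can simply pipe text through the
 * program.  A sketch of an exchange (the grammar directory and the category
 * shown are hypothetical; the surrounding format follows the code above):
 *
 *   echo "he sees her" | java opennlp.ccg.WebCCG GRAMMARDIR
 *
 *   "he sees her": 1 parse found.
 *
 *   Parse: s
 */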
Visualizer vis = null; grammar.prefs.showSem = showsem; grammar.prefs.showFeats = showfeats; grammar.prefs.featsToShow = ""; if (visfile != null) { vis = new Visualizer(); vis.writeHeader(visfile); } int numToShow = (showall) ? resLength : 1; for (i=0; i < numToShow; i++) { Category cat = results[i].getCategory(); LF convertedLF = null; if (cat.getLF() != null) { cat = cat.copy(); Nominal index = cat.getIndexNominal(); convertedLF = HyloHelper.compactAndConvertNominals(cat.getLF(), index, results[i]); cat.setLF(null); } String parseNum = (resLength == 1) ? "Parse: " : ("Parse "+(i+1)+": "); System.out.print(parseNum + cat.toString()); if (showsem && convertedLF != null) { System.out.println(" : "); System.out.println(" " + convertedLF.prettyPrint(" ")); } else System.out.println(); if (showderivs) { System.out.println("------------------------------"); System.out.println(results[i].getDerivationHistory()); } if (visfile != null) vis.saveTeXFile(results[i], visfile); } if (visfile != null) { vis.writeFooter(visfile); } } catch(ParseException pe) { System.out.print("\"" + input + "\": "); System.out.println(pe + ".\n"); } } } } ================================================ FILE: src/opennlp/ccg/alignment/AbstractEncodingScheme.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Set; /** * Abstract class for implementing encoding schemes. This base class provides a constructor taking all the * necessary arguments for implementing {@link EncodingScheme}, and gives getter methods for all of them, * as required. * @author Scott Martin * @see EncodingScheme */ public abstract class AbstractEncodingScheme implements EncodingScheme { /** * Delimits a field within a mapping. */ protected Character fieldDelimiter; /** * Delimits a mapping. */ protected Character mappingDelimiter; /** * Delimits a group of mappings. */ protected Character groupDelimiter; /** * The index base for IDs. */ protected IndexBase phraseNumberBase; /** * The index base for indices. */ protected IndexBase indexBase; /** * The order of the fields in mappings corresponding to this encoding scheme. */ protected List order; /** * The set of required fields in this encoding scheme. */ protected Set required; /** * The fields that this encoding scheme uses by default. 
*/ protected Set defaults; final boolean fieldDelimSep, mappingDelimSep, groupDelimSep; protected AbstractEncodingScheme(Character fieldDelimiter, Character mappingDelimiter, Character groupDelimiter, IndexBase phraseNumberBase, IndexBase indexBase, Set defaults, Set required, MappingFormat.Field... order) { this.fieldDelimiter = fieldDelimiter; this.mappingDelimiter = mappingDelimiter; this.groupDelimiter = groupDelimiter; this.phraseNumberBase = phraseNumberBase; this.indexBase = indexBase; fieldDelimSep = isLineSeparator(fieldDelimiter); mappingDelimSep = isLineSeparator(mappingDelimiter); groupDelimSep = isLineSeparator(groupDelimiter); this.defaults = Collections.unmodifiableSet(defaults); this.required = Collections.unmodifiableSet(required); this.order = Collections.unmodifiableList(Arrays.asList(order)); } static boolean isLineSeparator(Character c) { // TODO why doesn't Character.getType(c) == Character.LINE_SEPARATOR work? return c == '\r' || c == '\n'; } /** * Tests whether the supplied character counts as a field delimiter according to this encoding scheme. * @return true if c is equal to {@link #fieldDelimiter} or both c and * {@link #fieldDelimiter} are line separators. */ public boolean isFieldDelimiter(Character c) { return fieldDelimiter.equals(c) || (fieldDelimSep && isLineSeparator(c)); } /** * Tests whether the supplied character counts as a mapping delimiter according to this encoding scheme. * @return true if c is equal to {@link #mappingDelimiter} or both c and * {@link #mappingDelimiter} are line separators. */ public boolean isMappingDelimiter(Character c) { return mappingDelimiter.equals(c) || (mappingDelimSep && isLineSeparator(c)); } /** * Tests whether the supplied character counts as a group delimiter according to this encoding scheme. * @return true if c is equal to {@link #groupDelimiter} or both c and * {@link #groupDelimiter} are line separators. */ public boolean isGroupDelimiter(Character c) { return groupDelimiter.equals(c) || (groupDelimSep && isLineSeparator(c)); } /** * Gets the delimiter for fields. */ public Character getFieldDelimiter() { return fieldDelimiter; } /** * Gets the delimiter for mappings. */ public Character getMappingDelimiter() { return mappingDelimiter; } /** * Gets the delimiter for groups. */ public Character getGroupDelimiter() { return groupDelimiter; } /** * Gets the numbering base used for phrases. */ public IndexBase getPhraseNumberBase() { return phraseNumberBase; } /** * Gets the numbering base used for mapping indices. */ public IndexBase getIndexBase() { return indexBase; } /** * Gets the order in which fields occur in this encoding scheme. */ public List getOrder() { return order; } /** * Gets the required (non-optional) fields in this scheme. */ public Set getRequired() { return required; } /** * @return Gets the default fields used by this scheme. */ public Set getDefaults() { return defaults; } /** * Gets a hash code for this encoding scheme based on its delimiters, index bases, and fields. */ @Override public int hashCode() { return 37 * fieldDelimiter.hashCode() + groupDelimiter.hashCode() + mappingDelimiter.hashCode() + indexBase.hashCode() + phraseNumberBase.hashCode() + order.hashCode() + required.hashCode() + defaults.hashCode(); } /** * Tests whether this encoding scheme is equal to another based on its delimiters, index bases, and fields. 
*/ @Override public boolean equals(Object obj) { if(obj instanceof AbstractEncodingScheme) { AbstractEncodingScheme e = (AbstractEncodingScheme)obj; return fieldDelimiter.equals(e.fieldDelimiter) && groupDelimiter.equals(e.groupDelimiter) && mappingDelimiter.equals(e.mappingDelimiter) && indexBase.equals(e.indexBase) && phraseNumberBase.equals(e.phraseNumberBase) && order.equals(e.order) && required.equals(e.required) && defaults.equals(e.defaults); } return false; } } ================================================ FILE: src/opennlp/ccg/alignment/Alignment.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import static opennlp.ccg.alignment.PhrasePosition.A; import static opennlp.ccg.alignment.PhrasePosition.B; import java.util.AbstractMap; import java.util.AbstractSet; import java.util.Collection; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import opennlp.ccg.util.DelegatedFilter; import opennlp.ccg.util.Filter; import opennlp.ccg.util.FilteredSet; import opennlp.ccg.util.VisitedFilter; /** * An alignment consisting of a pair of phrases and a set of mappings between them. *

* This class is a flat representation of the mappings between indices in its * {@linkplain #getA() A-position} phrase and its {@linkplain #getB() B-position} phrase in that it is * simply a set of mappings. More granularity is available by calling * {@link #getTargets(Integer, PhrasePosition)}, which returns all the indices a certain index is mapped to * from a specified position. * Alignments also allow their indices to be accessed when the phrase position is not necessarily known, * via {@link #get(PhrasePosition)}, {@link #getIndices(PhrasePosition)}, and {@link #asMap(PhrasePosition)}. *

* A detached view of this alignment as a map whose keys are the indices in a * specified position and whose values are the sets of indices that each key index maps to can be obtained by calling * {@link #asMap(PhrasePosition)}. If only the set of indices mapped to by a certain index is required, * {@link #getTargets(Integer, PhrasePosition)} provides similar functionality. The static method * {@link #fromMap(Phrase, Phrase, Map)} allows an alignment to be reconstructed from a map of indices to * sets of indices. *

* A version of this alignment with the phrase positions reversed and all the mappings * {@linkplain Mapping#reverse() reversed} can be obtained by calling {@link #reverse()}. * * @author Scott Martin * @see PhrasePosition * @see Phrase * @see Mapping */ public class Alignment extends AbstractSet implements Comparable { final Phrase a, b; final Set mappings; /** * Creates a new alignment with the specified phrases and mappings between them. The specified set of * mappings is copied in a way that preserves whatever ordering is present in the original set, * via {@link LinkedHashSet}. * * @param a The phrase to use for {@linkplain PhrasePosition#A the "A" position}. * @param b The phrase to use for {@linkplain PhrasePosition#B the "B" position}. * @param mappings The mappings between a and b, where the * {@linkplain Mapping#getA() first index} is understood to belong to a and the * {@linkplain Mapping#getB() second index} is understood to belong to b. * * @throws IllegalArgumentException If either phrase is null, or if phrases a and b do not * have matching {@linkplain Phrase#getNumber() numbers}, if mappings is null, * or if any of the mappings have a non-null phrase number that is not equal to the phrases' numbers. * @throws IndexOutOfBoundsException If any of the mappings contains an index that does not * exist in the phrase in the corresponding position. * * @see LinkedHashSet */ public Alignment(Phrase a, Phrase b, Collection mappings) { checkPhrases(a, b); if(mappings == null) { throw new IllegalArgumentException("mappings is null"); } // have to set these first or checkMapping() throws exception this.a = a; this.b = b; for(Mapping m : mappings) { checkMapping(m); } this.mappings = new LinkedHashSet(mappings); } /** * Creates a new alignment based on the specified phrases and map view of their mappings. * @param a The {@linkplain PhrasePosition#A A-position} phrase. * @param b The {@linkplain PhrasePosition#B B-position} phrase. * @param map A map whose keys are the A-position indices and whose values are the B-position indices * that the corresponding key is mapped to. * @return A new alignment with mappings created based on the specified map. * * @see #asMap() */ public static Alignment fromMap(Phrase a, Phrase b, Map> map) { @SuppressWarnings("unchecked") Set ms = map.isEmpty() ? Collections.EMPTY_SET : new LinkedHashSet(); for(Integer k : map.keySet()) { for(Integer v : map.get(k)) { ms.add(new Mapping(a.getNumber(), k, v)); } } return new Alignment(a, b, ms); } /** * Creates an alignment based on this one except that the phrases have * switched positions and all of the mappings are reversed. * * @return A new alignment with the phrases swapped and all the mappings' * indices swapped. * * @see Mapping#reverse() */ public Alignment reverse() { @SuppressWarnings("unchecked") Alignment r = new Alignment(getB(), getA(), Collections.EMPTY_SET); for(Mapping m : mappings) { r.add(m.reverse()); } return r; } /** * Gets this alignment's number. * @return The value of the {@linkplain Phrase#getNumber() number} of the phrase in * both {@linkplain PhrasePosition positions}. */ public Integer getNumber() { return a.number; } /** * Gets the phrase in {@linkplain PhrasePosition#A A-position}. */ public Phrase getA() { return get(A); } /** * Gets the phrase in {@linkplain PhrasePosition#B B-position}. */ public Phrase getB() { return get(B); } /** * Gets the phrase in the specified position. * @param pos The position in which to find the phrase. 
* @return If pos is {@link PhrasePosition#A}, the A-phrase; otherwise the B-phrase. */ public Phrase get(PhrasePosition pos) { return (pos == A) ? a : b; } /** * Adds a new mapping to this alignment. * @throws IndexOutOfBoundsException If either of the indices in m is out of bounds for the * phrase in the corresponding {@linkplain PhrasePosition position}. */ @Override public boolean add(Mapping m) { checkMapping(m); return mappings.add(m); } /** * Gets an iterator over the mappings in this alignment. */ @Override public Iterator iterator() { return mappings.iterator(); } /** * Gets the number of mappings in this alignment. */ @Override public int size() { return mappings.size(); } /** * Compares this alignment to another by comparing their {@linkplain #getNumber() numbers}. * @param o The alignment to compare to. * @return The value of getNumber().compareTo(o.getNumber()). * @see Integer#compareTo(Integer) */ @Override public int compareTo(Alignment o) { return getNumber().compareTo(o.getNumber()); } /** * Tests whether this alignment is equal to another by comparing their mappings and their phrases. * @see Phrase#equals(Object) */ @Override public boolean equals(Object o) { if(o instanceof Alignment) { Alignment al = (Alignment)o; return super.equals(o) && a.equals(al.a) && b.equals(al.b); } return false; } /** * Generates a hash code for this alignment based on its mappings and phrases. */ @Override public int hashCode() { return 37 * super.hashCode() + a.hashCode() + b.hashCode(); } /** * Gets a string representation of this alignment with both phrases and the mappings between them. */ @Override public String toString() { StringBuilder sb = new StringBuilder(A.name()); sb.append(": "); sb.append(a.toString()); sb.append(", "); sb.append(B.name()); sb.append(": "); sb.append(b.toString()); sb.append(", mappings: "); sb.append(super.toString()); return sb.toString(); } // Views and conveniences /** * Gets the indices mapped to from the specified source, assuming that the source is in the * {@linkplain PhrasePosition#A A-position}. * * @return the value of getTargets(source, PhrasePosition.A) * @see #getTargets(Integer, PhrasePosition) */ public Set getTargets(Integer source) { return getTargets(source, A); } /** * Gets the indices mapped to by a specified index starting from the specified position. For example, if * an alignment contains the following mappings: *

	 * 7 <-> 4
	 * 3 <-> 4
	 * 4 <-> 4
	 * ...
* Then calling getTargets(4, {@link PhrasePosition#B}) returns a set containing 7, 3, * and 4. *
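* 
* As a rough, illustrative sketch of the same example (assuming phrases p1 and p2 built
* elsewhere, with p1 at least eight words long and p2 at least five):
* 
* 	 Alignment al = new Alignment(p1, p2, Arrays.asList(
* 	     new Mapping(p1.getNumber(), 7, 4),
* 	     new Mapping(p1.getNumber(), 3, 4),
* 	     new Mapping(p1.getNumber(), 4, 4)));
* 	 Set<Integer> sources = al.getTargets(4, PhrasePosition.B);   // {7, 3, 4}
* 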

* Calling this method is equivalent to calling * {@link #asMap(PhrasePosition) asMap}(sourcePosition).get(source), with the exception that if * no mappings have source in the source position, the empty set is returned rather than * null. * * @param source The index to look for targets of. * @param sourcePosition The phrase position to assume the source index belongs to. * * @return A set of indices in the {@linkplain PhrasePosition#opposite() opposite position} that the * specified source index maps to (the same as asMap(sourcePosition).get(source)), * or the empty set if no such indices are present. * * @see #add(Mapping) * @see #asMap(PhrasePosition) */ public Set getTargets(Integer source, PhrasePosition sourcePosition) { return new LinkedHashSet(new ValueView(source, sourcePosition)); } /** * Gets the indices in a specified phrase position. Specifically, returns a set containing every * integer i such that there exists a mapping in this alignment that returns i * for the call {@link Mapping#get(PhrasePosition)} with the specified position as argument. * @param position The position to get indices for. * @return The same value as asMap(position).keySet(). * @see #asMap(PhrasePosition) */ public Set getIndices(PhrasePosition position) { return new LinkedHashSet(new KeyView(position)); } /** * Gets a map view of this alignment from the {@linkplain Alignments#DEFAULT_PHRASE_POSITION * default phrase position}. * @see #asMap(PhrasePosition) */ public Map> asMap() { return asMap(Alignments.DEFAULT_PHRASE_POSITION); } /** * Gets a map view of this alignment from the specified key position. The returned map's keys are drawn * from the mappings by accessing the specified key position, while the values are aggregated together * into sets from the indices at keyPosition's {@linkplain PhrasePosition#opposite() opposite * position}. The returned map contains key/value pairs that can be used to reconstruct the alignment * it is based on via the {@link #fromMap(Phrase, Phrase, Map)} method. That is, calling *

Alignment.fromMap(a.getA(), a.getB(), a.asMap(PhrasePosition.A))
* for any alignment a always returns an alignment that is equivalent to a according * to the {@link #equals(Object)} method. *

* For example, if this alignment contains the following mappings *

	 * 0 <-> 0
	 * 0 <-> 1
	 * 1 <-> 2
	 * 3 <-> 2
*
* then calling asMap(PhrasePosition.A) returns a map with the key/value pairings *
	 * 0=[0, 1]
	 * 1=[2]
	 * 3=[2]
*
* while calling asMap(PhrasePosition.B) gives the map view from the "opposite * direction", i.e. *
	 * 0=[0]
	 * 1=[0]
	 * 2=[1, 3]
*
* Note that the order of the keys and values reflects the ordering of the alignment's mappings via * {@link LinkedHashMap} and {@link LinkedHashSet}, and is dependent * on its {@linkplain #iterator() iterator}. Also, the behavior of * the returned map is not specified if mappings are added to or removed from this alignment after a call to * asMap(). *
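* 
* A minimal sketch of the asMap/fromMap round trip described above (illustrative only, for
* some existing alignment al):
* 
* 	 Map<Integer, Set<Integer>> view = al.asMap(PhrasePosition.A);
* 	 Alignment copy = Alignment.fromMap(al.getA(), al.getB(), view);
* 	 // copy.equals(al) holds, per the contract of fromMap and asMap
* 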

* The returned map is detached from (not backed by) this set of mappings, so keys can be removed from and * added to it without any effect on this alignment. Similarly, the sets of indices that are the values * of its entry set can be modified without affecting this alignment. The * {@link #fromMap(Phrase, Phrase, Map)} provides the ability to create an alignment based on a map of * A indices to sets of B indices. * * @param keyPosition The phrase position that the resulting maps keys should be taken from. * @return A map whose {@linkplain Map#keySet() keys} are from the phrase in the specified position, and * whose values are sets of indices from the phrase in the {@linkplain PhrasePosition#opposite() opposite} * position. * * @see #fromMap(Phrase, Phrase, Map) * @see LinkedHashMap * @see LinkedHashSet */ public Map> asMap(PhrasePosition keyPosition) { return new LinkedHashMap>(new MapView(keyPosition)); } void checkPhrases(Phrase ap, Phrase bp) { if(ap == null) { throw new IllegalArgumentException(A.name() + " phrase is null"); } if(bp == null) { throw new IllegalArgumentException(B.name() + " phrase is null"); } if(!ap.number.equals(bp.number)) { throw new IllegalArgumentException("phrases have different numbers"); } } void checkMapping(Mapping m) { if(m == null) { throw new IllegalArgumentException("attempt to add null mapping"); } if(m.phraseNumber != null && !m.phraseNumber.equals(a.number)) { throw new IllegalArgumentException("mapping's phrase number does not match: expected " + a.number + ", but was " + m.phraseNumber); } for(PhrasePosition pos : PhrasePosition.values()) { checkIndex(m.get(pos), pos); } } void checkIndex(Integer index, PhrasePosition intendedPosition) { if(index == null) { throw new IllegalArgumentException("attempt to add null index in position " + intendedPosition.name()); } if(index < -1 || get(intendedPosition).size() <= index) { throw new IndexOutOfBoundsException(intendedPosition.name() + " index out of bounds: " + index); } } class MapView extends AbstractMap> { PhrasePosition keyPosition; MapView(PhrasePosition keyPosition) { this.keyPosition = keyPosition; } @Override public Set>> entrySet() { return new AbstractSet>>() { private Set keys = new KeyView(keyPosition); @Override public int size() { return keys.size(); } @Override public Iterator>> iterator() { return new Iterator>>() { private Iterator i = keys.iterator(); @Override public boolean hasNext() { return i.hasNext(); } @Override public Entry> next() { final Integer key = i.next(); // copy values because HashMap's constructor doesn't return new SimpleImmutableEntry>( key, new LinkedHashSet(new ValueView(key, keyPosition))); } @Override public void remove() { i.remove(); // throws UnsupportedOperationException } }; } }; } } abstract class IndexView extends AbstractSet { PhrasePosition indexPosition; Filter indexFilter; private Set indices; IndexView(PhrasePosition indexPosition, Filter indexFilter) { this.indexPosition = indexPosition; this.indexFilter = indexFilter; } Set indices() { return (indices == null) ? 
(indices = new FilteredSet(Alignment.this.mappings, indexFilter)) : indices; } @Override public int size() { return indices().size(); } @Override public Iterator iterator() { return new Iterator() { private Iterator i = indices().iterator(); @Override public boolean hasNext() { return i.hasNext(); } @Override public Integer next() { return i.next().get(indexPosition); } @Override public void remove() { throw new UnsupportedOperationException(); // just in case } }; } } class KeyView extends IndexView { KeyView(final PhrasePosition keyPosition) { super(keyPosition, new DelegatedFilter(new VisitedFilter()) { @Override public Integer delegateValueFor(Mapping e) { return e.get(keyPosition); } }); } } class ValueView extends IndexView { ValueView(final Integer key, final PhrasePosition keyPosition) { super(keyPosition.opposite(), new Filter() { @Override public boolean allows(Mapping m) { return key.equals(m.get(keyPosition)); } }); } } } ================================================ FILE: src/opennlp/ccg/alignment/Alignments.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import static opennlp.ccg.alignment.MappingFormat.Field.A_INDEX_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.B_INDEX_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.PHRASE_NUMBER_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.STATUS_FIELD; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.Reader; import java.io.Writer; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; import opennlp.ccg.alignment.MappingFormat.Field; /** * Static non-instantiable class that provides convenience methods for reading * and writing phrases, mappings, and alignments. *

* The tokenize convenience methods split a string into an array of strings, and the * untokenize methods provide their inverses. *
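* 
* For instance (illustrative only):
* 
* 	 String[] toks = Alignments.tokenize("the cat   sleeps");   // ["the", "cat", "sleeps"]
* 	 String sent = Alignments.untokenize(toks);                 // "the cat sleeps"
* 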

* The readXxxPhrases() methods all call {@link #readPhrases(PhraseReader)} to read * {@linkplain Phrase phrases} from an underlying reader. Depending on the method, phrases are assumed to be * identified or to merely occur in sequence. Similarly, the writeXxxPhrases() methods all call * {@link #writePhrases(List, PhraseWriter)}. *

* {@linkplain Mapping Mappings} can be read using readMappings(...) methods, which will read * mappings as formatted by the specified * format. The readSortedMappings(...) methods are variants of these that return a map with * sorted keys that map to {@linkplain SortedSet sorted sets} of mappings. * The methods {@link #writeMappings(Map, File, MappingFormat)} and * {@link #writeMappings(Map, Writer, MappingFormat)} perform the inverse of the methods for reading. *
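* 
* A sketch of reading and re-writing mappings (illustrative only; the file names are invented
* and fmt stands for a suitable MappingFormat obtained elsewhere):
* 
* 	 Map<Integer, Set<Mapping>> byPhrase = Alignments.readMappings(new File("gold.align"), fmt);
* 	 Alignments.writeMappings(byPhrase, new File("copy.align"), fmt);
* 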

* Finally, the readXxxAlignments() methods combine the methods for reading phrases and mappings to * allow {@linkplain Alignments alignments} to be read. The methods * {@link #writeAlignments(List, File, File, File, String, MappingFormat)} and * {@link #writeAlignments(List, Writer, Writer, Writer, String, MappingFormat)} write alignments according to * a specified word separator and mapping format. * * @see PhraseReader * @see PhraseWriter * @see IdentifiedPhraseReader * @see IdentifiedPhraseWriter * @see MappingReader * @see MappingWriter * @see MappingFormat * @author Scott Martin */ public final class Alignments { /** * The default status: {@link Status#SURE}. */ public static final Status DEFAULT_STATUS = Status.SURE; /** * The default mapping confidence: 1.0. */ public static final Double DEFAULT_CONFIDENCE = Double.valueOf(1.0d); /** * The default phrase numbering base: {@link IndexBase#ZERO}. */ public static final IndexBase DEFAULT_PHRASE_NUMBER_BASE = IndexBase.ZERO; /** * The default index base: {@link IndexBase#ZERO}. */ public static final IndexBase DEFAULT_INDEX_BASE = IndexBase.ZERO; /** * The default phrase position: {@link PhrasePosition#A}. */ public static final PhrasePosition DEFAULT_PHRASE_POSITION = PhrasePosition.A; /** * The default word delimiter pattern, which matches multiple whitespace characters. */ public static final Pattern DEFAULT_WORD_DELIMITER = Pattern.compile("\\s+"); /** * The default word separator, a single space. */ public static final String DEFAULT_WORD_SEPARATOR = " "; /** * The default phrase identifier attribute name, "snum". */ public static final String DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE = "snum"; /** * The default phrase tag, "s". */ public static final String DEFAULT_PHRASE_TAG = "s"; /** * The Moses encoding scheme. */ public static final EncodingScheme MOSES_ENCODING_SCHEME; /** * The NAACL encoding scheme. */ public static final EncodingScheme NAACL_ENCODING_SCHEME; /** * The default fields for the Moses format. */ public static final Set MOSES_DEFAULT_FIELDS; /** * The default fields for the NAACL format. */ public static final Set NAACL_DEFAULT_FIELDS; /** * The fields used in the abbreviated Moses format: {@link Field#A_INDEX_FIELD}, * {@link Field#B_INDEX_FIELD}. */ public static final Set MOSES_SHORT_FIELDS; /** * The fields used in the abbreviated NAACL format: {@link Field#PHRASE_NUMBER_FIELD}, * {@link Field#A_INDEX_FIELD}, {@link Field#B_INDEX_FIELD}, {@link Field#STATUS_FIELD}. */ public static final Set NAACL_SHORT_FIELDS; /** * The fields used in the extremely abbreviated NAACL format: {@link Field#PHRASE_NUMBER_FIELD}, * {@link Field#A_INDEX_FIELD}, {@link Field#B_INDEX_FIELD}. */ public static final Set NAACL_VERY_SHORT_FIELDS; /** * The default {@linkplain MappingFormat#isStrict() mapping format strictness}: false. */ public static final boolean DEFAULT_STRICTNESS = false; /** * The default {@linkplain IdentifiedPhraseWriter#isPadding() identified phrase writer padding}: * false. 
*/ public static final boolean DEFAULT_PHRASE_PADDING = false; static { Set msf = new HashSet(), nsf = new HashSet(), nvsf = new HashSet(); msf.add(A_INDEX_FIELD); msf.add(B_INDEX_FIELD); nsf.add(PHRASE_NUMBER_FIELD); nsf.add(A_INDEX_FIELD); nsf.add(B_INDEX_FIELD); nsf.add(STATUS_FIELD); nvsf.add(PHRASE_NUMBER_FIELD); nvsf.add(A_INDEX_FIELD); nvsf.add(B_INDEX_FIELD); MOSES_SHORT_FIELDS = Collections.unmodifiableSet(msf); NAACL_SHORT_FIELDS = Collections.unmodifiableSet(nsf); NAACL_VERY_SHORT_FIELDS = Collections.unmodifiableSet(nvsf); MOSES_DEFAULT_FIELDS = MOSES_SHORT_FIELDS; NAACL_DEFAULT_FIELDS = NAACL_SHORT_FIELDS; // these have to come last, they depend on some of the others MOSES_ENCODING_SCHEME = new MosesEncodingScheme(); NAACL_ENCODING_SCHEME = new NAACLEncodingScheme(); } private Alignments() { // this class should not be instantiated } /** * Tokenizes a string according to the {@linkplain #DEFAULT_WORD_DELIMITER default word delimiter}. */ public static String[] tokenize(String s) { return tokenize(s, DEFAULT_WORD_DELIMITER); } /** * Tokenizes a string according to the specified word * delimiter pattern. */ public static String[] tokenize(String s, Pattern wordDelimiter) { return wordDelimiter.split(s); } /** * Untokenizes a list of tokens into a single string, with former tokens separated by the * {@linkplain #DEFAULT_WORD_SEPARATOR}. * * @see #untokenize(List, String) */ public static String untokenize(List tokens) { return untokenize(tokens, DEFAULT_WORD_SEPARATOR); } /** * Untokenizes a list of tokens into a single string, with former tokens separated by the * specified delimiter string. */ public static String untokenize(List tokens, String delimiter) { return untokenize(tokens.toArray(new String[tokens.size()]), delimiter); } /** * Untokenizes an array of tokens into a single string using the * {@linkplain #DEFAULT_WORD_SEPARATOR default word separator}. * @param tokens The tokens to untokenize. * @return An untokenized string from the given tokens, with individual * tokens separated by the default word separator. */ public static String untokenize(String[] tokens) { return untokenize(tokens, DEFAULT_WORD_SEPARATOR); } /** * Untokenizes an array of tokens into a single string, with former tokens separated by the * specified delimiter string. */ public static String untokenize(String[] tokens, String delimiter) { StringBuilder sb = new StringBuilder(); for(String w : tokens) { if(sb.length() > 0) { sb.append(delimiter); } sb.append(w.toString()); } return sb.toString(); } /** * Reads phrases sequentially from the specified file. * @see #readPhrases(Reader) */ public static List readPhrases(File f) throws IOException { return readPhrases(new BufferedReader(new FileReader(f))); } /** * Reads phrases sequentially from the specified reader. * @see #readPhrases(Reader, IndexBase) */ public static List readPhrases(Reader r) throws IOException { return readPhrases(r, DEFAULT_PHRASE_NUMBER_BASE); } /** * Reads phrases from the specified reader. Phrases will have their line numbers translated into the * specified index base. * * @see #readPhrases(PhraseReader) */ public static List readPhrases(Reader r, IndexBase phraseNumberBase) throws IOException { PhraseReader reader = new PhraseReader(r, phraseNumberBase); try { return readPhrases(reader); } finally { reader.close(); } } /** * Reads phrases from the specified file.
* * @see #readIdentifiedPhrases(Reader) */ public static List readIdentifiedPhrases(File f) throws IOException { return readIdentifiedPhrases(new BufferedReader(new FileReader(f))); } /** * Reads phrases from the specified reader, using the default * {@linkplain #DEFAULT_PHRASE_NUMBER_BASE phrase number base}, * {@linkplain #DEFAULT_PHRASE_TAG phrase tag}, and * {@linkplain #DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE phrase identifier attribute}. * * @see #readIdentifiedPhrases(Reader, IndexBase, String, String) */ public static List readIdentifiedPhrases(Reader r) throws IOException { return readIdentifiedPhrases(r, DEFAULT_PHRASE_NUMBER_BASE, DEFAULT_PHRASE_TAG, DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE); } /** * Reads phrases from the specified reader. The input is assumed to have markup accompanying the phrase data * that indicates the {@linkplain Phrase#getId() phrase ID} for each phrase. * * @param r The underlying reader. * @param phraseNumberBase The index base to translate line numbers into. * @param phraseTag The tag name for markup signaling a phrase. * @param phraseIdentifierAttribute The name of the attribute that contains the phrase's ID in the markup. * * @see #readPhrases(PhraseReader) */ public static List readIdentifiedPhrases(Reader r, IndexBase phraseNumberBase, String phraseTag, String phraseIdentifierAttribute) throws IOException { PhraseReader reader = new IdentifiedPhraseReader(r, phraseNumberBase, phraseTag, phraseIdentifierAttribute); try { return readPhrases(reader); } finally { reader.close(); } } /** * Reads phrases sequentially from the specified phrase reader. * * @return A list of phrases in the order they are encountered by calling {@link PhraseReader#readPhrase()} * on the specified reader. * @throws IOException if one is thrown by the specified phrase reader. * @see PhraseReader */ public static List readPhrases(PhraseReader reader) throws IOException { List l = new ArrayList(); Phrase p; while((p = reader.readPhrase()) != null) { l.add(p); } return l; } /** * Writes a list of phrases with IDs to the specified file. * * @see #writeIdentifiedPhrases(List, Writer) */ public static void writeIdentifiedPhrases(List phrases, File f) throws IOException { writeIdentifiedPhrases(phrases, new BufferedWriter(new FileWriter(f))); } /** * Writes a list of phrases with IDs to the specified writer, using the default * {@linkplain #DEFAULT_WORD_SEPARATOR word separator}, * {@linkplain #DEFAULT_PHRASE_TAG phrase tag}, * {@linkplain #DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE phrase identifier attribute}, and * {@linkplain #DEFAULT_PHRASE_PADDING padding flag}. * * @see #writeIdentifiedPhrases(List, Writer, String, String, String, boolean) */ public static void writeIdentifiedPhrases(List phrases, Writer w) throws IOException { writeIdentifiedPhrases(phrases, w, DEFAULT_WORD_SEPARATOR, DEFAULT_PHRASE_TAG, DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE, DEFAULT_PHRASE_PADDING); } /** * Writes a list of phrases to the specified writer. * * @param phrases The phrases to write. * @param w The underlying writer. * @param wordSeparator The string to use to {@linkplain #untokenize(String[], String) untokenize} with. * @param phraseTag The name of the tag used to signal a phrase in the output markup. * @param phraseIdentifierAttribute The name of the attribute bearing the phrase's ID in the output markup. * @param padding Whether or not to include padding between the markup and the phrase data. 
* * @see #writePhrases(List, PhraseWriter) */ public static void writeIdentifiedPhrases(List phrases, Writer w, String wordSeparator, String phraseTag, String phraseIdentifierAttribute, boolean padding) throws IOException { PhraseWriter pw = new IdentifiedPhraseWriter(w, wordSeparator, phraseTag, phraseIdentifierAttribute, padding); try { writePhrases(phrases, pw); } finally { pw.close(); } } /** * Writes a list of phrases to the specified file. * * @see #writePhrases(List, Writer) */ public static void writePhrases(List phrases, File f) throws IOException { writePhrases(phrases, new BufferedWriter(new FileWriter(f))); } /** * Writes a list of phrases to the specified writer using the * {@linkplain #DEFAULT_WORD_SEPARATOR default word separator}. * * @see #writePhrases(List, Writer, String) */ public static void writePhrases(List phrases, Writer w) throws IOException { writePhrases(phrases, w, DEFAULT_WORD_SEPARATOR); } /** * Writes a list of phrases to the specified writer using the specified word separator. * * @see #writePhrases(List, PhraseWriter) */ public static void writePhrases(List phrases, Writer w, String wordSeparator) throws IOException { PhraseWriter pw = new PhraseWriter(w, wordSeparator); try { writePhrases(phrases, pw); } finally { pw.close(); } } /** * Writes a list of phrases to the specified phrase writer. * * @param phrases The phrases to write. * @param writer The underlying phrase writer. * @throws IOException if a call to {@link PhraseWriter#writePhrase(Phrase)} throws one for one of the * phrases. */ public static void writePhrases(List phrases, PhraseWriter writer) throws IOException { for(Phrase p : phrases) { writer.writePhrase(p); } } /** * Reads mappings from the specified file using the specified format. * * @see #readMappings(Reader, MappingFormat) */ public static Map> readMappings(File f, MappingFormat format) throws IOException { return readMappings(new BufferedReader(new FileReader(f)), format); } /** * Reads mappings from the specified reader using the specified format. * * @see #readMappings(MappingReader) */ public static Map> readMappings(Reader r, MappingFormat format) throws IOException { MappingReader mr = new MappingReader(r, format); try { return readMappings(mr); } finally { mr.close(); } } /** * Reads mappings from the specified mapping reader. Once all available mappings have been read, the * specified reader is {@linkplain MappingReader#close() closed}. * * @param reader The mapping reader to read mappings from. * @return A map whose keys are the {@linkplain Phrase#getNumber() phrase numbers} of the corresponding * phrases and whose values are sets containing the mappings for the key phrase. Both the keys and the * sets of mappings are maintained in the order in which they occur in the input. * * @throws IOException if the underlying reader throws an exception, or if one of the * {@linkplain MappingGroup mapping groups} contains a duplicate mapping. 
* * @see MappingReader */ public static Map> readMappings(MappingReader reader) throws IOException { Map> am = new LinkedHashMap>(); try { MappingGroup ag = null; while((ag = reader.nextGroup()) != null) { Set as = am.get(ag.phraseNumber); if(as == null) { as = new LinkedHashSet(); am.put(ag.phraseNumber, as); } while(reader.canRead()) { Mapping m = reader.readMapping(); if(!as.add(m)) { throw new IOException("duplicate mapping in group " + ag + ": " + m); } } } } finally { reader.close(); } return am; } /** * Reads mappings into a sorted map from the specified file, based on the specified format. * * @see #readSortedMappings(Reader, MappingFormat) */ public static SortedMap> readSortedMappings(File f, MappingFormat format) throws IOException { return readSortedMappings(new BufferedReader(new FileReader(f)), format); } /** * Reads mappings into a sorted map from the specified reader, using the specified format to parse * mappings. * * @see #readSortedMappings(MappingReader) */ public static SortedMap> readSortedMappings(Reader r, MappingFormat format) throws IOException { MappingReader mr = new MappingReader(r, format); try { return readSortedMappings(mr); } finally { mr.close(); } } /** * Reads mappings into a sorted map from the specified reader, using the specified format to parse * mappings. * * @param mr The mapping reader to use. * @return A sorted map whose keys and values are also sorted according to their natural order. * * @throws IOException if the underlying reader throws an exception. * * @see Mapping#compareTo(Mapping) */ public static SortedMap> readSortedMappings(MappingReader mr) throws IOException { SortedMap> sm = new TreeMap>(); Map> m = readMappings(mr); for(Integer k : m.keySet()) { sm.put(k, new TreeSet(m.get(k))); } return sm; } /** * Writes the specified map to the specified file using the format provided. * * @see #writeMappings(Map, Writer, MappingFormat) */ public static void writeMappings(Map> map, File f, MappingFormat format) throws IOException { writeMappings(map, new BufferedWriter(new FileWriter(f)), format); } /** * Writes the specified map to the specified writer using the format provided. * * @see #writeMappings(Map, MappingWriter) */ public static void writeMappings(Map> map, Writer w, MappingFormat format) throws IOException { writeMappings(map, new MappingWriter(w, format)); } /** * Writes the specified map to the specified writer, starting {@linkplain MappingGroup mapping groups} * as needed based on the key and the {@linkplain Set#size() size} of the value set. After all the sets * of mappings in the map have been written, {@link MappingWriter#close()} is called. * * @param map The mappings to write. * @param writer The underlying writer to write to. * @see MappingGroup * @throws IOException if one occurs in the underlying writer. */ public static void writeMappings(Map> map, MappingWriter writer) throws IOException { try { for(Integer k : map.keySet()) { Set as = map.get(k); writer.startGroup(new MappingGroup(k, as.size())); for(Mapping a : as) { writer.writeMapping(a); } } } finally { writer.close(); } } /** * Reads alignments from the specified files, using the provided mapping format. 
* * @see #readAlignments(Reader, Reader, Reader, MappingFormat) */ public static List readAlignments(File phraseA, File phraseB, File mappings, Pattern wordDelimiter, MappingFormat format) throws IOException { return readAlignments(new BufferedReader(new FileReader(phraseA)), new BufferedReader(new FileReader(phraseB)), new BufferedReader(new FileReader(mappings)), format); } /** * Reads alignments from the specified readers, using the provided mapping format. * * @see #readAlignments(PhraseReader, PhraseReader, Reader, MappingFormat) */ public static List readAlignments(Reader phraseA, Reader phraseB, Reader mappings, MappingFormat format) throws IOException { IndexBase idBase = format.encodingScheme.getPhraseNumberBase(); return readAlignments(new PhraseReader(phraseA, idBase), new PhraseReader(phraseB, idBase), mappings, format); } /** * Reads alignments from the specified files, using the provided word delimiter pattern and mapping format. * The files containing phrases are assumed to contain markup indicating the phrase IDs. * * @see #readIdentifiedAlignments(Reader, Reader, Reader, MappingFormat) */ public static List readIdentifiedAlignments(File phraseA, File phraseB, File mappings, Pattern wordDelimiter, MappingFormat format) throws IOException { return readIdentifiedAlignments(new BufferedReader(new FileReader(phraseA)), new BufferedReader(new FileReader(phraseB)), new BufferedReader(new FileReader(mappings)), format); } /** * Reads alignments from the specified readers, using the provided word delimiter pattern and mapping format. * The readers for phrases are assumed to have input with markup indicating the phrase IDs. * * @see #readAlignments(PhraseReader, PhraseReader, Reader, MappingFormat) */ public static List readIdentifiedAlignments(Reader phraseA, Reader phraseB, Reader mappings, MappingFormat format) throws IOException { return readAlignments(new IdentifiedPhraseReader(phraseA), new IdentifiedPhraseReader(phraseB), mappings, format); } /** * Reads alignments from the specified readers, using the provided format to parse mappings. The * line numbers of the phrases are translated into the * {@linkplain EncodingScheme#getPhraseNumberBase() phrase number base} of the * {@linkplain MappingFormat#getEncodingScheme() format's encoding scheme}. * * @param phraseA The reader from which the {@linkplain PhrasePosition#A A-position} phrases are read. * @param phraseB The reader from which the {@linkplain PhrasePosition#B B-position} phrases are read. * @param mappings The reader whose input contains the mappings from A-phrases to B-phrases, where the * A-position indices are assumed to correspond to the phrases in phraseA and the B-position * indices are assumed to correspond to the phrases in phraseB. * @param format The mapping format used to parse mappings read from mapping. * @return A list of alignments where the {@linkplain Alignment#getA() A-phrases} are from phraseA, * the {@linkplain Alignment#getB() B-phrases} are from phraseB, and the mappings are the ones * read from mappings with the corresponding {@linkplain Phrase#getNumber() phrase number}. * @throws IOException if phraseA has a different number of phrases than phraseB, or if * one is thrown by any of the underlying readers. 
* * @see PhraseReader * @see MappingReader * @see #readPhrases(PhraseReader) * @see #readMappings(MappingReader) */ public static List readAlignments(PhraseReader phraseA, PhraseReader phraseB, Reader mappings, MappingFormat format) throws IOException { List m = new ArrayList(); try { List ps1 = readPhrases(phraseA), ps2 = readPhrases(phraseB); Map> mm = readMappings(mappings, format); // sanity check if(ps1.size() != ps2.size()) { throw new IOException("number of phrases different between first and second"); } for(int i = 0; i < ps1.size(); i++) { m.add(new Alignment(ps1.get(i), ps2.get(i), mm.get(i))); } } finally { phraseA.close(); phraseA.close(); mappings.close(); } return m; } /** * Writes a list of alignments to the specified files, using the word separator and format provided. * * @see #writeAlignments(List, Writer, Writer, Writer, String, MappingFormat) */ public static void writeAlignments(List alignments, File phraseA, File phraseB, File mappings, String wordSeparator, MappingFormat format) throws IOException { writeAlignments(alignments, new BufferedWriter(new FileWriter(phraseA)), new BufferedWriter(new FileWriter(phraseB)), new BufferedWriter(new FileWriter(mappings)), wordSeparator, format); } /** * Writes a list of alignments to the specified readers, using the word separator and format provided. * After all the alignments are written, all the associated writers are closed. * * @param alignments The alignments to write. * @param phraseA The writer to which {@linkplain Alignment#getA() A-position phrases} are written. * @param phraseB The writer to which {@linkplain Alignment#getB() B-position phrases} are written. * @param mappings The mappings to write, where the {@linkplain Mapping#getA() A-position indices} are * assumed to correspond to the phrases in phraseA and the * {@linkplain Mapping#getB() B-position indices} are assumed to correspond to the ones in phraseB. * @param wordSeparator The word separator to use for * {@linkplain #untokenize(List, String) untokenization}. * @param format The format to use for formatting mappings in the alignments. * @throws IOException if one is thrown by any of the underlying writers. */ public static void writeAlignments(List alignments, Writer phraseA, Writer phraseB, Writer mappings, String wordSeparator, MappingFormat format) throws IOException { PhraseWriter pw1 = new PhraseWriter(phraseA, wordSeparator), pw2 = new PhraseWriter(phraseB, wordSeparator); MappingWriter mw = new MappingWriter(mappings, format); try { for(int i = 0; i < alignments.size(); i++) { Alignment a = alignments.get(i); pw1.writePhrase(a.a); pw2.writePhrase(a.b); mw.startGroup(new MappingGroup(i, a.size())); for(Mapping m : a) { mw.writeMapping(m); } } } finally { pw1.close(); pw2.close(); mw.close(); } } } ================================================ FILE: src/opennlp/ccg/alignment/EncodingScheme.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.util.List; import java.util.Set; /** * An encoding scheme for mappings. Some examples are * the {@linkplain MosesEncodingScheme Moses scheme} and the {@linkplain NAACLEncodingScheme NAACL scheme}. *

* Implementers keep track of the {@link IndexBase}s an encoding scheme uses (for both phrase * numbers and indices) and the characters it uses to delimit mappings, groups of mappings, and fields within * mappings. An encoding scheme also captures which fields occur * in which {@linkplain #getOrder() order}, along with which ones are * {@linkplain #getRequired() required}, and which are used by {@linkplain #getDefaults() default}. *
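* 
* For example (illustrative only), the predefined Moses scheme can be inspected like this:
* 
* 	 EncodingScheme scheme = Alignments.MOSES_ENCODING_SCHEME;
* 	 List<MappingFormat.Field> order = scheme.getOrder();
* 	 Set<MappingFormat.Field> required = scheme.getRequired();
* 	 IndexBase indexBase = scheme.getIndexBase();
* 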

* Some convenience methods are provided for determining whether a given * character is the field, mapping, or group delimiter for this encoding scheme. These methods mainly allow * comparison when one character is a line separator that may be different from the one on the current platform. *
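* 
* For instance (illustrative only), given any scheme whose mapping delimiter is a line
* separator, both of these calls return true regardless of the platform's own line separator:
* 
* 	 scheme.isMappingDelimiter('\n');
* 	 scheme.isMappingDelimiter('\r');
* 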

* Encoding schemes are used in the {@link MappingFormat} class, as well as in * the readers and writers for {@link Mapping}s and {@link Alignment}s. * * @author Scott Martin * @see AbstractEncodingScheme * @see MappingFormat * @see Moses alignment format * @see NAACL shared task alignment format */ public interface EncodingScheme { /** * Tests whether the supplied character counts as a field delimiter according to this encoding scheme. * @return true if c is equal to {@link #getFieldDelimiter()} or both c and * field delimiters are line separators. */ public boolean isFieldDelimiter(Character c); /** * Tests whether the supplied character counts as a mapping delimiter according to this encoding scheme. * @return true if c is equal to {@link #getMappingDelimiter()} or both c and * mapping delimiters are line separators. */ public boolean isMappingDelimiter(Character c); /** * Tests whether the supplied character counts as a group delimiter according to this encoding scheme. * @return true if c is equal to {@link #getGroupDelimiter()} or both c and * group delimiters are line separators. */ public boolean isGroupDelimiter(Character c); /** * Gets the delimiter for fields. */ public Character getFieldDelimiter(); /** * Gets the delimiter for mappings. */ public Character getMappingDelimiter(); /** * Gets the delimiter for groups. */ public Character getGroupDelimiter(); /** * Gets the numbering base used for phrases. */ public IndexBase getPhraseNumberBase(); /** * Gets the numbering base used for mapping indices. */ public IndexBase getIndexBase(); /** * Gets the order in which fields occur in this encoding scheme. */ public List getOrder(); /** * Gets the required (non-optional) fields in this scheme. */ public Set getRequired(); /** * Gets the default fields in this scheme. */ public Set getDefaults(); } ================================================ FILE: src/opennlp/ccg/alignment/IdentifiedPhraseReader.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import static opennlp.ccg.alignment.Alignments.DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE; import static opennlp.ccg.alignment.Alignments.DEFAULT_PHRASE_NUMBER_BASE; import static opennlp.ccg.alignment.Alignments.DEFAULT_PHRASE_TAG; import static opennlp.ccg.alignment.Alignments.DEFAULT_WORD_DELIMITER; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.List; /** * Phrase reader for phrases that have an identifier in addition to a line number, usually formatted by * markup like *

 * <s snum="157"> ... </s>
 * 
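* 
* A minimal reading sketch (illustrative only; the input string is invented):
* 
* 	 IdentifiedPhraseReader reader = new IdentifiedPhraseReader(
* 	     new StringReader("<s snum=\"157\"> the cat sleeps </s>"));
* 	 Phrase p = reader.readPhrase();   // p.getId() yields "157"
* 	 reader.close();
* 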
* Since the value of the identifier is not necessarily an {@link Integer}, this class gives IDs as strings. * The {@linkplain #getPhraseIdentifierAttribute() phrase identifier attribute} (here snum) and * {@linkplain #getPhraseTag() phrase tag} (here s) are configurable when instances are * constructed. * * @author Scott Martin * @see Alignments#DEFAULT_PHRASE_TAG * @see Alignments#DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE */ public class IdentifiedPhraseReader extends PhraseReader { static final String TAG_START = "<", TAG_END = ">"; final String phraseTag, phraseIdentifierAttribute; private String lastId = null; /** * Creates an identified phrase reader with the {@linkplain Alignments#DEFAULT_PHRASE_NUMBER_BASE default * number base}. * * @see #IdentifiedPhraseReader(Reader, IndexBase) * @see Alignments#DEFAULT_PHRASE_TAG * @see Alignments#DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE */ public IdentifiedPhraseReader(Reader in) { this(in, DEFAULT_PHRASE_NUMBER_BASE); } /** * Creates an identified phrase reader with the {@linkplain Alignments#DEFAULT_PHRASE_TAG default phrase tag} * and {@link Alignments#DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE default phrase identifier attribute}. * * @see #IdentifiedPhraseReader(Reader, IndexBase, String, String) * @see Alignments#DEFAULT_PHRASE_TAG * @see Alignments#DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE */ public IdentifiedPhraseReader(Reader in, IndexBase numberBase) { this(in, numberBase, DEFAULT_PHRASE_TAG, DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE); } /** * Creates an identified phrase reader for the specified phrase tag and identifier attribute. * @param phraseTag The phrase tag that will be used to parse identifiers. * @param phraseIdentifierAttribute The attribute that will denote identifiers when parsing phrases. * @throws IllegalArgumentException if phraseIdentifierAttribute or phraseTag * is null. */ public IdentifiedPhraseReader(Reader in, IndexBase numberBase, String phraseTag, String phraseIdentifierAttribute) { super(in, numberBase); if(phraseIdentifierAttribute == null) { throw new IllegalArgumentException("phraseIdentifierAttribute is null"); } if(phraseTag == null) { throw new IllegalArgumentException("phraseTag is null"); } this.phraseIdentifierAttribute = phraseIdentifierAttribute; this.phraseTag = phraseTag; } /** * Gets the last ID encountered. */ public String getLastId() { return lastId; } /** * Gets the tag that denotes a phrase. */ public String getPhraseTag() { return phraseTag; } /** * Gets the attribute name that denotes a phrase identifier. */ public String getPhraseIdentifierAttribute() { return phraseIdentifierAttribute; } /** * Reads a phrase from the underlying input stream, first parsing the {@linkplain Phrase#getNumber() * phrase's number} based on the {@linkplain #getPhraseTag() phrase tag} and * {@linkplain #getPhraseIdentifierAttribute() identifier attribute} being used. * @return A phrase with the ID signaled in the input. * @throws IOException If the underlying input contains ill-formated phrase markup, or if no ID cannot be * determined after parsing the phrase markup. */ @Override public Phrase readPhrase() throws IOException { String ln = readLine(); if(ln == null) { return null; } String[] chunks = DEFAULT_WORD_DELIMITER.split(ln); String c = chunks[0]; if(!c.trim().startsWith(TAG_START)) { throw new IOException("unable to parse: " + ln + "; expected <, but was " + c.trim()); } int clen = c.length(); int pos = (clen > 1) ? 1 : 2; String t = (pos == 1) ? 
c.substring(1).trim() : chunks[1]; if(!t.equals(phraseTag)) { throw new IOException("expected sequence tag " + phraseTag + ", but was " + t); } boolean foundIndex = false; int start = -1; for(int i = pos; i < chunks.length; i++) { if(!foundIndex) { String[] subchunks = chunks[i].trim().split("="); if(subchunks.length > 1) { if(subchunks[0].equals(phraseIdentifierAttribute)) { char[] idVal = subchunks[1].toCharArray(); int idStart = 0, idEnd = idVal.length - 1; boolean foundStart = false; for(int j = 0; j < idVal.length; j++) { if(idVal[j] == '\'' || idVal[j] == '\"') { if(foundStart) { idEnd = j; break; } else { foundStart = true; idStart = j + 1; } } } lastId = new String(idVal).substring(idStart, idEnd); foundIndex = true; } } } if(chunks[i].contains(TAG_END)) { start = i; break; } } if(!foundIndex) { throw new IOException("no ID found on line " + getLineNumber()); } List l = new ArrayList(chunks.length); for(int j = start; j < chunks.length; j++) { String cj = chunks[j]; if(j == start) { int te = cj.indexOf(TAG_END); if(te != -1) { cj = cj.substring(te + 1); } } if(j + 1 == chunks.length) { int ts = cj.indexOf(TAG_START); if(ts != -1) { cj = cj.substring(0, ts); } } if(cj.length() > 0) { l.add(cj); } } return new Phrase(lastId, getPhraseNumber(), l); } } ================================================ FILE: src/opennlp/ccg/alignment/IdentifiedPhraseWriter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.io.IOException; import java.io.Writer; import static opennlp.ccg.alignment.Alignments.*; /** * A writer for phrases with {@linkplain Phrase#getId() ids} in addition to * {@linkplain Phrase#getNumber() numbers}. Identified phrases are ones read from * markup that signals an ID for each phrase. *

* In addition to allowing the phrase tag and identifier attribute to be configured, this class can optionally * output {@linkplain #isPadding() padding} of a single space between the markup and the phrase. * * @author Scott Martin * @see IdentifiedPhraseReader */ public class IdentifiedPhraseWriter extends PhraseWriter { final String phraseTag, phraseIdentifierAttribute; final boolean padding; /** * Creates a new identified phrase writer. The word separator used is * {@link Alignments#DEFAULT_WORD_SEPARATOR}. * @see #IdentifiedPhraseWriter(Writer, String) */ public IdentifiedPhraseWriter(Writer out) { this(out, Alignments.DEFAULT_WORD_SEPARATOR); } /** * Creates a new identified phrase writer with the given word separator. The phrase tag used is the default, * {@link Alignments#DEFAULT_PHRASE_TAG}, as is the phrase ID attribute * ({@link Alignments#DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE}). * @see #IdentifiedPhraseWriter(Writer, String, String, String) */ public IdentifiedPhraseWriter(Writer out, String wordSeparator) { this(out, wordSeparator, DEFAULT_PHRASE_TAG, DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE); } /** * Creates a new identified phrase writer with the given word separator. The phrase tag used is the default, * {@link Alignments#DEFAULT_PHRASE_TAG}, as is the phrase ID attribute * ({@link Alignments#DEFAULT_PHRASE_IDENTIFIER_ATTRIBUTE}). * @see #IdentifiedPhraseWriter(Writer, String, String, String, boolean) */ public IdentifiedPhraseWriter(Writer out, String wordSeparator, String phraseTag, String phraseIdentifierAttribute) { this(out, wordSeparator, phraseTag, phraseIdentifierAttribute, DEFAULT_PHRASE_PADDING); } /** * Creates a new identified phrase writer for writing phrases to the underlying writer. * @param phraseTag The name of the tag that holds the phrase identifier. * @param phraseIdentifierAttribute The phrase identifier attribute. * @param padding Whether this writer should write a space between the pre-markup and the phrase, and * between the phrase and the post-markup. * @throws IllegalArgumentException if phraseTag or phraseIdentifierAttribute * is null. */ public IdentifiedPhraseWriter(Writer out, String wordSeparator, String phraseTag, String phraseIdentifierAttribute, boolean padding) { super(out, wordSeparator); if(phraseTag == null) { throw new IllegalArgumentException("phraseTag is null"); } if(phraseIdentifierAttribute == null) { throw new IllegalArgumentException("phraseIdentifierAttribute is null"); } this.phraseTag = phraseTag; this.phraseIdentifierAttribute = phraseIdentifierAttribute; this.padding = padding; } /** * Gets the phrase tag this writer generates with each phrase. */ public String getPhraseTag() { return phraseTag; } /** * Gets the phrase identifier attribute used to signal the phrase ID for each phrase. */ public String getPhraseIdentifierAttribute() { return phraseIdentifierAttribute; } /** * Tests whether this writer writes space padding between its pre- and post-markup and the phrase itself. * @return true if this writer uses padding. */ public boolean isPadding() { return padding; } /** * Writes the start markup indicating a phrase's ID. The tag and ID attribute are the ones used to * create this writer. * * @see #getPhraseTag() * @see #getPhraseIdentifierAttribute() * @throws IOException If a problem occurs in the underlying writer. 
*/ @Override public void preWritePhrase(Phrase phrase) throws IOException { String id = phrase.getId(); if(id == null) { throw new IOException("attempt to write phrase with null ID: " + phrase); } printWriter.print('<'); printWriter.print(phraseTag); printWriter.print(' '); printWriter.print(phraseIdentifierAttribute); printWriter.print("=\""); printWriter.print(id); printWriter.print("\">"); if(padding) { printWriter.print(' '); } } /** * Writes the end markup signaling the end of a phrase, plus a line separator. * * @throws IOException If a problem occurs in the underlying writer. */ @Override public void postWritePhrase(Phrase phrase) throws IOException { if(padding) { printWriter.print(' '); } printWriter.print(""); } } ================================================ FILE: src/opennlp/ccg/alignment/IndexBase.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; /** * A set of {@link Enum} constants representing the two common indexing bases used in representing alignments. * The constants are ordered so that their {@link Enum#ordinal()} method returns a number corresponding to * their {@linkplain #getStart() start index}. *
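 * For instance (an illustrative sketch):
 * 
 * IndexBase.ONE.getStart();                      // 1
 * IndexBase.ONE.getNullValue();                  // 0
 * IndexBase.ZERO.translate(2, IndexBase.ONE);    // 3
 * 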

* An index base is characterized by its {@linkplain IndexBase#getStart() starting point} and its * {@linkplain #getNullValue() null value}, which is derived from its starting point by decrementing by one. * One index base can translate an integer in that base into another base via the * {@link #translate(Integer, IndexBase)} method. * * @author Scott Martin * @see EncodingScheme */ public enum IndexBase { /** * The index base that starts with 0. */ ZERO, /** * The index base starting with 1. */ ONE; final Integer start, nullValue; private IndexBase() { this.start = Integer.valueOf(ordinal()); this.nullValue = start - 1; } /** * Gets the starting point of this index base. */ public Integer getStart() { return start; } /** * This index base's special null value, the value of {@link #getStart()} - 1. */ public Integer getNullValue() { return nullValue; } /** * Tests whether the specified index is valid in this index base. * @param index The index to test. * @return true If index is non-null and not less than {@link #getNullValue()}. */ public boolean isValidIndex(Integer index) { return index != null && nullValue <= index; } /** * Translates an index in this base to another base. For example, ZERO.translate(2, ONE) yields * 3 and ONE.translate(1, ZERO) yields 0. Note that supplying the same * index base as the target has no effect, so that if b is an index base constant, then * b.translate(n, b) returns n for every {@link Integer} n as long * as n is {@linkplain #isValidIndex(Integer) valid} (throwing an exception otherwise). * @param index The index to translate. * @param target The target index base to translate index into. * @return The value of index as it is represented in the index base target. * @throws IllegalArgumentException If index is invalid for this index base. * @see #isValidIndex(Integer) */ public Integer translate(Integer index, IndexBase target) { if(!isValidIndex(index)) { throw new IllegalArgumentException("invalid index for index base " + name() + ": " + index); } if(target == this) { return index; } return Integer.valueOf(index + (target.start - start)); } } ================================================ FILE: src/opennlp/ccg/alignment/Mapping.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import static opennlp.ccg.alignment.PhrasePosition.*; /** * A mapping from an index in the {@linkplain PhrasePosition#A A} phrase position to an index in the * {@linkplain PhrasePosition#B B} phrase position. Mappings are interpreted as the individual pairs * that make up an {@link Alignment} from one {@link Phrase} to another. 
This class implements * {@link Comparable} so that mappings can be easily sorted (see the {@link #compareTo(Mapping)} method). *
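 * A minimal construction sketch (illustrative only; indices are in the default index base):
 * 
 * Mapping m = new Mapping(1, 2);      // word 1 in the A phrase aligned to word 2 in the B phrase
 * m.toString();                       // "1 &lt;-&gt; 2"
 * Mapping r = m.reverse();            // "2 &lt;-&gt; 1"
 * 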

* Although different * {@linkplain EncodingScheme encoding schemes} may use different {@linkplain IndexBase index bases}, all * mappings share a common one, namely {@link Alignments#DEFAULT_INDEX_BASE}. As such, no index can be * specified that is less than the {@linkplain IndexBase#nullValue null value} of that index base. Similarly, * the phrase number specified (if any) must be in the range of the * {@linkplain Alignments#DEFAULT_PHRASE_NUMBER_BASE default phrase number base}, * even though different encoding schemes may have different phrase number bases. *

* Since some {@linkplain EncodingScheme encoding schemes} do not require an * {@linkplain #getPhraseNumber() phrase number} * to be specified, the phrase number field may be null. However, none of the other fields may be * null. To specify that the mapping is to the special null word value, the * {@linkplain Alignments#DEFAULT_INDEX_BASE default index base}'s * {@linkplain IndexBase#nullValue null value} is used. * * @author Scott Martin * @see PhrasePosition * @see EncodingScheme * @see IndexBase * @see Alignments#DEFAULT_PHRASE_NUMBER_BASE * @see Alignments#DEFAULT_INDEX_BASE * @see Alignments#DEFAULT_STATUS * @see Alignments#DEFAULT_CONFIDENCE */ public class Mapping implements Comparable { final Integer phraseNumber, a, b; Status status; Double confidence; /** * Creates a mapping between the two specified indices with a null phrase number. * @see #Mapping(Integer, Integer, Integer) */ public Mapping(Integer a, Integer b) { this(null, a, b); } /** * Creates a mapping between the two specified indices with a null phrase number and the * {@linkplain Alignments#DEFAULT_STATUS default status}. * @see #Mapping(Integer, Integer, Integer, Status) */ public Mapping(Integer phraseNumber, Integer a, Integer b) { this(phraseNumber, a, b, Alignments.DEFAULT_STATUS); } /** * Creates a mapping between the two specified indices with the specified phrase number and status, with the * {@linkplain Alignments#DEFAULT_CONFIDENCE default confidence}. * @see #Mapping(Integer, Integer, Integer, Status, Double) */ public Mapping(Integer phraseNumber, Integer a, Integer b, Status status) { this(phraseNumber, a, b, status, Alignments.DEFAULT_CONFIDENCE); } /** * Creates a mapping between the two specified indices. * @param phraseNumber The phrase number of the corresponding mapping. * @param a The A index, corresponding to {@link PhrasePosition#A}. * @param b The B index, corresponding to {@link PhrasePosition#B}. * @param status The status of this mapping. * @param confidence This mapping's confidence value. * * @throws IndexOutOfBoundsException If any of phraseNumber, a, or * b is out of bounds according to the corresponding default. * @throws IllegalArgumentException If either index is null, or if status or * confidence is null. * * @see Alignments#DEFAULT_PHRASE_NUMBER_BASE * @see Alignments#DEFAULT_INDEX_BASE */ public Mapping(Integer phraseNumber, Integer a, Integer b, Status status, Double confidence) { checkPhraseNumber(phraseNumber); checkIndex(a); checkIndex(b); checkField(status, "status"); checkField(confidence, "confidence"); this.phraseNumber = phraseNumber; this.a = a; this.b = b; this.status = status; this.confidence = confidence; } /** * Gets a copy of this mapping with its {@linkplain #getPhraseNumber() phrase number} * set to the specified phrase number. This method is a convenience for the {@link MappingReader} class, * which, for certain encoding scheme like {@link MosesEncodingScheme}, may not be able to * parse the phrase number from the input string. * @param phraseNumber The phrase number the returned mapping should have. * @return This mapping if its {@linkplain #getPhraseNumber() phrase number} is null and the * specified phrase number is null, or if this mapping's phrase number is * {@linkplain Integer#equals(Object) equivalent to} the specified phrase number. Otherwise, a new * mapping is returned with all the same field values as this mapping, but with its phrase number set * to phraseNumber. 
* * @see #Mapping(Integer, Integer, Integer, Status, Double) */ public Mapping copyWithPhraseNumber(Integer phraseNumber) { if((this.phraseNumber == null && phraseNumber == null) || (this.phraseNumber != null && this.phraseNumber.equals(phraseNumber))) { return this; } return new Mapping(phraseNumber, a, b, status, confidence); } /** * Convenience method for creating mappings when the phrase position may possibly be * {@link PhrasePosition#B}. * @see #mappingByPosition(Integer, Integer, Integer, Status, Double, PhrasePosition) */ public static Mapping mappingByPosition(Integer phraseNumber, Integer a, Integer b, PhrasePosition firstPosition) { return mappingByPosition(phraseNumber, a, b, Alignments.DEFAULT_STATUS, Alignments.DEFAULT_CONFIDENCE, firstPosition); } /** * Convenience method for creating mappings in case which index should be {@linkplain #getA() A} and * which should be {@linkplain #getB() B} depends on the phrase position. * * @param phraseNumber The phrase number to use. * @param a The index to use either for the A (if the first position is * {@link PhrasePosition#A}) or B index (if the first position is {@link PhrasePosition#B}). * @param b The index to use either for the B (if the first position is * {@link PhrasePosition#A}) or A index (if the first position is {@link PhrasePosition#B}). * @param status The status to use. * @param confidence The confidence to use. * @param firstPosition Which phrase position the new mapping should reflect. If this argument is * {@link PhrasePosition#A}, the a argument will be the A index and the * b argument the B. If it is {@link PhrasePosition#B}, these are reversed. * @return A new mapping with its indices configured per the specified firstPosition. * @see Mapping#Mapping(Integer, Integer, Integer, Status, Double) */ public static Mapping mappingByPosition(Integer phraseNumber, Integer a, Integer b, Status status, Double confidence, PhrasePosition firstPosition) { return new Mapping(phraseNumber, (firstPosition == A) ? a : b, (firstPosition == B) ? a : b, status, confidence); } /** * Gets a new mapping just like this one except that the indices in the {@link PhrasePosition#A} * and {@link PhrasePosition#B} positions are swapped. The original status and confidence are * unchanged. * @return A mapping with indices reversed. * @see Mapping#mappingByPosition(Integer, Integer, Integer, Status, Double, PhrasePosition) */ public Mapping reverse() { return mappingByPosition(phraseNumber, a, b, status, confidence, B); } /** * Gets this mapping's phrase number. */ public Integer getPhraseNumber() { return phraseNumber; } /** * Gets this mapping's index in the {@linkplain PhrasePosition#A A-position}. * @return The value of {@link #get(PhrasePosition)} for {@link PhrasePosition#A}. */ public Integer getA() { return get(A); } /** * Gets this mapping's index in the {@linkplain PhrasePosition#B B-position}. * @return The value of {@link #get(PhrasePosition)} for {@link PhrasePosition#B}. */ public Integer getB() { return get(B); } /** * Gets this mapping's index at the specified phrase position. * @param pos The phrase position at which to return the corresponding index. * @return If pos is {@link PhrasePosition#A}, the A index; otherwise the B index. */ public Integer get(PhrasePosition pos) { return (pos == A) ? a : b; } /** * Gets this mapping's status. */ public Status getStatus() { return status; } /** * Sets this mapping's status to the specified value. 
*/ public void setStatus(Status status) { checkField(status, "status"); this.status = status; } /** * Gets this mapping's confidence. */ public Double getConfidence() { return confidence; } /** * Sets this mapping's confidence to the supplied value. * @param confidence May be null. No bounds checking is performed on this value even if it * is non-null. */ public void setConfidence(Double confidence) { checkField(confidence, "confidence"); this.confidence = confidence; } /** * Compares this mapping to another according to their natural ordering. The natural ordering of * mappings is that first their IDs are compared, then their A indices, then their B indices, * and finally their status and confidence value (in that order). *

* For the ID field, which may be null, the comparison is performed as follows. If both this mapping's * ID and the other's are null, they are considered equivalent. If this mapping's ID is non-null, it is * compared to the (possibly null) other mapping's ID via {@link Integer#compareTo(Integer)}. */ @Override public int compareTo(Mapping o) { int i = (phraseNumber == null && o.phraseNumber == null) ? 0 : phraseNumber.compareTo(o.phraseNumber); if(i == 0) { i = a.compareTo(o.a); } if(i == 0) { i = b.compareTo(o.b); } if(i == 0) { i = status.compareTo(o.status); } if(i == 0) { i = confidence.compareTo(o.confidence); } return i; } /** * Computes a hash code based on the ID, and A and B indices. The status and confidence fields are not * used for hash code computation because they are mutable. Because of this, two mappings may have * identical hash codes but not be equivalent according to {@link #equals(Object)}. */ @Override public int hashCode() { int h = 37 * 1 + a + b; return (phraseNumber == null) ? h : h + phraseNumber; } /** * Tests whether this mapping is equal to another. * @return true If this mapping's fields match the other according to the corresponding equals * methods. For the phrase number field, they are considered equal if both null * or if their corresponding equals method returns true, unequal otherwise. */ @Override public boolean equals(Object obj) { if(obj instanceof Mapping) { Mapping m = (Mapping)obj; return ((phraseNumber == null && m.phraseNumber == null) || phraseNumber.equals(m.phraseNumber)) && a.equals(m.a) && b.equals(m.b) && status.equals(m.status) && confidence.equals(m.confidence); } return false; } void checkPhraseNumber(Integer phraseNumber) throws IndexOutOfBoundsException { if(phraseNumber != null && !Alignments.DEFAULT_PHRASE_NUMBER_BASE.isValidIndex(phraseNumber)) { throw new IndexOutOfBoundsException("invalid phrase number: " + phraseNumber); } } void checkIndex(Integer index) throws IndexOutOfBoundsException { if(index == null) { throw new IllegalArgumentException("null index"); } if(!Alignments.DEFAULT_INDEX_BASE.isValidIndex(index)) { throw new IndexOutOfBoundsException("invalid index: " + index); } } void checkField(Object obj, String name) throws IllegalArgumentException { if(obj == null) { throw new IllegalArgumentException("null " + name); } } /** * A string representation of this mapping's indices. * @return For a mapping with a {@linkplain #getA() A} index of 3 and a * {@linkplain #getB() B} index of 6, this method gives the string * "3 <-> 6". */ @Override public String toString() { StringBuilder sb = new StringBuilder(a.toString()); sb.append(" <-> "); sb.append(b.toString()); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/alignment/MappingFormat.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import static opennlp.ccg.alignment.Alignments.DEFAULT_CONFIDENCE; import static opennlp.ccg.alignment.Alignments.DEFAULT_INDEX_BASE; import static opennlp.ccg.alignment.Alignments.DEFAULT_PHRASE_NUMBER_BASE; import static opennlp.ccg.alignment.Alignments.DEFAULT_STATUS; import static opennlp.ccg.alignment.Alignments.DEFAULT_STRICTNESS; import static opennlp.ccg.alignment.MappingFormat.Field.A_INDEX_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.B_INDEX_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.CONFIDENCE_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.PHRASE_NUMBER_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.STATUS_FIELD; import java.text.FieldPosition; import java.text.Format; import java.text.ParseException; import java.text.ParsePosition; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * A class for formatting mappings according to an {@linkplain EncodingScheme encoding scheme} and a specified * set of {@linkplain Field formatting fields}. This class * extends {@link Format} so that it fits in with the Java text parsing API. *

* Care is always taken to translate a mapping's phrase number and indices to the target * encoding scheme's {@linkplain EncodingScheme#getIndexBase() index base}. * If a mapping format is not {@linkplain #isStrict() strict}, parsing is handled robustly in that it tolerates * fields that are optional and may not always be specified. On output, a non-strict mapping format will only * include fields that either (1) do not have a {@linkplain Field#hasDefaultValue() default value}, or (2) * have a value different from {@linkplain Field#getDefaultValue() the default}. Strict mapping * formats, on the other hand, always expect and generate all and only the {@linkplain #getFields() fields * specified}. *
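 * For illustration, a typical round trip might look like the following sketch (the scheme and the
 * input line are placeholders):
 * 
 * EncodingScheme scheme = ...;                         // e.g. a Moses-style scheme
 * MappingFormat format = MappingFormat.getInstance(scheme);
 * Mapping m = format.parseMapping(line);               // line encoded per the scheme
 * String s = format.formatMapping(m);                  // back to the scheme's encoding
 * 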

* Instances of mapping formats can be obtained by calling one of the * getInstance(...) methods. * * @see #getFields() * @see #isStrict() * @see EncodingScheme * @see IndexBase * @author Scott Martin */ public class MappingFormat extends Format { private static final long serialVersionUID = 1L; final EncodingScheme encodingScheme; final Set fields; final Pattern mappingPattern, fieldPattern; final boolean strict; private static Map> formatCache; /** * Marks mapping fields like ID, first index, second index, etc. Fields can have a * {@linkplain #getDefaultValue() default value}, which may be used depending on whether this format is * {@linkplain MappingFormat#isStrict() strict}. * @author Scott Martin */ public static class Field extends java.text.Format.Field { private static final long serialVersionUID = 1L; Object defaultValue = null; /** * Field representing a mapping's ID. */ public static final Field PHRASE_NUMBER_FIELD = new Field("PHRASE_NUMBER", null); /** * Field representing a mapping's A index. */ public static final Field A_INDEX_FIELD = new Field("A_INDEX", null); /** * Field representing a mapping's B index. */ public static final Field B_INDEX_FIELD = new Field("B_INDEX", null); /** * Field representing a mapping's status field. */ public static final Field STATUS_FIELD = new Field("STATUS", DEFAULT_STATUS); /** * Field representing a mapping's confidence field. */ public static final Field CONFIDENCE_FIELD = new Field("CONFIDENCE", DEFAULT_CONFIDENCE); /** * Creates a new mapping format field. * @param name The new field's name. * @param defaultValue The default value for this field. If this field does not have a default value, * null is specified. * @throws IllegalArgumentException if name is null. */ protected Field(String name, Object defaultValue) { super(name); if(name == null) { throw new IllegalArgumentException("name is null"); } this.defaultValue = defaultValue; } /** * Tests whether this field has a default value. * @return true If the {@linkplain #getDefaultValue() default value} is non-null. */ public boolean hasDefaultValue() { return defaultValue != null; } /** * Gets the default value for this field, if any. * @see #hasDefaultValue() */ public Object getDefaultValue() { return defaultValue; } /** * Overrides the superclass method to return the value of {@link java.text.Format.Field#getName()}. */ @Override public String toString() { return getName(); } } /** * Creates a mapping format based on the given encoding scheme and fields. The boolean flag tells instances * whether or not to output values when they are the default, or expect them during parsing. * * @param scheme The encoding scheme to create a formatter/parser for. * @param fields The fields to include. * @param strict Whether or not to use/expect default values in formatting and parsing. * @throws IllegalArgumentException If scheme or fields is null, * or if the specified set of fields does not contain all of the * {@linkplain EncodingScheme#getRequired() required fields} of the specified scheme. 
* @see Field#hasDefaultValue() */ protected MappingFormat(EncodingScheme scheme, Set fields, boolean strict) { if(scheme == null) { throw new IllegalArgumentException("encoding scheme is null"); } if(fields == null) { throw new IllegalArgumentException("fields is null"); } this.encodingScheme = scheme; this.fields = fields; this.strict = strict; if(!fields.containsAll(encodingScheme.getRequired())) { throw new IllegalArgumentException("specified fields does not contain all required fields"); } if(!encodingScheme.getOrder().containsAll(fields)) { throw new IllegalArgumentException("encoding scheme does not use all the specified fields"); } fieldPattern = Pattern.compile(scheme.getFieldDelimiter().toString()); mappingPattern = Pattern.compile("([\\w\\.]+" + scheme.getFieldDelimiter() + ")+[\\w\\.]+"); } /** * Gets an instance of a mapping formatter/parser for a given encoding scheme with the scheme's * {@linkplain EncodingScheme#getDefaults() default fields} as the specified fields. * @see #getInstance(EncodingScheme, Set) */ public static MappingFormat getInstance(EncodingScheme scheme) { return getInstance(scheme, scheme.getDefaults()); } /** * Gets an instance of a mapping formatter/parser for a given encoding scheme and field set. * @see #getInstance(EncodingScheme, Set, boolean) * @see Alignments#DEFAULT_STRICTNESS */ public static MappingFormat getInstance(EncodingScheme scheme, Set fields) { return getInstance(scheme, fields, DEFAULT_STRICTNESS); } /** * Gets an instance of a mapping formatter/parser for a given encoding scheme and field set. The * returned instances are cached to avoid creating multiple copies with the same scheme, fields, and * strictness flag. Cache access is synchronized to avoid threading issues. * * @param scheme The scheme to create a formatter/parser for. * @param fields The fields to include in the parser/formatter. * @param strict Whether the returned format should be {@linkplain #isStrict() strict}. * @return A new formatter/parser for mappings that will expect mappings encoded per the specified * scheme and will format mappings to strings of that encoding scheme. */ public static MappingFormat getInstance(EncodingScheme scheme, Set fields, boolean strict) { if(scheme == null) { // test for this here so null isn't added to cache as a key throw new IllegalArgumentException("encoding scheme is null"); } synchronized(MappingFormat.class) { MappingFormat mf = null; Set fs = null; if(formatCache == null) { formatCache = new HashMap>(); } else { fs = formatCache.get(scheme); } if(fs == null) { fs = new HashSet(); formatCache.put(scheme, fs); } for(MappingFormat f : fs) { if(f.strict == strict && f.fields.equals(fields)) { mf = f; break; } } if(mf == null) { mf = new MappingFormat(scheme, fields, strict); fs.add(mf); } return mf; } } /** * Gets the encoding scheme used by this mapping formatter/parser. * @return The encoding scheme used to create this instance. * @see #getInstance(EncodingScheme) */ public EncodingScheme getEncodingScheme() { return encodingScheme; } /** * Gets the field set used by this mapping formatter/parser. * @see #getInstance(EncodingScheme) */ public Set getFields() { return fields; } /** * Tests whether this mapping format will output default values, or expect them during parsing. If * true, this format will output and parse for every specified field. Otherwise, this format * will only write/expect fields that have a default value if their value differs from the default. 
* @see Field#hasDefaultValue() */ public boolean isStrict() { return strict; } /** * Formats a mapping according to the {@linkplain #getEncodingScheme() encoding scheme in effect}. * @param mapping The mapping to format. * @return A string in the format required by this formatter/parser's {@link #getEncodingScheme()}. * @see #format(Mapping, StringBuffer, FieldPosition) */ public String formatMapping(Mapping mapping) { List order = encodingScheme.getOrder(); Field field = null; int index = -1; for(int i = 0; i < order.size(); i++) { Field f = order.get(i); if(fields.contains(f)) { field = f; index = i; break; } } return format(mapping, new StringBuffer(), new FieldPosition(field, index)).toString(); } /** * Overrides the {@link Format#format(Object, StringBuffer, FieldPosition)} method to make sure the * specified Object obj is an instance of {@link Mapping}. * @see #format(Mapping, StringBuffer, FieldPosition) */ @Override public StringBuffer format(Object obj, StringBuffer toAppendTo, FieldPosition pos) { if(!(obj instanceof Mapping)) { throw new IllegalArgumentException("not a mapping: " + obj); } if(pos.getFieldAttribute() == null) { int i = pos.getField(); pos = new FieldPosition(fieldAtIndex(i), i); } return format((Mapping)obj, toAppendTo, pos); } /** * Formats a given {@link Mapping} based on the specified field position, appending the * output to the string buffer provided. * @param mapping The mapping to format. * @param toAppendTo The string buffer to append to. * @param pos The field position to use. * @return A string buffer with the information from mapping corresponding to the * field position pos appended to the one provided. * @throws IllegalArgumentException For the following reasons: *

    * <ul>
    * <li>An attempt is made to format a mapping with a {@link IndexBase#nullValue null index}.</li>
    * <li>The value of pos.getFieldAttribute() is not an instance of {@link MappingFormat.Field}.</li>
    * <li>The encoding scheme does not contain the field specified by pos.</li>
    * <li>The specified {@linkplain #getFields() list of fields} does not contain the field specified by pos.</li>
    * <li>Either a field required by the encoding scheme or the field specified by pos has a null value.</li>
    * </ul>
*/ public StringBuffer format(Mapping mapping, StringBuffer toAppendTo, FieldPosition pos) { if(mapping.a.equals(Alignments.DEFAULT_INDEX_BASE.nullValue) || mapping.b.equals(Alignments.DEFAULT_INDEX_BASE.nullValue)) { throw new IllegalArgumentException("cannot format null mapping: " + mapping); } java.text.Format.Field f = pos.getFieldAttribute(); if(!(f instanceof MappingFormat.Field)) { int i = pos.getField(); pos = new FieldPosition(fieldAtIndex(i), i); } Field field = (MappingFormat.Field)f; if(!encodingScheme.getOrder().contains(field)) { throw new IllegalArgumentException("no such field \'" + field + "\' in scheme " + encodingScheme); } if(!fields.contains(field)) { throw new IllegalArgumentException("field \'" + field + "\' not specified for this format"); } int start = toAppendTo.length(); for(Field af : encodingScheme.getOrder()) { if(!fields.contains(af)) { continue; } Object val = null; if(af.equals(PHRASE_NUMBER_FIELD)) { val = mapping.phraseNumber; } else if(af.equals(A_INDEX_FIELD)) { val = mapping.a; } else if(af.equals(B_INDEX_FIELD)) { val = mapping.b; } else if(af.equals(STATUS_FIELD)) { val = (strict || encodingScheme.getRequired().contains(STATUS_FIELD)) ? mapping.status : (mapping.status == DEFAULT_STATUS) ? null : mapping.status; } else if(af.equals(CONFIDENCE_FIELD)) { val = (strict || encodingScheme.getRequired().contains(CONFIDENCE_FIELD)) ? mapping.confidence : (mapping.confidence.equals(DEFAULT_CONFIDENCE)) ? null : mapping.confidence; } if(val == null) { // skip nulls, but check if(encodingScheme.getRequired().contains(af)) { throw new IllegalArgumentException("required field " + af + " contains null value"); } if(field.equals(af)) { throw new IllegalArgumentException("specified field " + field + " contains null value"); } } else { if(af.equals(PHRASE_NUMBER_FIELD) || af.equals(A_INDEX_FIELD) || af.equals(B_INDEX_FIELD)) { // translate indices? boolean pn = af.equals(PHRASE_NUMBER_FIELD); IndexBase mappingBase = pn ? encodingScheme.getPhraseNumberBase() : encodingScheme.getIndexBase(), defaultBase = pn ? DEFAULT_PHRASE_NUMBER_BASE : DEFAULT_INDEX_BASE; val = defaultBase.translate((Integer)val, mappingBase); } else if(af.equals(STATUS_FIELD)) { val = ((Status)val).abbreviation; } if(start < toAppendTo.length()) { toAppendTo.append(encodingScheme.getFieldDelimiter()); } if(field.equals(af)) { pos.setBeginIndex(toAppendTo.length()); } toAppendTo.append(val); if(field.equals(af)) { pos.setEndIndex(toAppendTo.length()); } } } return toAppendTo; } Field fieldAtIndex(int i) throws IndexOutOfBoundsException { Field f = encodingScheme.getOrder().get(i); if(f == null) { throw new IndexOutOfBoundsException("no field at position " + i); } return f; } /** * Parses a {@link Mapping} from a given string, based on the {@linkplain #getEncodingScheme() encoding * scheme} in effect. * @param source The string to parse. * @return A mapping object representing the specified string. * @throws ParseException If the string is ill-formed according to this formatter/parser's * {@link #getEncodingScheme() encoding scheme}. The exception thrown will contain an * {@linkplain ParseException#getErrorOffset() error offset} reflecting the position in the string where * the parse error occurred, if possible. 
*/ public Mapping parseMapping(String source) throws ParseException { ParsePosition pos = new ParsePosition(0); Mapping m = (Mapping)parseObject(source, pos); if(pos.getErrorIndex() != -1) { throw new ParseException("problem parsing input \"" + source + "\"", pos.getErrorIndex()); } return m; } /** * Overrides the {@link Format#parseObject(String, ParsePosition)} method to return a mapping, parsing * from the specified {@link ParsePosition}. * @see #parseMapping(String) */ @Override public Object parseObject(String source, ParsePosition pos) { if(pos == null) { throw new NullPointerException("parse position is null"); } int index = pos.getIndex(); Matcher matcher = mappingPattern.matcher(source); if(!matcher.matches()) { pos.setErrorIndex(index); return null; } if(matcher.start() != index) { pos.setErrorIndex(index); return null; } String[] chunks = fieldPattern.split(source); Iterator oi = encodingScheme.getOrder().iterator(); Map values = new HashMap(encodingScheme.getOrder().size()); for(int i = 0; i < chunks.length; i++) { String c = chunks[i]; if(c.length() == 0 || !oi.hasNext()) { pos.setErrorIndex(index); return null; } Object val; MappingFormat.Field af = null; while(oi.hasNext()) { af = oi.next(); try { if(af.equals(PHRASE_NUMBER_FIELD) || af.equals(A_INDEX_FIELD) || af.equals(B_INDEX_FIELD)) { boolean pn = af.equals(PHRASE_NUMBER_FIELD); IndexBase mappingBase = pn ? encodingScheme.getPhraseNumberBase() : encodingScheme.getIndexBase(), defaultBase = pn ? DEFAULT_PHRASE_NUMBER_BASE : DEFAULT_INDEX_BASE; try { val = mappingBase.translate(Integer.valueOf(c), defaultBase); } catch(IllegalArgumentException iie) { // thrown by IndexBase.translate() pos.setErrorIndex(index); return null; } // can't have null value if(((Integer)val).equals(defaultBase.nullValue)) { pos.setErrorIndex(index); return null; } } else if(af.equals(STATUS_FIELD)) { val = Status.forAbbreviation(c); } else if(af.equals(CONFIDENCE_FIELD)) { val = Double.valueOf(c); } else { val = null; } } catch(NumberFormatException e) { pos.setErrorIndex(index); return null; } if(val == null) { if(encodingScheme.getRequired().contains(af) || (strict && fields.contains(af))) { pos.setErrorIndex(index); return null; } continue; // keep going if not required } values.put(af, val); break; } // update parse index index += c.length(); if(i < chunks.length - 1) { index++; // add one for delimiter } } pos.setIndex(matcher.end()); Set keys = values.keySet(); if(!keys.containsAll(encodingScheme.getRequired()) || (strict && !keys.containsAll(fields))) { pos.setErrorIndex(index); return null; } Integer id = values.containsKey(PHRASE_NUMBER_FIELD) ? (Integer)values.get(PHRASE_NUMBER_FIELD) : null; Integer first = values.containsKey(A_INDEX_FIELD) ? (Integer)values.get(A_INDEX_FIELD) : null, second = values.containsKey(B_INDEX_FIELD) ? 
(Integer)values.get(B_INDEX_FIELD) : null; Status status = (Status)values.get(STATUS_FIELD); if(status == null && STATUS_FIELD.hasDefaultValue()) { status = (Status)STATUS_FIELD.defaultValue; } Double confidence = (Double)values.get(CONFIDENCE_FIELD); if(confidence == null && CONFIDENCE_FIELD.hasDefaultValue()) { confidence = (Double)CONFIDENCE_FIELD.defaultValue; } return new Mapping(id, first, second, status, confidence); } } ================================================ FILE: src/opennlp/ccg/alignment/MappingGroup.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; /** * Class for representing groups of mappings during reading or writing. * Mapping groups encapsulate a phrase number (usually associated with a * phrase's line number) and a number of mappings in the group (its * {@linkplain #getLength() length}). * * @author Scott Martin */ public class MappingGroup implements Comparable { Integer phraseNumber; int length; /** * Creates a new mapping group with the given phrase number and length. * @throws IllegalArgumentException if phraseNumber is null, or if * length < 0. */ public MappingGroup(Integer phraseNumber, int length) { if(phraseNumber == null) { throw new IllegalArgumentException("phraseNumber is null"); } if(length < 0) { throw new IllegalArgumentException("length < 0: " + length); } this.phraseNumber = phraseNumber; this.length = length; } /** * Gets this group's phrase number. */ public Integer getPhraseNumber() { return phraseNumber; } /** * Gets the length of this mapping group. */ public int getLength() { return length; } /** * Tests whether this group is equal to another by comparing the two groups' * phrase numbers and lengths. */ @Override public boolean equals(Object obj) { if(obj instanceof MappingGroup) { MappingGroup ag = (MappingGroup)obj; return phraseNumber.equals(ag.phraseNumber) && length == ag.length; } return false; } /** * Compares this mapping group to another by comparing their IDs and lengths. */ public int compareTo(MappingGroup ag) { int i = phraseNumber.compareTo(ag.phraseNumber); if(i == 0) { i = (length == ag.length) ? 0 : length < ag.length ? -1 : 1; } return i; } /** * Computes a hash code for this mapping group based on its ID and length. */ @Override public int hashCode() { // brackets are to guard against 0 return 37 * (1 + phraseNumber.intValue() + length); } /** * Gets a string representation of this group. * @return For a group with phrase number 37 and length * 12, the string "Group 37 (12 mappings)". 
*/ @Override public String toString() { StringBuilder sb = new StringBuilder("Group "); sb.append(phraseNumber); sb.append(" ("); sb.append(length); sb.append(" mappings)"); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/alignment/MappingReader.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.io.FilterReader; import java.io.IOException; import java.io.Reader; import java.text.ParseException; import java.util.LinkedList; import java.util.Queue; /** * Class for reading {@link Mapping}s from an underlying reader based on a specified {@link MappingFormat}. *

* Instances of this class read mappings group by group, with {@link #nextGroup()} called after reading the * correct number of mappings for the current group, as signaled by the {@link #canRead()} method. *

* The following code fragment illustrates the usage of this class: *

 * MappingReader reader = ...;
 * 
 * MappingGroup g;
 * while((g = reader.nextGroup()) != null) {
 * 	while(reader.canRead()) {
 * 		Mapping m = reader.readMapping();
 * 		...
 * 	}
 * }
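 * 
 * Here, the reader might be created as in the following sketch (the encoding scheme and file name
 * are placeholders), after which the loop above applies unchanged:
 * 
 * EncodingScheme scheme = ...;
 * MappingFormat format = MappingFormat.getInstance(scheme);
 * MappingReader reader = new MappingReader(new FileReader("alignments.txt"), format);
 * 
 * Closing the reader with {@link #close()} checks that the last group was read completely.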
* @author Scott Martin * @see MappingFormat */ public class MappingReader extends FilterReader { final MappingFormat format; private MappingGroup currentGroup; private Queue mappingQueue; private int mappingCount = 0; private boolean skipLF = false; /** * Creates a mapping reader. * @param r The underlying reader. * @param format The mapping format to use for reading {@link Mapping}s. * @throws IllegalArgumentException if format is null. */ public MappingReader(Reader r, MappingFormat format) { super(r); if(format == null) { throw new IllegalArgumentException("format is null"); } this.format = format; mappingQueue = new LinkedList(); } /** * @return The format used to read mappings. */ public MappingFormat getFormat() { return format; } /** * Starts reading from the next mapping group. * @return The next {@link MappingGroup} found by reading from the underlying reader. * @throws IOException If a {@link ParseException} is encountered when calling * {@link MappingFormat#parseMapping(String)} based on the underlying input, or if one is thrown by the * underlying reader. An IOException is also thrown if the number of mappings in the * {@linkplain MappingGroup#getLength() current group} could not be read. */ public MappingGroup nextGroup() throws IOException { checkMappingCount(); mappingCount = 0; MappingGroup previous = (currentGroup == null) ? null : currentGroup; int newCount = mappingQueue.size(); currentGroup = (newCount == 0) ? null : new MappingGroup(mappingQueue.peek().phraseNumber, newCount); boolean eog = false; while(!eog) { StringBuilder sb = new StringBuilder(); int i; while((i = in.read()) != -1) { char c = (char)i; if(skipLF) { skipLF = false; if(c == '\n') { continue; } } if(c == '\r') { skipLF = true; } if(format.encodingScheme.isMappingDelimiter(c)) { break; } else if(format.encodingScheme.isGroupDelimiter(c)) { eog = true; break; } else { sb.append(c); } } if(sb.length() == 0) { break; // for EOF and end of group } Mapping a = null; try { a = format.parseMapping(sb.toString()); } catch(ParseException pe) { throw new IOException(((currentGroup == null) ? "" : "group " + currentGroup.phraseNumber + ": ") + "problem formatting mapping " + sb.toString() + " at offset " + pe.getErrorOffset() + ": " + pe.getMessage(), pe); } // if the format allows null IDs, use previous's running counter if(currentGroup == null) { Integer I = (a.phraseNumber == null) ? (previous == null) ? format.encodingScheme.getPhraseNumberBase().start : previous.phraseNumber + 1 : a.phraseNumber; currentGroup = new MappingGroup(I, 0); } if(a.phraseNumber == null) { // have to copy because phraseNumber is immutable (and final) a = a.copyWithPhraseNumber(currentGroup.phraseNumber); } if(!currentGroup.phraseNumber.equals(a.phraseNumber)) { eog = true; } else { newCount++; // only increment if should be read } if(!mappingQueue.offer(a)) { // save for next read throw new IOException("unable to read mapping"); } } if(currentGroup != null) { currentGroup.length = newCount; } return (currentGroup == null || currentGroup.length == 0) ? null : currentGroup; } /** * Tests whether mappings can be read from this reader without throwing an {@link IOException}. * @return true If there is a current mapping group and mappings remain to be read from it. * @see #nextGroup() * @see #readMapping() */ public boolean canRead() { return currentGroup != null && mappingCount < currentGroup.length; } /** * Overrides the superclass method to check first if any mappings are available. * @throws IOException If no mappings are available. 
* @see #canRead() */ @Override public int read() throws IOException { checkRead(); int c = super.read(); if(skipLF) { skipLF = false; if(c == '\n') { c = super.read(); } } return c; } /** * Overrides the superclass method to check first if any mappings are available. * @throws IOException If no mappings are available. * @see #canRead() */ @Override public int read(char[] cbuf, int off, int len) throws IOException { checkRead(); if(len < 1) { return 0; } if(skipLF) { int c = read(); skipLF = false; if(c == -1) { return c; } else if(c != '\n') { cbuf[off++] = (char)c; len--; } } return super.read(cbuf, off, len); } /** * Tests whether mappings can be read without blocking. * @return true If it is guaranteed that a call to {@link #readMapping()} will not block for input. */ @Override public boolean ready() throws IOException { return canRead(); } /** * Reads a mapping from the underlying reader, if one is {@linkplain #canRead() available}. * @return A mapping formatted by the {@linkplain #getFormat() format in effect}. * @throws IOException If no mappings are available in the current group. * @see #canRead() */ public Mapping readMapping() throws IOException { checkRead(); Mapping a = mappingQueue.poll(); if(a != null) { mappingCount++; } return a; } /** * Closes this reader, checking first if the correct number of mappings were read. * @throws IOException If mappings remain to be read from the current group. */ @Override public void close() throws IOException { try { checkMappingCount(); } finally { super.close(); } } void checkRead() throws IOException { if(!canRead()) { throw new IOException("no mappings available"); } } void checkMappingCount() throws IOException { if((currentGroup == null && mappingCount > 0) || (currentGroup != null && mappingCount != currentGroup.length)) { throw new IOException( currentGroup == null ? "" : "group " + currentGroup.phraseNumber + ": " + "mapping count does not match: expected " + ((currentGroup == null) ? 0 : currentGroup.length) + ", but was " + mappingCount); } } } ================================================ FILE: src/opennlp/ccg/alignment/MappingWriter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.io.FilterWriter; import java.io.IOException; import java.io.Writer; /** * A writer for outputting {@link Mapping}s to a specified underlying {@link Writer}. *

* Mapping writers write mappings by group, so that before any mappings are written, * {@link #startGroup(MappingGroup)} must always be called * to signal to the writer that a new group is starting (and what its length is). The outgoing mappings are * formatted according to a {@link MappingFormat} specified at creation. *

* After a new group is started, exactly the {@linkplain MappingGroup#getLength() number of mappings} in that * group must be written. Otherwise, an {@link IOException} is thrown. An {@link IOException} is also thrown * if an attempt is made to write a mapping whose {@linkplain Mapping#getPhraseNumber() id} is different from the * current group's {@linkplain MappingGroup#getPhraseNumber() id}, or if {@link #writeMapping(Mapping)} is called without * first calling {@link #startGroup(MappingGroup)}. *

* Example usage: *

 * MappingWriter mw = ...;
 * 
 * // while there are more groups
 * mw.startGroup(new MappingGroup(...));
 * while(mw.canWrite()) {
 * 	mw.writeMapping(...);
 * }
 * 
 * mw.close();
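 * 
 * Concretely (an illustrative sketch; the encoding scheme, file name, and index values are placeholders):
 * 
 * EncodingScheme scheme = ...;
 * MappingWriter mw = new MappingWriter(new FileWriter("alignments.out"), MappingFormat.getInstance(scheme));
 * mw.startGroup(new MappingGroup(1, 2));   // phrase 1, two mappings to follow
 * mw.writeMapping(new Mapping(1, 1, 2));
 * mw.writeMapping(new Mapping(1, 2, 3));
 * mw.endGroup();
 * mw.close();
 * 
 * Note that {@link #startGroup(MappingGroup)} calls {@link #endGroup()} automatically if the previous
 * group was not ended explicitly.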
* @author Scott Martin */ public class MappingWriter extends FilterWriter { final MappingFormat format; private MappingGroup currentGroup; private int mappingCount = 0; private String mappingDelimiter, groupDelimiter; /** * Creates a new mapping writer. * @param out The underlying writer. * @param format The mapping format to use. * @throws IllegalArgumentException if format is null. */ public MappingWriter(Writer out, MappingFormat format) { super(out); if(format == null) { throw new IllegalArgumentException("format is null"); } this.format = format; } /** * Gets the mapping format used by this writer. */ public MappingFormat getFormat() { return format; } /** * Gets the current mapping group being written. */ public MappingGroup getCurrentGroup() { return currentGroup; } void checkWrite() throws IOException { if(!canWrite()) { throw new IOException("unable to write"); } } void checkMappingCount() throws IOException { if(currentGroup != null && mappingCount != currentGroup.length) { throw new IOException("incorrect mapping count for group " + + currentGroup.phraseNumber + "; expected " + currentGroup.length + ", but was " + mappingCount); } } /** * Starts a new mapping group for writing mappings. If {@link #endGroup()} was not called explicitly, * it is first called to end the current group. * @param mappingGroup The group to start. * @throws IOException If the number of mappings written since the last call to * {@link #startGroup(MappingGroup)}does not exactly equal the length of the * {@linkplain #getCurrentGroup() current group}. * @see #writeMapping(Mapping) * @see #endGroup() */ public void startGroup(MappingGroup mappingGroup) throws IOException { if(currentGroup != null) { endGroup(); } currentGroup = mappingGroup; } /** * Ends the current group, writing the {@linkplain EncodingScheme#getGroupDelimiter() proper group * delimiter} for the {@linkplain #getFormat() mapping format in effect}. * @throws IOException If {@link #startGroup(MappingGroup)} was not first called, or if the correct number * of mappings for the current group was not written. * @see #startGroup(MappingGroup) */ public void endGroup() throws IOException { if(currentGroup == null) { throw new IOException("no current group"); } checkMappingCount(); if(currentGroup.length > 0) { if(groupDelimiter == null) { Character gd = format.encodingScheme.getGroupDelimiter(); groupDelimiter = AbstractEncodingScheme.isLineSeparator(gd) ? System.getProperty("line.separator") : String.valueOf(gd); } out.write(groupDelimiter); // no empty lines } currentGroup = null; mappingCount = 0; } /** * Writes to the underlying writer, first checking if mappings can be written. * @see #canWrite() */ @Override public void write(char[] cbuf, int off, int len) throws IOException { checkWrite(); super.write(cbuf, off, len); } /** * Writes to the underlying writer, first checking if mappings can be written. * @see #canWrite() */ @Override public void write(int c) throws IOException { checkWrite(); super.write(c); } /** * Writes to the underlying writer, first checking if mappings can be written. * @see #canWrite() */ @Override public void write(String str, int off, int len) throws IOException { checkWrite(); super.write(str, off, len); } /** * Writes to the underlying writer, first checking if mappings can be written. * @see #canWrite() */ @Override public void write(char[] cbuf) throws IOException { checkWrite(); super.write(cbuf); } /** * Writes to the underlying writer, first checking if mappings can be written. 
* @see #canWrite() */ @Override public void write(String str) throws IOException { checkWrite(); super.write(str); } /** * Tests whether the mapping writer is currently in a state in which mappings can be written without * throwing an {@link IOException}. Mappings * can only be written when a current group has been {@linkplain #startGroup(MappingGroup) started} and * the number of mappings written since the last group started is less than the total * {@linkplain MappingGroup#getLength() length} of the current group. * @return true If the {@linkplain #getCurrentGroup() current group} is non-null and the number of mappings * written to the current group is less than its length. */ public boolean canWrite() { return currentGroup != null && mappingCount < currentGroup.length; } /** * Writes a mapping to the underlying writer, formatted by {@linkplain #getFormat() the mapping format}. If * other mappings have been written since the last call to {@link #startGroup(MappingGroup)}, the * {@linkplain EncodingScheme#getMappingDelimiter() mapping delimiter} used by the current format is first * written. * @param mapping The mapping to write. * @throws IOException If {@link #canWrite()} returns false, if * {@link #startGroup(MappingGroup)} was not first called, or if an attempt is made * to write a mapping with an {@linkplain Mapping#getPhraseNumber() id} that does not equal the current group's * {@linkplain MappingGroup#getPhraseNumber() id}. */ public void writeMapping(Mapping mapping) throws IOException { checkWrite(); if(mapping == null) { throw new NullPointerException("null mapping"); } if(mapping.phraseNumber != null && !mapping.phraseNumber.equals(currentGroup.phraseNumber)) { throw new IOException("mapping from group " + mapping.phraseNumber + ", but current group is " + currentGroup.phraseNumber); } if(mappingCount > 0) { if(mappingDelimiter == null) { Character md = format.encodingScheme.getMappingDelimiter(); mappingDelimiter = AbstractEncodingScheme.isLineSeparator(md) ? System.getProperty("line.separator") : String.valueOf(md); } out.write(mappingDelimiter); } out.write(format.formatMapping(mapping)); mappingCount++; } /** * Overrides the superclass method to first check that the correct number of mappings were written. * @throws IOException If a number of mappings have been written that does not exactly equal the length * of the {@linkplain #getCurrentGroup() current group}. */ @Override public void close() throws IOException { try { checkMappingCount(); } finally { super.close(); } } } ================================================ FILE: src/opennlp/ccg/alignment/MosesEncodingScheme.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import static opennlp.ccg.alignment.MappingFormat.Field.A_INDEX_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.B_INDEX_FIELD; import static opennlp.ccg.alignment.MappingFormat.Field.STATUS_FIELD; import java.util.Arrays; import java.util.HashSet; /** * Represents the Moses encoding. * * * * * * * *
 * Field separator:    -
 * Mapping separator:  (space)
 * Group separator:    (newline)
 * ID base:            {@link IndexBase#ZERO}
 * Index base:         {@link IndexBase#ZERO}
 * Example group:      0-1 2-3 2-4-P 3-0
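 * <p>
 * A minimal usage sketch (illustrative only; the example group above is repeated in the comments,
 * and only the public constructor shown below is assumed):
 * <pre>
 * // "0-1 2-3 2-4-P 3-0" encodes four mappings with zero-based word indices;
 * // "2-4-P" carries an explicit POSSIBLE status abbreviation in its third field.
 * EncodingScheme moses = new MosesEncodingScheme();
 * </pre>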
* @see Moses Word Alignment Tutorial * * @author Scott Martin */ public class MosesEncodingScheme extends AbstractEncodingScheme { /** * Creates a new instance of the Moses encoding scheme. * @see Alignments#MOSES_ENCODING_SCHEME */ public MosesEncodingScheme() { super('-', ' ', '\n', Alignments.DEFAULT_INDEX_BASE, IndexBase.ZERO, new HashSet(Arrays.asList(A_INDEX_FIELD, B_INDEX_FIELD, STATUS_FIELD)), new HashSet(Arrays.asList(A_INDEX_FIELD, B_INDEX_FIELD)), A_INDEX_FIELD, B_INDEX_FIELD, STATUS_FIELD); } } ================================================ FILE: src/opennlp/ccg/alignment/NAACLEncodingScheme.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import static opennlp.ccg.alignment.MappingFormat.Field.*; import java.util.Arrays; import java.util.HashSet; /** * Represents the NAACL shared task encoding. * * * * * * * *
 * Field separator:    (space)
 * Mapping separator:  (newline)
 * Group separator:    (newline)
 * ID base:            {@link IndexBase#ZERO}
 * Index base:         {@link IndexBase#ONE}
 * Example group:
 * 37 1 2 S
 * 37 3 4 S
 * 37 3 5 P
 * 37 4 1 S
 * 
*
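 * <p>
 * A minimal usage sketch (illustrative only), reading the example group above: word indices are
 * one-based while phrase IDs are zero-based, so "37 1 2 S" surely aligns the first word of
 * phrase 37's "A" side with the second word of its "B" side.
 * <pre>
 * EncodingScheme naacl = new NAACLEncodingScheme();
 * </pre>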
* @see NAACL shared task word alignment guidelines * @author Scott Martin */ public class NAACLEncodingScheme extends AbstractEncodingScheme { /** * Creates a new instance of the NAACL encoding scheme. * @see Alignments#NAACL_ENCODING_SCHEME */ public NAACLEncodingScheme() { super(' ', '\n', '\n', IndexBase.ZERO, IndexBase.ONE, Alignments.NAACL_DEFAULT_FIELDS, new HashSet(Arrays.asList(PHRASE_NUMBER_FIELD, A_INDEX_FIELD, B_INDEX_FIELD)), PHRASE_NUMBER_FIELD, A_INDEX_FIELD, B_INDEX_FIELD, STATUS_FIELD, CONFIDENCE_FIELD); } } ================================================ FILE: src/opennlp/ccg/alignment/Phrase.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.util.AbstractList; import java.util.List; /** * A phrase, i.e., a sequence of words with an associated {@linkplain #getNumber() phrase number}. * Phrases may additionally have a {@linkplain #getId() string ID}. * To save access time and space, this class is backed by an array of * {@link String}s, but extends {@link AbstractList} so that phrases can be iterated over and have * all of the usual convenience methods. *

* Phrases are immutable once created, so calling any of the {@link List#add(Object)}, * {@link List#remove(int)}, or {@link List#set(int, Object)} * methods will throw an {@link UnsupportedOperationException}. Similarly, the * {@linkplain #iterator() iterator's} remove() method also throws an * {@link UnsupportedOperationException}. *

* For convenience, this class implements the {@link Comparable} interface, comparing * phrases by their {@linkplain #getNumber() numbers}. * * @author Scott Martin */ public class Phrase extends AbstractList implements Comparable { final Integer number; final String id; final String[] words; /** * Creates a new phrase with the given number and list of words. * @see Phrase#Phrase(String, Integer, String...) */ public Phrase(Integer number, List words) { this(number, words.toArray(new String[words.size()])); } /** * Creates a new phrase with the specified number, made up of the given words. * @see Phrase#Phrase(String, Integer, String...) */ public Phrase(Integer number, String... words) { this(null, number, words); } /** * Creates a new phrase with the given number, ID, and list of words. * @see Phrase#Phrase(String, Integer, String...) */ public Phrase(String id, Integer number, List words) { this(id, number, words.toArray(new String[words.size()])); } /** * Creates a new phrase with the specified number and id, made up of the given words. * The ID may be null, but the cannot be null. The list of words cannot be * null or contain null members, although it can be empty. * @throws IllegalArgumentException If number, words is null, * or one of the strings in words is null. */ public Phrase(String id, Integer number, String... words) { checkObject(number, "number"); checkObject(words, "words"); for(int i = 0; i < words.length; i++) { checkObject(words[i], "word " + i); } this.number = number; this.id = id; this.words = words; } void checkObject(Object obj, String name) { if(obj == null) { throw new IllegalArgumentException(name + " is null"); } } /** * Gets this phrase's ID, if any was specified. * @return The ID of this phrase, possibly null. */ public String getId() { return id; } /** * @return This phrase's number. */ public Integer getNumber() { return number; } /** * Returns the word in this phrase at the supplied index. */ @Override public String get(int index) { return words[index]; } /** * @return The number of words in this phrase. */ @Override public int size() { return words.length; } /** * Compares this phrase to another by comparing their {@linkplain #getNumber() numbers}. * @return The value of getNumber().compareTo(o.getNumber()). * @param o The phrase to compare to. * @see Integer#compareTo(Integer) */ @Override public int compareTo(Phrase o) { return getNumber().compareTo(o.getNumber()); } /** * Tests whether this phrase is equal to another by first calling the superclass method * {@link AbstractList#equals(Object)}, then comparing this phrase's number and id to the other. */ @Override public boolean equals(Object o) { if(o instanceof Phrase && super.equals(o)) { Phrase p = (Phrase)o; return number.equals(p.number) && ((id == null && p.id == null) || id.equals(p.id)); } return false; } /** * Generates a hash code for this phrase based on the superclass hash code, its number, and its ID * (if any). */ @Override public int hashCode() { int h = 31 * super.hashCode() + number.hashCode(); return (id == null) ? h : h + id.hashCode(); } /** * Gets a string representation of this phrase. 
* @return For a phrase with number 37, ID phrase 3, and words * "Test phrase", prepends 37 (phrase 3): to the result of calling the * superclass method {@link AbstractList#toString()}; */ @Override public String toString() { StringBuilder sb = new StringBuilder(number.toString()); if(id != null) { sb.append(" ("); sb.append(id); sb.append(')'); } sb.append(": "); sb.append(super.toString()); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/alignment/PhrasePosition.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; /** * In an alignment, a phrase position indicates which of the two aligned phrases * is first (the "A" phrase) and which is second (the "B" phrase) * in terms of the alignment indices. * * @author Scott Martin */ public enum PhrasePosition { /** * The "A" position. */ A, /** * The "B" position. */ B; /** * Gives the opposite of this phrase position. * @return {@link #B} if this position is {@link #A}, otherwise * {@link #A}. */ public PhrasePosition opposite() { return (this == A) ? B : A; } } ================================================ FILE: src/opennlp/ccg/alignment/PhraseReader.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.io.IOException; import java.io.LineNumberReader; import java.io.Reader; /** * A phrase reader just reads line numbers, but does not search for phrase IDs. The line number of each phrase * read is {@linkplain IndexBase#translate(Integer, IndexBase) translated} into the {@link IndexBase} provided * at creation. 
* @author Scott Martin */ public class PhraseReader extends LineNumberReader { final IndexBase numberBase; /** * The index base that starts numbering lines at 1. * @see IndexBase#ONE */ public static final IndexBase LINE_NUMBER_BASE = IndexBase.ONE; /** * Creates a phrase reader from the specified underlying reader and word index base for phrase numbers. * @see #PhraseReader(Reader, IndexBase) * @see Alignments#DEFAULT_PHRASE_NUMBER_BASE */ public PhraseReader(Reader in) { this(in, Alignments.DEFAULT_PHRASE_NUMBER_BASE); } /** * Creates a phrase reader. * @param numberBase The target phrase numbering base. Phrases read from the underlying reader will have * their numbers translated from {@link #LINE_NUMBER_BASE the default} to numberBase. * @throws IllegalArgumentException if numberBase is null. * @see IndexBase#translate(Integer, IndexBase) * @see #readPhrase() */ public PhraseReader(Reader in, IndexBase numberBase) { super(in); if(numberBase == null) { throw new IllegalArgumentException("numberBase is null"); } this.numberBase = numberBase; } /** * The target number base that new phrase IDs will have their line numbers translated into. * @see #readPhrase() */ public IndexBase getNumberBase() { return numberBase; } /** * Gets the number last assigned to a phrase, translated into the specified * {@linkplain #getNumberBase() number base}. Note that this method may return a different result than * {@link #getLineNumber()} due to the base translation. * @see IndexBase#translate(Integer, IndexBase) */ public Integer getPhraseNumber() { return LINE_NUMBER_BASE.translate(getLineNumber(), numberBase); } /** * Reads the next phrase from the underlying reader. The number is determined by translating the line number * of the phrase into the {@linkplain #getNumberBase() target number base}. The words in the phrase are * tokenized by the {@link Alignments#tokenize(String)} method. * @return null if no phrases can be read from the underlying reader. */ public Phrase readPhrase() throws IOException { String ln = readLine(); return (ln == null) ? null : new Phrase(getPhraseNumber(), Alignments.tokenize(ln)); } } ================================================ FILE: src/opennlp/ccg/alignment/PhraseWriter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; import java.io.FilterWriter; import java.io.IOException; import java.io.PrintWriter; import java.io.Writer; /** * A writer for phrases. This writer writes {@linkplain Phrase phrase} instances to the underlying writer with * the specified string as a word separator. 
No markup is placed around phrases that are written; only a line * separator is written after the phrase. * * @author Scott Martin * @see Alignments#untokenize(String[], String) */ public class PhraseWriter extends FilterWriter { final String wordSeparator; protected PrintWriter printWriter; /** * Creates a new phrase writer with the default word separator. * @see #PhraseWriter(Writer, String) * @see Alignments#DEFAULT_WORD_SEPARATOR */ public PhraseWriter(Writer out) { this(out, Alignments.DEFAULT_WORD_SEPARATOR); } /** * Creates a new phrase writer for the underlying input stream that will use * the specified word separator when untokenizing phrases. * * @param wordSeparator The word separator to use when translating phrases into strings. * @throws IllegalArgumentException if wordSeparator is null. * @see Alignments#untokenize(String[], String) */ public PhraseWriter(Writer out, String wordSeparator) { super(new PrintWriter(out)); if(wordSeparator == null) { throw new IllegalArgumentException("wordSeparator is null"); } this.wordSeparator = wordSeparator; printWriter = (PrintWriter)this.out; } /** * Gets the word separator that this phrase writer uses when writing phrases. */ public String getWordSeparator() { return wordSeparator; } /** * Writes a phrase by {@linkplain Alignments#untokenize(List, String) untokenizing} its words * according to the {@linkplain #getWordSeparator() word separator being used}. Before writing the * untokenized phrase, {@link #preWritePhrase(Phrase)} is called exactly once. After writing the phrase, * {@link #postWritePhrase(Phrase)} is called exactly once. * * @param phrase The phrase to write. * @throws IOException If the underlying writer throws an {@link IOException}, or if one is thrown by * either {@link #preWritePhrase(Phrase)} or {@link #postWritePhrase(Phrase)}. */ public void writePhrase(Phrase phrase) throws IOException { preWritePhrase(phrase); printWriter.print(Alignments.untokenize(phrase, wordSeparator)); postWritePhrase(phrase); } /** * Called before {@link #writePhrase(Phrase)} (to be overridden by implementing classes). * @param phrase The phrase about to be written. */ protected void preWritePhrase(Phrase phrase) throws IOException { // default is to do nothing } /** * Called after {@link #writePhrase(Phrase)} (to be overridden by implementing classes). This implementation * just writes a line separator. * @param phrase The phrase that was just written. */ protected void postWritePhrase(Phrase phrase) throws IOException { printWriter.println(); } } ================================================ FILE: src/opennlp/ccg/alignment/Status.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.alignment; /** * A set of {@link Enum} constants for describing the status of a {@linkplain Mapping mapping}, either as * {@link #POSSIBLE} or {@link #SURE}. *

* Statuses have a corresponding {@linkplain #getAbbreviation() abbreviated form} for use when mappings are * formatted and parsed as strings. The enum constants are arranged in order of strength of surety, so that * {@link Enum#ordinal()} returns numbers in order of increasing surety. * * @author Scott Martin * @see MappingFormat */ public enum Status { /** * The status of a mapping that is only possible, not sure. */ POSSIBLE, /** * The status of a sure mapping (not just possible). */ SURE; /** * An abbreviated form for this status, for use in parsing and formatting. * @see MappingFormat */ final String abbreviation; private Status() { this.abbreviation = name().substring(0, 1); } /** * Gets the abbreviated form of this status, "S" for {@link #SURE} * and "P" for {@link #POSSIBLE}. */ public String getAbbreviation() { return abbreviation; } /** * Gives the status constant corresponding to the given abbreviation. * @param abbreviation The abbreviated form to find a status constant for. * @return A status constant if one is found whose {@link #getAbbreviation()} is equal to the specified * abbreviation, otherwise null. */ public static Status forAbbreviation(String abbreviation) { for(Status s : values()) { if(s.abbreviation.equals(abbreviation)) { return s; } } return null; } } ================================================ FILE: src/opennlp/ccg/alignment/package.html ================================================

This package provides classes for modeling alignments between two phrases.

The {@link opennlp.ccg.alignment.Phrase} class models a phrase as simply a list of words. A single alignment between an index in one phrase and an index in another is captured by the {@link opennlp.ccg.alignment.Mapping} class. The {@link opennlp.ccg.alignment.Alignment} class is built on top of Phrase and Mapping, encapsulating an "A" phrase and a "B" phrase along with a set of mappings between them (see {@link opennlp.ccg.alignment.PhrasePosition}).

Since there are multiple different encoding schemes for representing alignments, a goal of this package is to make the conceptual representation of aligned phrases as independent of encoding as possible. The interface {@link opennlp.ccg.alignment.EncodingScheme} gives a way to represent encoding schemes, and two popular encoding schemes (Moses and NAACL) are provided by this package. Different encoding schemes use different numbering bases: some start numbering indices at 0 and some at 1. To reconcile this difference, the class {@link opennlp.ccg.alignment.IndexBase} provides a way to translate different numbering bases into a common index base. The {@link opennlp.ccg.alignment.MappingFormat} class extends {@link java.text.Format} to give encoding-independent formatting and parsing for mappings.

The classes {@link opennlp.ccg.alignment.PhraseReader} and {@link opennlp.ccg.alignment.PhraseWriter} (and their descendants), {@link opennlp.ccg.alignment.MappingReader} and {@link opennlp.ccg.alignment.MappingWriter} provide convenient ways to read and write phrases and mappings. Lastly, the non-instantiable class {@link opennlp.ccg.alignment.Alignments} provides some convenience methods for reading phrases, mappings and alignments.

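A minimal usage sketch (the file name is hypothetical and exception handling is omitted; only constructors and methods documented in this package are used), reading whitespace-tokenized phrases one per line:

<pre>
// hedged sketch: each line read becomes a Phrase whose number is its translated line number
java.io.Reader in = new java.io.FileReader("phrases.txt"); // hypothetical input file
PhraseReader reader = new PhraseReader(in);                // default phrase number base
Phrase phrase;
while ((phrase = reader.readPhrase()) != null) {
    System.out.println(phrase.getNumber() + ": " + phrase.size() + " words");
}
reader.close();
</pre>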
================================================ FILE: src/opennlp/ccg/disjunctivizer/AlignedEdgeFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_ALIGNED; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_UNALIGNED; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_ALIGNED; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_UNALIGNED; import java.util.Collection; import java.util.Set; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFVertex; /** * A filter for edges that tests whether they are aligned based on a specified set of * {@linkplain #getAlignmentIndices() alignment indices}. Whether the source or target vertices * (or both) is considered depends on the match type criteria in effect. For example, if the match * type criteria contains {@link MatchType#SOURCE_ALIGNED}, this filter's {@link #allows(LFEdge)} method * will check whether argument edges have a {@linkplain LFVertex#getIndex() source index} that is * contained in the set of alignment indices. *

* Instances of this class use the following match type criteria: {@link MatchType#SOURCE_ALIGNED}, * {@link MatchType#SOURCE_UNALIGNED}, {@link MatchType#TARGET_ALIGNED}, * and {@link MatchType#TARGET_UNALIGNED}. If the set of alignment indices is modified after an instance * of this class is created, the filter will reflect the changes because the set is not copied at * creation. * * @author Scott Martin */ public class AlignedEdgeFilter extends MatchTypeFilter { Set alignmentIndices; /** * Creates a new aligned edge filter based on the specified alignment indices for the specified * match type criteria. * @param alignmentIndices The set of indices to check for alignment. * @param matchTypes The match type criteria to use. * @throws IllegalArgumentException If alignmentIndices is null. */ public AlignedEdgeFilter(Set alignmentIndices, MatchType... matchTypes) { super(matchTypes); checkAlignmentIndices(alignmentIndices); this.alignmentIndices = alignmentIndices; } /** * Creates a new aligned edge filter based on the specified alignment indices for the specified * match type criteria. * @param alignmentIndices The set of indices to check for alignment. * @param matchTypes The match type criteria to use. * @throws IllegalArgumentException If alignmentIndices is null. */ public AlignedEdgeFilter(Set alignmentIndices, Collection matchTypes) { super(matchTypes); checkAlignmentIndices(alignmentIndices); this.alignmentIndices = alignmentIndices; } private void checkAlignmentIndices(Set alignmentIndices) { if(alignmentIndices == null) { throw new IllegalArgumentException("alignmentIndices is null"); } } /** * Gets the alignment indices used by this filter. */ public Set getAlignmentIndices() { return alignmentIndices; } /** * Sets the alignment indices used by this filter. * @throws IllegalArgumentException If alignmentIndices is null. */ public void setAlignmentIndices(Set alignmentIndices) { checkAlignmentIndices(alignmentIndices); this.alignmentIndices = alignmentIndices; } /** * Tests whether this filter allows the specified LF edge. * @return false if {@link #getMatchTypes()} contains *

     * <ul>
     *   <li>{@link MatchType#SOURCE_ALIGNED}, but the alignment indices do not contain the edge's source vertex's index,</li>
     *   <li>{@link MatchType#SOURCE_UNALIGNED}, but the alignment indices contain the edge's source vertex's index,</li>
     *   <li>{@link MatchType#TARGET_ALIGNED}, but the alignment indices do not contain the edge's target vertex's index,</li>
     *   <li>{@link MatchType#TARGET_UNALIGNED}, but the alignment indices contain the edge's target vertex's index,</li>
     * </ul>
* and true otherwise. * * @see #getAlignmentIndices() * @see LFEdge#getSource() * @see LFEdge#getTarget() * @see LFVertex#getIndex() */ @Override public boolean allows(LFEdge edge) { for(MatchType t : matchTypes) { if(t == SOURCE_ALIGNED && !alignmentIndices.contains(edge.getSource().getIndex())) { return false; } else if(t == SOURCE_UNALIGNED && alignmentIndices.contains(edge.getSource().getIndex())) { return false; } else if(t == TARGET_ALIGNED && !alignmentIndices.contains(edge.getTarget().getIndex())) { return false; } else if(t == TARGET_UNALIGNED && alignmentIndices.contains(edge.getTarget().getIndex())) { return false; } } return true; } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/Disjunctivizer.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import static opennlp.ccg.alignment.PhrasePosition.A; import static opennlp.ccg.alignment.PhrasePosition.B; import static opennlp.ccg.disjunctivizer.Disjunctivizer.VertexType.LOCAL_ANCESTOR; import static opennlp.ccg.disjunctivizer.Disjunctivizer.VertexType.OPTIONAL; import static opennlp.ccg.disjunctivizer.Disjunctivizer.VertexType.PREDICATES; import static opennlp.ccg.disjunctivizer.Disjunctivizer.VertexType.SHARED; import static opennlp.ccg.disjunctivizer.Disjunctivizer.VertexType.VISITED; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_PREDICATE_MISMATCH; import java.util.Arrays; import java.util.Collections; import java.util.EnumMap; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import opennlp.ccg.alignment.PhrasePosition; import opennlp.ccg.hylo.Mode; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFGraph; import opennlp.ccg.hylo.graph.LFVertex; import opennlp.ccg.util.DelegatedFilter; import opennlp.ccg.util.Filter; import opennlp.ccg.util.FilteredSet; import opennlp.ccg.util.MembershipFilter; import opennlp.ccg.util.VisitedFilter; import org.jgrapht.traverse.DepthFirstIterator; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * Creates a disjunctive logical form from a difference between two graphs. * Instances can be configured to switch handling of {@link LFGraphDifference#inserts()}, * {@link LFGraphDifference#deletes()} and {@link LFGraphDifference#substitutions()} * on or off. 
By default, all three are performed. *

* Disjunctivizers can be re-used, so that all of the boolean parameters * {@link #setProcessingDeletes(boolean)}, {@link #setProcessingInserts(boolean)}, * and {@link #setProcessingSubstitutions(boolean)} can be modified in between calls to * {@link #buildDisjunctiveLFFor(LFGraphDifference)}. * When these parameters are changed, the disjunctive LF built will change as well. * The {@link Document} used to create disjunctive LF elements (and the elements within them) * can be configured as well, either at creation or via {@link #setDocument(Document)}. * * @author Scott Martin */ public class Disjunctivizer { /** * Attribute set tag name: atts. */ public static final String ATTS_TAG = "atts"; /** * Choice disjunction tag name: one-of. */ public static final String CHOICE_TAG = "one-of"; /** * Disjunctive LF tag name: dlf. */ public static final String DLF_TAG = "dlf"; /** * Node tag name: node. */ public static final String NODE_TAG = "node"; /** * Optional disjunction tag name: opt. */ public static final String OPTIONAL_TAG = "opt"; /** * Relation tag name: rel. */ public static final String RELATION_TAG = "rel"; /** * ID attribute name: id. */ public static final String ID_ATTR = "id"; /** * ID reference attribute name: idref. */ public static final String IDREF_ATTR = "idref"; /** * Name attribute name: name. */ public static final String NAME_ATTR = "name"; /** * Predicate attribute name: pred. */ public static final String PRED_ATTR = "pred"; /** * Attribute name for node sharedness: shared. */ public static final String SHARED_ATTR = "shared"; /** * The suffix appended to foreign nodes: f. */ public static final String FOREIGN_SUFFIX = "f"; Document document; boolean processingInserts, processingDeletes, processingSubstitutions; private Element disjunctiveLF; private LFGraphDifference graphDifference; private Set importedVertices = null; private Map vertexAliases = null; private Map foreignAlignedSubgraphRoots = null; /** * Creates a new disjunctivizer using a new document. * @see #Disjunctivizer(Document) */ public Disjunctivizer() throws ParserConfigurationException { this(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); } /** * Creates a new disjunctivizer that will use the specified document * to create the elements in its generated disjunctive LFs. * @see #Disjunctivizer(Document, boolean, boolean, boolean) */ public Disjunctivizer(Document document) { this(document, true, true, true); } /** * Creates a new disjunctivizer that will use the specified document * to create the elements in its generated disjunctive LFs, with the specified parameters dictating * whether to process inserts, deletes, and substitutions. * * @param document The document to use for creating elements, attributes, nodes, etc. * @param processingInserts Whether to process {@link LFGraphDifference#inserts()}. * @param processingDeletes Whether to process {@link LFGraphDifference#deletes()}. * @param processingSubstitutions Whether to process {@link LFGraphDifference#substitutions()}. */ public Disjunctivizer(Document document, boolean processingInserts, boolean processingDeletes, boolean processingSubstitutions) { if(document == null) { throw new IllegalArgumentException("document is null"); } this.document = document; this.processingInserts = processingInserts; this.processingDeletes = processingDeletes; this.processingSubstitutions = processingSubstitutions; } /** * Gets the document used to create elements, nodes, attributes, etc. 
* @see #Disjunctivizer(Document, boolean, boolean, boolean) */ public Document getDocument() { return document; } /** * Sets the document used to create elements. * @param document The document that will be used while building disjunctive LF elements. */ public void setDocument(Document document) { this.document = document; } /** * Returns whether this disjunctivizer processes {@link LFGraphDifference#inserts()}. */ public boolean isProcessingInserts() { return processingInserts; } /** * Sets whether this disjunctivizer processes {@link LFGraphDifference#inserts()}. */ public void setProcessingInserts(boolean processingInserts) { if(this.processingInserts != processingInserts) { this.processingInserts = processingInserts; resetDisjunctiveLF(); } } /** * Returns whether this disjunctivizer processes {@link LFGraphDifference#deletes()}. */ public boolean isProcessingDeletes() { return processingDeletes; } /** * Sets whether this disjunctivizer processes {@link LFGraphDifference#deletes()}. */ public void setProcessingDeletes(boolean processingDeletes) { if(this.processingDeletes != processingDeletes) { this.processingDeletes = processingDeletes; resetDisjunctiveLF(); } } /** * Returns whether this disjunctivizer processes {@link LFGraphDifference#substitutions()}. */ public boolean isProcessingSubstitutions() { return processingSubstitutions; } /** * Sets whether this disjunctivizer processes {@link LFGraphDifference#substitutions()}. */ public void setProcessingSubstitutions(boolean processingSubstitutions) { if(this.processingSubstitutions != processingSubstitutions) { this.processingSubstitutions = processingSubstitutions; resetDisjunctiveLF(); } } private void resetDisjunctiveLF() { this.disjunctiveLF = null; } /** * Builds a disjunctive LF based on the specified graph difference. * The shape of the returned element will change depending on whether inserts, deletes, or substitutions * are being processed. * @param graphDifference The graph difference to use for building the disjunctive LF. * @return A recursively build disjunctive LF based on this disjunctivizer's graph difference. * @throws IllegalArgumentException If graphDifference is null. */ public Element buildDisjunctiveLFFor(LFGraphDifference graphDifference) { if(graphDifference == null) { throw new IllegalArgumentException("graph difference is null"); } if(disjunctiveLF == null || !this.graphDifference.equals(graphDifference)) { this.graphDifference = graphDifference; // reset in case this has been previously called if(foreignAlignedSubgraphRoots != null) { foreignAlignedSubgraphRoots = null; } if(importedVertices == null) { importedVertices = new HashSet(); } else { importedVertices.clear(); } if(vertexAliases == null) { vertexAliases = new HashMap(); } else { vertexAliases.clear(); } disjunctiveLF = document.createElement(DLF_TAG); for(LFVertex p : graphDifference.a.highestLFAncestors()) { disjunctiveLF.appendChild(createDisjunctiveElement(new DLFContext(A, p, disjunctiveLF))); } } return disjunctiveLF; } private void findForeignAlignedSubgraphRoots() { foreignAlignedSubgraphRoots = new HashMap(); LFGraphIterator rootIterator = new LFGraphIterator(graphDifference.b); Map> mappings = graphDifference.alignment.asMap(B); while(rootIterator.hasNext()) { LFVertex vertex = rootIterator.next(); if(!foreignAlignedSubgraphRoots.containsKey(vertex)) { // already encountered? if(mappings.containsKey(vertex.getIndex())) { // aligned? 
LFGraphIterator subgraphIterator = new LFGraphIterator(graphDifference.b, vertex); while(subgraphIterator.hasNext()) { // map whole subgraph to the aligned vertex foreignAlignedSubgraphRoots.put(subgraphIterator.next(), vertex); } } } } } private Element createDisjunctiveElement(DLFContext context) { LFVertex alias = vertexAliases.get(context.vertex); String vertexName = nameFor(context.vertex); boolean imported = importedVertices.contains(context.vertex); Element newNode = document.createElement(NODE_TAG); context.parent.appendChild(newNode); DLFContext localContext = context.copy(); localContext.parent = newNode; Set locals = localContext.getVertices(LOCAL_ANCESTOR), visited = localContext.getVertices(VISITED); if(locals.contains(localContext.vertex) || (alias != null && visited.contains(localContext.vertex))) { LFVertex v = (alias == null) ? localContext.vertex : alias; localContext.parent.setAttribute(IDREF_ATTR, nameFor(v)); if(visited.contains(localContext.vertex) && localContext.getVertices(SHARED).contains(v) && !locals.contains(v)) { localContext.parent.setAttribute(SHARED_ATTR, "true"); } } else { localContext.parent.setAttribute(ID_ATTR, alias == null ? vertexName : nameFor(alias)); visited.add(localContext.vertex); locals.add(localContext.vertex); addNonPredAttributes(localContext); if(!imported && processingInserts) { // inserts processInserts(localContext); } if(!imported && processingDeletes) { // deletes processDeletes(localContext); } LFGraph graph = localContext.getGraph(); @SuppressWarnings("unchecked") Set outgoingEdges = graph.containsVertex(localContext.vertex) ? graph.outgoingEdgesOf(localContext.vertex) : Collections.EMPTY_SET; if(outgoingEdges.isEmpty()) { // leaf? setPredicateName(localContext); } else { if(!imported && processingSubstitutions) { // do substitutions, if applicable processSubstitutions(localContext); } else { for(LFEdge out : outgoingEdges) { if(imported) { importedVertices.add(out.getTarget()); } // context.graph should be the B graph if imported processNonsubstitutedEdge(localContext, out); } } } } fixLabelReferences(newNode); return newNode; } private String nameFor(LFVertex vertex) { String vn = vertex.getName(); return importedVertices.contains(vertex) ? 
vn + FOREIGN_SUFFIX : vn; } private void processInserts(DLFContext context) { Element optional = null; for(LFEdge ins : graphDifference.insertsFor(context.vertex)) { // check if subgraph is aligned somewhere if(foreignAlignedSubgraphRoots == null) { findForeignAlignedSubgraphRoots(); } if(!foreignAlignedSubgraphRoots.containsKey(ins.getTarget())) { DLFContext ctxt = context.copy(); ctxt.graphPosition = B; // use foreign graph if(optional == null) { ctxt.parent = addOptional(context); } importedVertices.add(ins.getTarget()); // remember that inserted vertex is foreign doInsertDelete(ctxt, ins); } } } private void processDeletes(DLFContext context) { Element optional = null; for(LFEdge del : graphDifference.deletesFor(context.vertex)) { DLFContext ctxt = context.copy(); if(optional == null) { ctxt.parent = addOptional(context); } doInsertDelete(ctxt.copy(), del); } } private void doInsertDelete(DLFContext context, LFEdge edge) { LFVertex trg = edge.getTarget(); DLFContext ctxt = context.copy(); ctxt.vertex = trg; ctxt.parent = addRelation(ctxt, edge.getLabel()); ctxt.addVertex(trg, OPTIONAL); ctxt.parent.appendChild(createDisjunctiveElement(ctxt)); } private void processSubstitutions(DLFContext context) { for(LFEdge outgoing : context.getGraph().outgoingEdgesOf(context.vertex)) { if(graphDifference.substitutionsFor(outgoing).isEmpty()) { // no substitution(s) for this edge? processNonsubstitutedEdge(context.copy(), outgoing); } else { processSubstitutedEdge(context.copy(), outgoing); } } } private void processNonsubstitutedEdge(DLFContext context, LFEdge outgoing) { if(!context.getVertices(PREDICATES).contains(context.vertex)) { setPredicateName(context); } LFVertex trg = outgoing.getTarget(); Set similarTargets = new FilteredSet(context.getVertices(OPTIONAL), new SimilarTargetVertexFilter(trg)); if(similarTargets.isEmpty()) { DLFContext ctxt = context.copy(); ctxt.vertex = trg; ctxt.parent = addRelation(context, outgoing.getLabel()); ctxt.parent.appendChild(createDisjunctiveElement(ctxt)); } else { // target already present as an option for(LFVertex similar : similarTargets) { assimilateAttributes(context.copy(), trg, similar); } } } private void processSubstitutedEdge(DLFContext context, LFEdge outgoing) { processSubstitutedSimilarTarget(context.copy(), outgoing); if(!context.getVertices(PREDICATES).contains(context.vertex)) { processSubstitutedPredicates(context.copy(), outgoing); } // get the substitutions for the outgoing edge Map> subsBySource = graphDifference.substitutionsBySourceFor(outgoing); EdgeMatchFilter predicateFilter = null; for(LFVertex subSource : subsBySource.keySet()) { Set subEdges = subsBySource.get(subSource); if(predicateFilter == null) { predicateFilter = new EdgeMatchFilter(outgoing, TARGET_PREDICATE_MISMATCH); } else { predicateFilter.setBasis(outgoing); } // find the edges matching the outgoing edge's label, and the vertices with different predicates // from the outgoing edge's target vertex FilteredLFEdgeSet identicals = new FilteredLFEdgeSet(subEdges, new MembershipFilter( context.getGraph().outgoingEdgesOf(context.vertex))); Set matchingLabels = new FilteredLFEdgeSet(subEdges, new LabelMatchFilter(outgoing.getLabel())); Set differentPredicates = new FilteredSet( new FilteredLFEdgeSet(matchingLabels, predicateFilter).targetView(), new VisitedFilter()); // deal with the edges with matching labels separately from other substitutions subEdges.removeAll(matchingLabels); differentPredicates.removeAll(identicals.targetView()); subEdges.removeAll(identicals); 
if(subEdges.isEmpty() && differentPredicates.isEmpty()) { // no substitutions to make DLFContext ctxt = context.copyWithVertexMask(LOCAL_ANCESTOR, PREDICATES); fixOptions(ctxt, outgoing.getLabel()); ctxt.parent = addRelation(context, outgoing.getLabel()); ctxt.vertex = outgoing.getTarget(); ctxt.parent.appendChild(createDisjunctiveElement(ctxt)); } else { if(!differentPredicates.isEmpty()) { // handle matching labels but different predicates processDifferentPredicates(context.copy(), outgoing, differentPredicates); } if(!subEdges.isEmpty()) { // handle others processSubstitutedEdges(context.copy(), outgoing, subEdges); } } } } private void processDifferentPredicates(DLFContext context, LFEdge outgoing, Set differentPredicates) { LFEdgeLabel label = outgoing.getLabel(); boolean terminal = context.getGraph().outDegreeOf(context.vertex) == 0; if(!terminal) { terminal = !new FilteredSet( differentPredicates, new TerminalFilter(graphDifference.b)) .isEmpty(); } if(terminal) { DLFContext ctxt = context.copy(); ctxt.vertex = outgoing.getTarget(); if(differentPredicates.size() == 1) { // if we're here at all, it's at least non-empty processSingletonDifferentPredicate(ctxt, outgoing, differentPredicates.iterator().next()); } else { processMultipleDifferentPredicates(ctxt, outgoing, differentPredicates); } } else { // non-terminal, continue recursing through the graph DLFContext ctxt = context.copyWithVertexMask(LOCAL_ANCESTOR, PREDICATES); ctxt.vertex = outgoing.getTarget(); ctxt.parent = addRelation(ctxt, label); ctxt.parent.appendChild(createDisjunctiveElement(ctxt)); } } private void processSingletonDifferentPredicate(DLFContext context, LFEdge outgoing, LFVertex differentPredicate) { LFEdgeLabel label = outgoing.getLabel(); // add relation, then choice point Element newRel = addRelation(context, label); context.parent = newRel; Element choiceElement = addChoice(context); context.parent = choiceElement; // generate the target element, but do not propagate changes to tracked vertices Element targetElement = createDisjunctiveElement(context.copy(true)); if(!vertexAliases.containsKey(differentPredicate)) { vertexAliases.put(differentPredicate, outgoing.getTarget()); } context.vertex = differentPredicate; context.parent.appendChild(createDisjunctiveElement(context.copy(true))); // cleanup: how many new nodes were aliased? 
NodeList newNodes = newRel.getElementsByTagName(NODE_TAG); for(int j = 0; j < newNodes.getLength(); j++) { if(newNodes.item(j).getAttributes().getNamedItem(IDREF_ATTR) == null) { return; // one wasn't aliased } } // if we get here, they all were aliased: use generated target element instead newRel.replaceChild(targetElement, choiceElement); } private void processMultipleDifferentPredicates(DLFContext context, LFEdge outgoing, Set differentPredicates) { LFEdgeLabel label = outgoing.getLabel(); // generate the choice point Element choiceElement = addChoice(context); context.parent = choiceElement; // and the relation, but do not propagate changes to tracked vertices context.parent = addRelation(context, label); context.parent.appendChild(createDisjunctiveElement(context.copy(true))); // add attributes tag, after resetting parent to choice point context.parent = choiceElement; Element atts = addElement(context, ATTS_TAG); // then go through the different predicates, checking for aliases boolean aliased = false; for(LFVertex d : differentPredicates) { context.parent = atts; if(!aliased && !vertexAliases.containsKey(d)) { vertexAliases.put(d, outgoing.getTarget()); aliased = true; } // add new relation for each different pred. context.parent = addRelation(context, label); context.vertex = d; context.parent.appendChild(createDisjunctiveElement(context.copy(true))); } } private void processSubstitutedEdges(DLFContext context, LFEdge outgoing, Set substituedEdges) { LFEdgeLabel label = outgoing.getLabel(); boolean singleton = substituedEdges.size() == 1; // can't be empty if we get here Element choiceElement = addChoice(context); context.parent = choiceElement; Element toAppendTo = singleton ? choiceElement : addElement(context, ATTS_TAG); context.parent = addRelation(context, label); DLFContext ctxt = context.copy(true); ctxt.vertex = outgoing.getTarget(); ctxt.parent.appendChild(createDisjunctiveElement(ctxt)); boolean aliased = false; context.parent = toAppendTo; for(LFEdge s : substituedEdges) { LFVertex t = s.getTarget(); String vPred = context.vertex.getPredicate(), tPred = t.getPredicate(); LFEdgeLabel l = s.getLabel(); context.parent = addRelation(context, l); // shared? if(vPred != null && vPred.equals(tPred) && !label.equals(l)) { Element subNode = addElement(context, NODE_TAG); subNode.setAttribute(IDREF_ATTR, nameFor(context.vertex)); LFVertex sAlias = vertexAliases.get(context.vertex); if(context.getVertices(VISITED).contains(sAlias) && context.getVertices(SHARED).contains(sAlias) && !context.getVertices(LOCAL_ANCESTOR).contains(sAlias)) { subNode.setAttribute(SHARED_ATTR, "true"); } } else { if((singleton || !aliased) && !vertexAliases.containsKey(t)) { vertexAliases.put(t, outgoing.getTarget()); aliased = true; } DLFContext c = context.copy(true); c.vertex = t; importedVertices.add(t); c.graphPosition = B; // use foreign graph for substitution c.parent.appendChild(createDisjunctiveElement(c)); } } } private void processSubstitutedSimilarTarget(DLFContext context, LFEdge outgoing) { LFVertex target = outgoing.getTarget(); Map> subsBySource = graphDifference.substitutionsBySourceFor(outgoing); DLFContext ctxt = context.copy(); // for each substituted edge, look for similar target for(LFVertex subSource : subsBySource.keySet()) { Set similarTargetEdges = new FilteredLFEdgeSet(subsBySource.get(subSource), new SimilarTargetEdgeFilter(ctxt.vertex, outgoing.getLabel())); if(!similarTargetEdges.isEmpty()) { if(similarTargetEdges.size() > 1) { // more than one similar target? 
System.err.println("more than one similar target edge for " + ctxt.vertex + ": " + similarTargetEdges); // TODO figure out what to do about this } assimilateAttributes(ctxt, target, similarTargetEdges.iterator().next().getTarget()); LFVertex hp = ctxt.getGraph().highestLFAncestorOf(target); if(hp == null || hp.equals(outgoing.getSource())) { context.getVertices(SHARED).add(target); ctxt.vertex = target; ctxt.vertices = context.copyVertices(LOCAL_ANCESTOR, PREDICATES); ctxt.parent.appendChild(createDisjunctiveElement(ctxt)); return; // stop after similar target found } } } } private void processSubstitutedPredicates(DLFContext context, LFEdge outgoing) { final String predicate = context.vertex.getPredicate(); if(predicate != null) { Set alternates = new FilteredSet( graphDifference.substitutionsBySourceFor(outgoing).keySet(), new DelegatedFilter(new Filter(){ @Override public boolean allows(String s) { return !predicate.equals(s); } }) { @Override public String delegateValueFor(LFVertex e) { return e.getPredicate(); } }); if(alternates.isEmpty()) { // the simple case, no other predicates involved setPredicateName(context); } else { // add alternates as choice, with predicate an option DLFContext ctxt = context.copy(); ctxt.getVertices(PREDICATES).add(ctxt.vertex); ctxt.parent = addChoice(ctxt); addAttributes(ctxt, PRED_ATTR, predicate); for(LFVertex ap : alternates) { addAttributes(ctxt, PRED_ATTR, ap.getPredicate()); } } } } private Element addRelation(DLFContext context, LFEdgeLabel label) { Element newRel = addElement(context, RELATION_TAG); newRel.setAttribute(NAME_ATTR, label.getName()); return newRel; } private Element addOptional(DLFContext context) { return addElement(context, OPTIONAL_TAG); } private Element addChoice(DLFContext context) { return addElement(context, CHOICE_TAG); } private Element addElement(DLFContext context, String elementName) { Element newEl = document.createElement(elementName); context.parent.appendChild(newEl); return newEl; } private Element addAttributes(DLFContext context, String name, String value) { Element newAtts = document.createElement(ATTS_TAG); context.parent.appendChild(newAtts); newAtts.setAttribute(name, value); return newAtts; } private Element addAttributes(DLFContext context, Map attributes) { Element newAtts = document.createElement(ATTS_TAG); context.parent.appendChild(newAtts); for(Mode m : attributes.keySet()) { String n = m.getName(); if(!n.equals(PRED_ATTR)) { // TODO does this ever happen? 
newAtts.setAttribute(n, attributes.get(m).getName()); } } return newAtts; } private void fixLabelReferences(Element newNode) { NodeList rels = newNode.getChildNodes(); int rlen = rels.getLength(); Map refRels = new HashMap(rlen); for(int k = 0; k < rlen; k++) { Node n = rels.item(k); if(n != null && n.getNodeType() == Node.ELEMENT_NODE && n.getNodeName().equals(RELATION_TAG)) { Element ne = (Element)n; Node m = ne.getFirstChild(); if(m != null && m.getNodeType() == Node.ELEMENT_NODE && m.getNodeName().equals(NODE_TAG)) { Element me = (Element)m; String l = ne.getAttribute(NAME_ATTR); String idref = me.getAttribute(IDREF_ATTR); if(idref == null || idref.length() == 0) { String id = me.getAttribute(ID_ATTR); if(id != null && id.length() > 0) { refRels.put(l, id); } } else { if(idref.equals(refRels.get(l))) { newNode.removeChild(n); } else { refRels.put(l, idref); } } } } } } private void fixOptions(DLFContext context, LFEdgeLabel label) { String cPred = context.vertex.getPredicate(); if(cPred == null) { return; } NodeList ncs = context.parent.getChildNodes(); for(int j = 0; j < ncs.getLength(); j++) { Node c = ncs.item(j); if(c != null && c.getNodeType() == Node.ELEMENT_NODE && c.getNodeName().equals(OPTIONAL_TAG)) { NodeList rs = c.getChildNodes(); for(int k = 0; k < rs.getLength(); k++) { Node r = rs.item(k); if(r != null && r.getNodeType() == Node.ELEMENT_NODE && r.getNodeName().equals(RELATION_TAG)) { Element re = (Element)r; if(label.getName().equals(re.getAttribute(NAME_ATTR))) { Node d = re.getFirstChild(); if(d != null && d.getNodeType() == Node.ELEMENT_NODE && d.getNodeName().equals(NODE_TAG) && cPred.equals(((Element)d).getAttribute(PRED_ATTR))) { context.parent.removeChild(c); break; // don't try to remove more than once, throws DOMException } } } } } } } private void setPredicateName(DLFContext context) { String p = context.vertex.getPredicate(); if(p != null) { context.parent.setAttribute(PRED_ATTR, p); context.getVertices(PREDICATES).add(context.vertex); } } private void addNonPredAttributes(DLFContext context) { for(Mode m : context.vertex.attributeNames()) { String n = m.getName(); if(!n.equals(PRED_ATTR)) { // TODO is this attribute ever present?? 
context.parent.setAttribute(n, context.vertex.getAttributeValue(m).getName()); } } } private void assimilateAttributes(DLFContext context, LFVertex one, LFVertex two) { // copy attribute maps for both vertices Map oneAttrs = new HashMap(one.getAttributeMap()), twoAttrs = new HashMap(two.getAttributeMap()); // add all attributes common to both vertices, remove from both maps Iterator> i = oneAttrs.entrySet().iterator(); while(i.hasNext()) { Map.Entry e = i.next(); Set> tes = twoAttrs.entrySet(); if(tes.contains(e)) { context.parent.setAttribute(e.getKey().getName(), e.getValue().getName()); i.remove(); tes.remove(e); } } if(oneAttrs.isEmpty()) { // if first is empty, add second as optional if(!twoAttrs.isEmpty()) { DLFContext ctxt = context.copy(true); ctxt.parent = addOptional(context); addAttributes(ctxt, twoAttrs); } } else if(twoAttrs.isEmpty()) { // some attributes remain for first vertex addAttributes(context.copy(), oneAttrs); } else { // both are non-empty, make choice DLFContext ctxt = context.copy(true); ctxt.parent = addChoice(context); addAttributes(ctxt, oneAttrs); addAttributes(ctxt, twoAttrs); } } static class SimilarTargetVertexFilter implements Filter { LFVertex vertex; SimilarTargetVertexFilter(LFVertex vertex) { this.vertex = vertex; } @Override public boolean allows(LFVertex v) { String p = vertex.getPredicate(); return p != null && p.equals(v.getPredicate()); } } static class SimilarTargetEdgeFilter extends DelegatedFilter { LFEdgeLabel label; SimilarTargetEdgeFilter(LFVertex vertex, LFEdgeLabel label) { super(new SimilarTargetVertexFilter(vertex)); this.label = label; } @Override public boolean allows(LFEdge e) { return super.allows(e) && label.equals(e.getLabel()); } @Override public LFVertex delegateValueFor(LFEdge e) { return e.getTarget(); } } class TerminalFilter implements Filter { LFGraph graph; TerminalFilter(LFGraph graph) { this.graph = graph; } @Override public boolean allows(LFVertex e) { return graph.outDegreeOf(e) == 0; } } static enum VertexType { LOCAL_ANCESTOR, OPTIONAL, PREDICATES, SHARED, VISITED; } class DLFContext { PhrasePosition graphPosition; LFVertex vertex; Element parent; private Map> vertices; DLFContext(PhrasePosition graphPosition, LFVertex vertex, Element parent) { this(graphPosition, vertex, parent, new EnumMap>(VertexType.class)); } DLFContext(PhrasePosition graphPosition, LFVertex vertex, Element parent, Map> vertices) { this.graphPosition = graphPosition; this.vertex = vertex; this.parent = parent; this.vertices = vertices; } LFGraph getGraph() { return graphDifference.get(graphPosition); } DLFContext copy() { return copy(false); } DLFContext copy(boolean copyVertices) { return copyVertices ? copyWithVertexMask(VertexType.values()) : new DLFContext(graphPosition, vertex, parent, vertices); } DLFContext copyWithVertexMask(VertexType... vertexType) { return new DLFContext(graphPosition, vertex, parent, copyVertices(vertexType)); } Map> copyVertices() { return copyVertices(VertexType.values()); } Map> copyVertices(VertexType... 
vertexType) { Map> m = new EnumMap>(vertices); m.keySet().retainAll(Arrays.asList(vertexType)); return m; } Set getVertices(VertexType vertexType) { Set vs = vertices.get(vertexType); if(vs == null) { vs = new HashSet(); vertices.put(vertexType, vs); } return vs; } boolean addVertex(LFVertex vertex, VertexType vertexType) { return getVertices(vertexType).add(vertex); } } static class LFGraphIterator extends DepthFirstIterator { LFGraphIterator(LFGraph graph) { super(graph); } LFGraphIterator(LFGraph graph, LFVertex startVertex) { super(graph, startVertex); } } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/EdgeMatchFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import static opennlp.ccg.disjunctivizer.MatchType.LABEL_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.LABEL_MISMATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_MISMATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_PREDICATE_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_PREDICATE_MISMATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_MISMATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_PREDICATE_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_PREDICATE_MISMATCH; import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.util.CompositeFilter; import opennlp.ccg.util.Filter; import opennlp.ccg.util.InverseFilter; /** * A filter for LF edges based on a set of {@linkplain #getMatchTypes() match type criteria}. Instances of * this class compare a specified {@linkplain #getBasis() basis edge} based on their match type criteria. * This class extends {@link CompositeFilter}, and the constructors add various filters as members depending on * the criteria in effect. *

* Edge match filters use the following match types as criteria in addition to the ones used by * {@link VertexMatchFilter}: {@link MatchType#LABEL_MATCH}, and {@link MatchType#LABEL_MISMATCH}. * * @see VertexMatchFilter * @see LabelMatchFilter * * @author Scott Martin */ public class EdgeMatchFilter extends CompositeFilter { LFEdge basis; final EnumSet matchTypes; /** * Creates a new edge match filter based on the specified edge and criteria. * @see #EdgeMatchFilter(LFEdge, Collection) */ public EdgeMatchFilter(LFEdge basis, MatchType... matchTypes) { this(basis, Arrays.asList(matchTypes)); } /** * Creates a new edge match filter based on the specified edge, using the specified * match type criteria. Depending on the criteria, this constructor adds instances of * {@link VertexMatchFilter} and {@link LabelMatchFilter} (or their inverses) to the * set of filters composing it. * * @param basis The LF edge to use for comparison. * @param matchTypes The comparison criteria, used to populate this composite filter. * * @throws IllegalArgumentException if basis is null. */ public EdgeMatchFilter(LFEdge basis, Collection matchTypes) { super(); checkBasis(basis); this.basis = basis; this.matchTypes = EnumSet.copyOf(matchTypes); for(MatchType t : matchTypes) { Filter f = null; if(t == LABEL_MATCH || t == LABEL_MISMATCH) { f = new LabelMatchFilter(basis.getLabel()); if(t == LABEL_MISMATCH) { f = new InverseFilter(f); } } else if(t == SOURCE_MATCH || t == SOURCE_MISMATCH || t == SOURCE_PREDICATE_MATCH || t == SOURCE_PREDICATE_MISMATCH) { f = new VertexMatchFilter(basis.getSource(), t); } else if(t == TARGET_MATCH || t == TARGET_MISMATCH || t == TARGET_PREDICATE_MATCH || t == TARGET_PREDICATE_MISMATCH) { f = new VertexMatchFilter(basis.getTarget(), t); } if(f != null) { addFilter(f); } } } private void checkBasis(LFEdge basis) { if(basis == null) { throw new IllegalArgumentException("basis is null"); } } /** * Gets the edge that comparisons are based on. */ public LFEdge getBasis() { return basis; } /** * Sets the edge used for comparisons. * @throws IllegalArgumentException if basis is null. */ public void setBasis(LFEdge basis) { checkBasis(basis); this.basis = basis; } /** * Gets the match type criteria used by this edge match filter. */ public EnumSet getMatchTypes() { return matchTypes; } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/FilteredLFEdgeSet.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import java.util.AbstractCollection; import java.util.Collection; import java.util.Iterator; import java.util.Set; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFVertex; import opennlp.ccg.util.Filter; import opennlp.ccg.util.FilteredSet; /** * A filtered set of LF edges. This class extends {@link FilteredSet} to provide the additional functionality * of edge-based views of its contents: {@link #sourceView()}, {@link #targetView()}, and {@link #labelView()}, * which respectively get the set of source vertices, set of target vertices, and set of labels contained * in the edges in this set. All of these view sets are read-only, so that attempting to add or remove * elements from them (including via the iterator) throws {@link UnsupportedOperationException}. * * @author Scott Martin */ public class FilteredLFEdgeSet extends FilteredSet { private VertexView sourceView = null, targetView = null; private LabelView labelView = null; /** * Creates a new filtered edge set based on the specified underlying edge set and edge filter. */ public FilteredLFEdgeSet(Set edges, Filter edgeFilter) { super(edges, edgeFilter); } /** * Gets a view of this filtered edge set as a set of LF vertices that are the * {@linkplain LFEdge#getSource() source vertices} for each edge in this set. * @return A set containing every LF vertex that is the source vertex of some edge in this set. * Note that the returned collection is immutable, and may contain duplicate vertices. */ public Collection sourceView() { return (sourceView == null) ? (sourceView = new VertexView(true)) : sourceView; } /** * Gets a view of this filtered edge set as a set of LF vertices that are the * {@linkplain LFEdge#getTarget() target vertices} for each edge in this set. * @return A set containing every LF vertex that is the target vertex of some edge in this set. * Note that the returned collection is immutable, and may contain duplicate vertices. */ public Collection targetView() { return (targetView == null) ? (targetView = new VertexView(false)) : targetView; } /** * Gets a view of this filtered edge set as a set of LF vertices that are the * {@linkplain LFEdge#getLabel() labels} for each edge in this set. * @return A set containing every LF edge label that is the label of some edge in this set. * Note that the returned collection is immutable, and may contain duplicate labels. */ public Collection labelView() { return (labelView == null) ? (labelView = new LabelView()) : labelView; } abstract class ComponentView extends AbstractCollection { abstract T componentOf(LFEdge edge); @Override public Iterator iterator() { return new Iterator() { private Iterator i = FilteredLFEdgeSet.this.iterator(); @Override public boolean hasNext() { return i.hasNext(); } @Override public T next() { // don't have to worry whether hasNext() is true, iterator should throw exception if not return componentOf(i.next()); } @Override public void remove() { throw new UnsupportedOperationException(); } }; } @Override public int size() { return FilteredLFEdgeSet.this.size(); } } class VertexView extends ComponentView { boolean source; VertexView(boolean source) { this.source = source; } @Override LFVertex componentOf(LFEdge edge) { return source ? 
edge.getSource() : edge.getTarget(); } } class LabelView extends ComponentView { @Override LFEdgeLabel componentOf(LFEdge edge) { return edge.getLabel(); } } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/LFGraphDifference.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import static opennlp.ccg.alignment.PhrasePosition.A; import static opennlp.ccg.alignment.PhrasePosition.B; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_ALIGNED; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_UNALIGNED; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_ALIGNED; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_UNALIGNED; import java.util.AbstractMap; import java.util.AbstractSet; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import java.util.Map.Entry; import opennlp.ccg.alignment.Alignment; import opennlp.ccg.alignment.PhrasePosition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFGraph; import opennlp.ccg.hylo.graph.LFVertex; import opennlp.ccg.util.CompositeFilter; import opennlp.ccg.util.Filter; import opennlp.ccg.util.FilteredMap; import opennlp.ccg.util.FilteredSet; import opennlp.ccg.util.VisitedFilter; /** * Represents the difference between two {@link LFGraph}s that characterizes their difference as a set * of edits: either {@link #inserts()}, {@link #deletes()}, or {@link #substitutions()}. These edits are * determined by a specified {@linkplain #getAlignment() alignment} between the phrases the graphs * are supposed to represent. All of the sets of edges returned by this class, and the convenience maps * build on top of them, are read-only. Attempting to add or remove elements or keys from any of these * (including via any of the iterators) throws an {@link UnsupportedOperationException}. *

* This class also provides the convenience methods {@link #insertsFor(LFVertex)} and * {@link #deletesFor(LFVertex)}, for getting just the inserts or deletes for a specified vertex. * The convenience methods {@link #substitutionsFor(LFEdge)} gets the set of substitutions correspoding * to a given edge, and {@link #substitutionsBySource()} and {@link #substitutionsBySourceFor(LFEdge)} * are similar methods that provide maps whose keys are the source vertices of the substituted edges. * * @author Scott Martin */ public class LFGraphDifference { final LFGraph a, b; final Alignment alignment; private Set deletes, inserts, substitutions; /** * Creates a new graph difference between a and b, as determined by the specified * alignment. * @param a The {@linkplain PhrasePosition#A A-position} graph. * @param b The {@linkplain PhrasePosition#B B-position} graph. * @param alignment An alignment between a and b where the * {@linkplain PhrasePosition#A A-position} indices are understood to correspond to a and * {@linkplain PhrasePosition#B B-position} indices are understood to correspond to b. * @throws IllegalArgumentException If either graph is null, or if the alignment is * null. */ public LFGraphDifference(LFGraph a, LFGraph b, Alignment alignment) { checkGraph(a, A); checkGraph(b, B); if(alignment == null) { throw new IllegalArgumentException("alignment is null"); } this.a = a; this.b = b; this.alignment = alignment; } private void checkGraph(LFGraph g, PhrasePosition pos) { if(g == null) { throw new IllegalArgumentException(pos.name() + " graph is null"); } } /** * Gets the {@linkplain PhrasePosition#A A-position} graph. */ public LFGraph getA() { return get(A); } /** * Gets the {@linkplain PhrasePosition#B B-position} graph. */ public LFGraph getB() { return get(B); } /** * Gets the graph in the specified position. * @param position The position to retrieve a graph for. * @return The value of {@link #getA()} if position is {@link PhrasePosition#A}, and * the value of {@link #getB()} otherwise. */ public LFGraph get(PhrasePosition position) { return (position == A) ? a : b; } /** * Gets the alignment used to determine the edits between the two graphs. */ public Alignment getAlignment() { return alignment; } /** * Computes a hash code for this graph difference based on its graphs and the * alignment between them. */ @Override public int hashCode() { return 31 * a.hashCode() + b.hashCode() + alignment.hashCode(); } /** * Tests whether this LF graph difference is equivalent to another by comparing their * graphs and the alignment between them. */ @Override public boolean equals(Object obj) { if(obj instanceof LFGraphDifference) { LFGraphDifference diff = (LFGraphDifference)obj; return a.equals(diff.a) && b.equals(diff.b) && alignment.equals(diff.alignment); } return false; } /** * Gets a string representation of this graph difference. */ @Override public String toString() { StringBuilder sb = new StringBuilder("difference for graphs: "); for(PhrasePosition pos : PhrasePosition.values()) { sb.append(pos); sb.append(": "); sb.append(get(pos)); sb.append(", "); } sb.append("alignment: "); sb.append(alignment.toString()); return sb.toString(); } /** * Gets an LF graph difference that is the reverse of the present one. 
* @return An LF graph difference whose {@linkplain PhrasePosition#A A-position} graph is the value of this * difference's {@link #getB()}, whose {@linkplain PhrasePosition#B B-position} graph is the value of this * difference's {@link #getA()}, and whose alignments are the {@linkplain Alignment#reverse() reverse} of * this difference's {@link #getAlignment()}. */ public LFGraphDifference reverse() { return new LFGraphDifference(b, a, alignment.reverse()); } /** * Gets the deletes for this graph difference. * @return The set of edges in the {@linkplain PhrasePosition#A A-position} graph that have an aligned * {@linkplain LFEdge#getSource() source vertex} and an unaligned * {@linkplain LFEdge#getTarget() target vertex}. * * @see AlignedEdgeFilter */ public Set deletes() { return (deletes == null) ? (deletes = doDeletes(A)) : deletes; } /** * Gets the inserts for this graph difference. * @return The set of edges in the {@linkplain PhrasePosition#B B-position} graph that have an aligned * {@linkplain LFEdge#getSource() source vertex} and an unaligned * {@linkplain LFEdge#getTarget() target vertex}. * * @see AlignedEdgeFilter */ public Set inserts() { return (inserts == null) ? (inserts = doDeletes(B)) : inserts; } Set doDeletes(PhrasePosition keyPosition) { return Collections.unmodifiableSet(new FilteredLFEdgeSet(get(keyPosition).edgeSet(), new AlignedEdgeFilter(alignment.asMap(keyPosition).keySet(), SOURCE_ALIGNED, TARGET_UNALIGNED))); } /** * Gets the inserts for a specified vertex. * @param vertex The vertex to return the inserts for. * @return The subset of {@link #inserts()} whose {@linkplain LFEdge#getSource() source} index is among the * {@linkplain Alignment#getTargets(Integer) targets} for the specified vertex, or {@link Collections#EMPTY_SET} * if none exist. * * @see AlignedEdgeFilter */ @SuppressWarnings("unchecked") public Set insertsFor(LFVertex vertex) { Set indices = alignment.getTargets(vertex.getIndex()); return (indices.isEmpty()) ? Collections.EMPTY_SET : new FilteredLFEdgeSet(inserts(), new AlignedEdgeFilter(indices, SOURCE_ALIGNED)); } /** * Gets the deletes for a specified vertex. * @param vertex The vertex to get the deletes for. * @return The subset of {@link #deletes()} whose {@linkplain LFEdge#getSource() source vertex} is * the specified vertex. * * @see VertexMatchFilter */ public Set deletesFor(LFVertex vertex) { return new FilteredLFEdgeSet(deletes(), new VertexMatchFilter(vertex, SOURCE_MATCH)); } /** * Gets the substitutions for this graph difference. * @return The subset of the {@linkplain PhrasePosition#B B-position} graph's edges for which there * exists an edge in the {@linkplain PhrasePosition#A A-position} graph that meets the following * conditions: *

 *   1. The B edge's source is aligned to the A edge's source, but the B edge's target is not.
 *   2. The B edge's target is aligned to the A edge's target, but the B edge's source is not.
* * @see CompositeFilter */ public Set substitutions() { if(substitutions == null) { substitutions = new LinkedHashSet(); Set bEdges = b.edgeSet(); AlignedEdgeFilter sourceFilter = null, targetFilter = null; CompositeFilter filter = new CompositeFilter(); for(LFEdge aEdge : a.edgeSet()) { Set sMaps = alignment.getTargets(aEdge.getSource().getIndex()), tMaps = alignment.getTargets(aEdge.getTarget().getIndex()); if(!sMaps.isEmpty() && !tMaps.isEmpty()) { if(sourceFilter == null) { sourceFilter = new AlignedEdgeFilter(sMaps, SOURCE_ALIGNED, TARGET_UNALIGNED); targetFilter = new AlignedEdgeFilter(tMaps, TARGET_ALIGNED, SOURCE_UNALIGNED); filter.addFilter(sourceFilter); filter.addFilter(targetFilter); } else { sourceFilter.setAlignmentIndices(sMaps); targetFilter.setAlignmentIndices(tMaps); } substitutions.addAll(new FilteredLFEdgeSet(bEdges, filter)); } } } return Collections.unmodifiableSet(substitutions); } /** * Gets the substitutions for the specified edge. * @param edge The edge to get substitutions for. * @return The subset of {@link #substitutions()} whose source is aligned to the edge's source and * whose target is aligned to the edge's target, or {@link Collections#EMPTY_SET} if none exist. */ @SuppressWarnings("unchecked") public Set substitutionsFor(LFEdge edge) { Set srcMapsTo = alignment.getTargets(edge.getSource().getIndex()), trgMapsTo = alignment.getTargets(edge.getTarget().getIndex()); return (srcMapsTo.isEmpty() || trgMapsTo.isEmpty()) ? Collections.EMPTY_SET : Collections.unmodifiableSet(new FilteredLFEdgeSet(substitutions(), new CompositeFilter(new AlignedEdgeFilter(srcMapsTo, SOURCE_ALIGNED), new AlignedEdgeFilter(trgMapsTo, TARGET_ALIGNED)))); } /** * Gets a map view of the substitutions in this graph difference. * @return A map whose keys are the source vertices in the set of {@link #substitutions()} and whose values * are the edges whose {@linkplain LFEdge#getSource() source vertex} is the same as the corresponding key. * If there are no substitutions, {@link Collections#EMPTY_MAP} is returned. *

* Note that the returned map is * read-only, that is, both its {@link Map#put(Object, Object)} method and its * {@linkplain Map#entrySet() entry set}'s iterator's {@link Iterator#remove()} method throw an * {@link UnsupportedOperationException}. Also, the members of the returned map's entry set are immutable, * so that their {@link Entry#setValue(Object)} methods also throw an {@link UnsupportedOperationException}. */ @SuppressWarnings("unchecked") public Map> substitutionsBySource() { Set subs = substitutions(); return subs.isEmpty() ? Collections.EMPTY_MAP : Collections.unmodifiableMap( new FilteredMap>(new SourceView(), new VisitedFilter())); } /** * Gets a map view of the substitutions for the specified edge. * @param edge The edge to get substitutions for. * @return The subset of {@link #substitutionsBySource()} in which the keys are aligned to the specified edge's * source and the associated values' targets are aligned to the specified edge's target. Since this map is * based on the one returned by {@link #substitutionsBySource()}, it is also read-only, and the same stipulations * apply to it. * * @see #substitutionsBySource() */ public Map> substitutionsBySourceFor(LFEdge edge) { Map> subsBySource = substitutionsBySource(); return subsBySource.isEmpty() ? subsBySource : new SubstitutedSourceView(subsBySource, edge); } class SourceView extends AbstractMap { @Override public Set> entrySet() { return new AbstractSet>() { Set subs = substitutions(); @Override public int size() { return subs.size(); } @Override public Iterator> iterator() { return new Iterator>() { private Iterator edgeIterator = null; @Override public boolean hasNext() { if(edgeIterator == null) { edgeIterator = subs.iterator(); } return edgeIterator.hasNext(); } @Override public Entry next() { if(edgeIterator == null) { edgeIterator = subs.iterator(); } LFVertex src = edgeIterator.next().getSource(); return new SimpleImmutableEntry(src, new FilteredLFEdgeSet(subs, new VertexMatchFilter(src, SOURCE_MATCH))); } @Override public void remove() { // subs.iterator() should be read-only, but just in case throw new UnsupportedOperationException(); } }; } }; } } class SubstitutedSourceView extends AbstractMap> { Map> sourceView; LFEdge edge; private Set>> entrySet; SubstitutedSourceView(Map> sourceView, LFEdge edge) { this.sourceView = sourceView; this.edge = edge; } @Override public Set>> entrySet() { return (entrySet == null) ? 
(entrySet = new EntrySet()) : entrySet; } class EntrySet extends AbstractSet>> { private Set>> entries; Set srcMapsTo = alignment.getTargets(edge.getSource().getIndex()), trgMapsTo = alignment.getTargets(edge.getTarget().getIndex()); Set>> entries() { if(entries == null) { entries = new FilteredSet>>( sourceView.entrySet(), new Filter>>() { @Override public boolean allows(Entry> e) { if(srcMapsTo.contains(e.getKey().getIndex())) { for(LFEdge t : e.getValue()) { if(trgMapsTo.contains(t.getTarget().getIndex())) { return true; } } } return false; } } ); } return entries; } @Override public int size() { return entries().size(); } @Override public Iterator>> iterator() { return new Iterator>>() { private Iterator>> i = entries().iterator(); @Override public boolean hasNext() { return i.hasNext(); } @Override public Entry> next() { Entry> e = i.next(); return new SimpleImmutableEntry>( e.getKey(), new FilteredLFEdgeSet(e.getValue(), new Filter() { @Override public boolean allows(LFEdge e) { return trgMapsTo.contains(e.getTarget().getIndex()); } } ) ); } @Override public void remove() { // source view is already read-only, so just in case throw new UnsupportedOperationException(); } }; } } } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/LabelMatchFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.util.Filter; /** * A filter for edges based on a comparison of their {@linkplain LFEdge#getLabel() labels}. Instances of * this class compare a specified {@linkplain #getBasis() basis edge label}, so that the * {@link #allows(LFEdge)} method returns true if it is * {@linkplain LFEdgeLabel#equals(Object) equivalent to} the specified edge's label. * * @author Scott Martin */ public class LabelMatchFilter implements Filter { LFEdgeLabel basis; /** * Creates a new label match filter that will compare edge labels to the specified label. * @param basis The label to be used for comparison. * @throws IllegalArgumentException if basis is null. */ public LabelMatchFilter(LFEdgeLabel basis) { checkBasis(basis); this.basis = basis; } private void checkBasis(LFEdgeLabel basis) { if(basis == null) { throw new IllegalArgumentException("basis is null"); } } /** * Gets the label used as the basis for comparison in the {@link #allows(LFEdge)} method. * @return The edge label specified at creation. 
* * @see #LabelMatchFilter(LFEdgeLabel) */ public LFEdgeLabel getBasis() { return basis; } /** * Sets the edge label used as the basis for comparison. * @throws IllegalArgumentException if basis is null. */ public void setBasis(LFEdgeLabel basis) { checkBasis(basis); this.basis = basis; } /** * Tests whether the specified edge's label is equivalent to this filter's {@linkplain #getBasis() basis * edge}. * @param edge The edge to test. * @return true if the basis edge label is equivalent to edge.getLabel() based on a * comparison via their {@link LFEdgeLabel#equals(Object)} method. */ @Override public boolean allows(LFEdge edge) { return basis.equals(edge.getLabel()); } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/MatchType.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFVertex; /** * A set of enum constants for identifying the matching criteria used by a {@link MatchTypeFilter}. * Not all match type filters will use all of the match type criteria contained in this enum. * * @see LFVertex * @see LFEdge * * @author Scott Martin */ public enum MatchType { /** * Matching source vertices. */ SOURCE_MATCH, /** * Matching target vertices. */ TARGET_MATCH, /** * Matching edge labels. */ LABEL_MATCH, /** * Matching predicates for source vertices. */ SOURCE_PREDICATE_MATCH, /** * Matching predicates for target vertices. */ TARGET_PREDICATE_MATCH, /** * Mismatching source vertices. */ SOURCE_MISMATCH, /** * Mismatching target vertices. */ TARGET_MISMATCH, /** * Mismatching edge labels. */ LABEL_MISMATCH, /** * Mismatching source vertex predicates. */ SOURCE_PREDICATE_MISMATCH, /** * Mismatching target vertex predicates. */ TARGET_PREDICATE_MISMATCH, /** * Source vertex is aligned. */ SOURCE_ALIGNED, /** * Source vertex is not aligned. */ SOURCE_UNALIGNED, /** * Target vertex is aligned. */ TARGET_ALIGNED, /** * Target vertex is unaligned. */ TARGET_UNALIGNED } ================================================ FILE: src/opennlp/ccg/disjunctivizer/MatchTypeFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import java.util.Arrays; import java.util.Collection; import java.util.EnumSet; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.util.Filter; /** * Abstract class for filters that allow {@link LFEdge}s based on criteria indicated by the set of * {@linkplain #getMatchTypes() match types} they use. * * @author Scott Martin * */ public abstract class MatchTypeFilter implements Filter { /** * The set of match types used as criteria by this filter. */ protected final EnumSet matchTypes; /** * Creates a new match type filter based on the specified match types. * @see #MatchTypeFilter(Collection) */ protected MatchTypeFilter(MatchType... matchTypes) { this(Arrays.asList(matchTypes)); } /** * Creates a new match type filter based on the specified match types. * @param matchTypes The collection of match types to use. The specified collection is * copied via {@link EnumSet#copyOf(Collection)}. */ protected MatchTypeFilter(Collection matchTypes) { this.matchTypes = EnumSet.copyOf(matchTypes); } /** * Gets the match types used by this match type filter. */ public EnumSet getMatchTypes() { return matchTypes; } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/VertexMatchFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.disjunctivizer; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_MISMATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_PREDICATE_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.SOURCE_PREDICATE_MISMATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_MISMATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_PREDICATE_MATCH; import static opennlp.ccg.disjunctivizer.MatchType.TARGET_PREDICATE_MISMATCH; import java.util.Collection; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFVertex; /** * A filter that matches vertices based on a basis vertex and a set of match type criteria. *

* Instances of this class use the following match types: {@link MatchType#SOURCE_MATCH}, * {@link MatchType#SOURCE_MISMATCH}, {@link MatchType#TARGET_MATCH}, {@link MatchType#TARGET_MISMATCH}, * {@link MatchType#SOURCE_PREDICATE_MATCH}, {@link MatchType#SOURCE_PREDICATE_MISMATCH}, * {@link MatchType#TARGET_PREDICATE_MATCH}, {@link MatchType#TARGET_PREDICATE_MISMATCH}. * * @author Scott Martin */ public class VertexMatchFilter extends MatchTypeFilter { LFVertex basis; /** * Creates a new vertex match filter using the specified vertex as a basis for comparison and the * specified match type criteria. * @throws IllegalArgumentException if basis is null. * @see #VertexMatchFilter(LFVertex, Collection) */ public VertexMatchFilter(LFVertex basis, MatchType... matchTypes) { super(matchTypes); checkBasis(basis); this.basis = basis; } /** * Creates a new vertex match filter using the specified vertex as a basis for comparison and the * specified match type criteria. * @param basis The vertex to use as a basis for comparison. * @param matchTypes The set of match type criteria to use. * @throws IllegalArgumentException if basis is null. */ public VertexMatchFilter(LFVertex basis, Collection matchTypes) { super(matchTypes); checkBasis(basis); this.basis = basis; } private void checkBasis(LFVertex basis) { if(basis == null) { throw new IllegalArgumentException("basis is null"); } } /** * Gets the vertex that is the basis for comparison in this filter's {@link #allows(LFEdge)} method. */ public LFVertex getBasis() { return basis; } /** * Sets the vertex used as a basis for comparison. * @throws IllegalArgumentException if basis is null. */ public void setBasis(LFVertex basis) { checkBasis(basis); this.basis = basis; } /** * Tests whether a specified edge is allowed based on the match type criteria in effect and the * vertex used as a basis for comparison. * @param edge The edge to test. * @return false if {@link #getMatchTypes()} contains *

 *   • {@link MatchType#SOURCE_MATCH}, but the basis vertex does not equal the edge's source,
 *   • {@link MatchType#SOURCE_MISMATCH}, but the basis vertex is equal to the edge's source,
 *   • {@link MatchType#TARGET_MATCH}, but the basis vertex does not equal the edge's target,
 *   • {@link MatchType#TARGET_MISMATCH}, but the basis vertex is equal to the edge's target,
 *   • {@link MatchType#SOURCE_PREDICATE_MATCH}, but the basis vertex's predicate does not equal the edge's source vertex's predicate,
 *   • {@link MatchType#SOURCE_PREDICATE_MISMATCH}, but the basis vertex's predicate is equal to the edge's source vertex's predicate,
 *   • {@link MatchType#TARGET_PREDICATE_MATCH}, but the basis vertex's predicate does not equal the edge's target vertex's predicate,
 *   • {@link MatchType#TARGET_PREDICATE_MISMATCH}, but the basis vertex's predicate is equal to the edge's target vertex's predicate,
* and true otherwise. */ @Override public boolean allows(LFEdge edge) { for(MatchType t : matchTypes) { if(t == SOURCE_MATCH && !basis.equals(edge.getSource())) { return false; } else if(t == SOURCE_MISMATCH && basis.equals(edge.getSource())) { return false; } else if(t == TARGET_MATCH && !basis.equals(edge.getTarget())) { return false; } else if(t == TARGET_MISMATCH && basis.equals(edge.getTarget())) { return false; } else if(basis.getPredicate() != null) { if(t == SOURCE_PREDICATE_MATCH && !basis.getPredicate().equals(edge.getSource().getPredicate())) { return false; } else if(t == SOURCE_PREDICATE_MISMATCH && basis.getPredicate().equals(edge.getSource().getPredicate())) { return false; } else if(t == TARGET_PREDICATE_MATCH && !basis.getPredicate().equals(edge.getTarget().getPredicate())) { return false; } else if(t == TARGET_PREDICATE_MISMATCH && basis.getPredicate().equals(edge.getTarget().getPredicate())) { return false; } } } return true; } } ================================================ FILE: src/opennlp/ccg/disjunctivizer/package.html ================================================

Package for the disjunctivizer, which creates a disjunctive LF XML structure based on an {@link opennlp.ccg.disjunctivizer.LFGraphDifference}. An LF graph difference characterizes the difference between two {@link opennlp.ccg.hylo.graph.LFGraph graphs}, relative to an {@link opennlp.ccg.alignment.Alignment} between them, in terms of the edits needed to turn one into the other: inserts, deletes, and substitutions.
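As a rough, hypothetical usage sketch (not part of the package sources: the graphA, graphB, and alignment values, and the GraphDifferenceSketch wrapper, are invented names assumed to be supplied elsewhere), the edit sets exposed by LFGraphDifference might be inspected like this:

import java.util.Set;

import opennlp.ccg.alignment.Alignment;
import opennlp.ccg.disjunctivizer.LFGraphDifference;
import opennlp.ccg.hylo.graph.LFEdge;
import opennlp.ccg.hylo.graph.LFGraph;

// Illustrative helper, not part of OpenCCG: summarizes the edits between two LF graphs.
class GraphDifferenceSketch {
    static void summarize(LFGraph graphA, LFGraph graphB, Alignment alignment) {
        // graphA is treated as the A-position graph, graphB as the B-position graph.
        LFGraphDifference diff = new LFGraphDifference(graphA, graphB, alignment);
        // deletes(): A-graph edges with an aligned source but unaligned target
        Set<LFEdge> deletes = diff.deletes();
        // inserts(): B-graph edges with an aligned source but unaligned target
        Set<LFEdge> inserts = diff.inserts();
        // substitutions(): B-graph edges standing in for aligned A-graph edges
        Set<LFEdge> substitutions = diff.substitutions();
        System.out.println("deletes=" + deletes.size() + ", inserts=" + inserts.size()
                + ", substitutions=" + substitutions.size());
    }
}

The returned sets are read-only, so the sketch only iterates and counts them; attempting to modify them would throw UnsupportedOperationException, as documented above.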

The {@link opennlp.ccg.disjunctivizer.FilteredLFEdgeSet} class extends {@link opennlp.ccg.util.FilteredSet} to also provide views of its edges by {@linkplain opennlp.ccg.hylo.graph.LFEdge#getSource() source}, {@linkplain opennlp.ccg.hylo.graph.LFEdge#getTarget() target}, or {@linkplain opennlp.ccg.hylo.graph.LFEdge#getLabel() label}. The various filters in this package offer considerable flexibility in selecting edges: by whether their source, target, or label matches a given edge's (or just their predicates), and by whether their source or target is aligned. These filters, and the {@link opennlp.ccg.disjunctivizer.FilteredLFEdgeSet} class, are used internally by the {@link opennlp.ccg.disjunctivizer.Disjunctivizer} for generating disjunctive LFs.
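For illustration only (the edges set, vertex, and EdgeViewSketch names below are invented), a filtered, read-only view can be built the same way LFGraphDifference.deletesFor(LFVertex) does internally, by combining a VertexMatchFilter with a FilteredLFEdgeSet:

import java.util.Collection;
import java.util.Set;

import opennlp.ccg.disjunctivizer.FilteredLFEdgeSet;
import opennlp.ccg.disjunctivizer.MatchType;
import opennlp.ccg.disjunctivizer.VertexMatchFilter;
import opennlp.ccg.hylo.graph.LFEdge;
import opennlp.ccg.hylo.graph.LFEdgeLabel;
import opennlp.ccg.hylo.graph.LFVertex;

// Illustrative helper, not part of OpenCCG: lists the labels on edges leaving one vertex.
class EdgeViewSketch {
    static void showOutgoingLabels(Set<LFEdge> edges, LFVertex vertex) {
        // Keep only edges whose source vertex equals the given vertex.
        FilteredLFEdgeSet outgoing =
                new FilteredLFEdgeSet(edges, new VertexMatchFilter(vertex, MatchType.SOURCE_MATCH));
        // labelView() and targetView() are read-only views and may contain duplicates.
        Collection<LFEdgeLabel> labels = outgoing.labelView();
        Collection<LFVertex> targets = outgoing.targetView();
        System.out.println(labels.size() + " outgoing edges, labels: " + labels
                + ", targets: " + targets);
    }
}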

================================================ FILE: src/opennlp/ccg/grammar/AbstractApplicationRule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * Super class for application rules. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.8 $, $Date: 2009/12/21 03:27:18 $ */ public abstract class AbstractApplicationRule extends AbstractRule { private static final long serialVersionUID = 1L; protected Slash _functorSlash; /** Returns an XML element representing the rule. */ public Element toXml(String dir) { Element retval = new Element("application"); retval.setAttribute("dir", dir); return retval; } public int arity() { return 2; } protected List apply(Category xyCat, Category yCat) throws UnifyFailure { if (xyCat instanceof ComplexCat) { ComplexCat xyCurCat = (ComplexCat)xyCat; Arg xyOuter = xyCurCat.getOuterArg(); List results; _headCats.clear(); if (xyOuter instanceof BasicArg) { xyOuter.unifySlash(_functorSlash); Category xyOuterCat = ((BasicArg)xyOuter).getCat(); Substitution sub = new GSubstitution(); GUnifier.unify(xyOuterCat, yCat, sub); results = new ArrayList(1); ((GSubstitution)sub).condense(); Category result = (Category) xyCurCat.getResult().fill(sub); appendLFs(xyCat, yCat, result, sub); results.add(result); Slash xyOuterSlash = ((BasicArg)xyOuter).getSlash(); _headCats.add(xyOuterSlash.isModifier() ? yCat : xyCat); } else if (xyOuter instanceof SetArg) { SetArg xyOuterSet = (SetArg)xyOuter; results = new ArrayList(xyOuterSet.size()); for (int i=0; i apply(Category xyCat, Category yzCat) throws UnifyFailure { if (xyCat instanceof ComplexCat && yzCat instanceof ComplexCat) { List results = new ArrayList(1); _headCats.clear(); ComplexCat xyCC = (ComplexCat) xyCat; ComplexCat yzCC = (ComplexCat) yzCat; Arg xyOuter = xyCC.getOuterArg(); if (xyOuter instanceof BasicArg) { Slash xySlash = ((BasicArg) xyOuter).getSlash(); xySlash.unifyCheck(_functorSlash); if (eisner() && xySlash.isHarmonicCompositionResult()) throw new UnifyFailure(); Category xyOuterCat = ((BasicArg) xyOuter).getCat(); if (xyOuterCat instanceof AtomCat) { // e.g. 
s/s Y/Z ArgStack zStack = yzCC.getArgStack(); zStack.slashesUnify(_argSlash); Substitution sub = new GSubstitution(); GUnifier.unify(xyOuterCat, yzCC.getTarget(), sub); xySlash = (Slash) xySlash.fill(sub); xySlash.unifyCheck(_functorSlash); Category outcome = createResult(xyCC.getResult(), zStack, xySlash, sub); appendLFs(xyCat, yzCat, outcome, sub); results.add(outcome); _headCats.add(xySlash.isModifier() ? yzCat : xyCat); } else if (xyOuterCat instanceof ComplexCat) { // e.g. s/(s/n) Y/Z Substitution sub = new GSubstitution(); ArgStack zStack = composeComplexY((ComplexCat) xyOuterCat, xySlash, yzCC, sub); xySlash = (Slash) xySlash.fill(sub); xySlash.unifyCheck(_functorSlash); Category outcome = createResult(xyCC.getResult(), zStack, xySlash, sub); appendLFs(xyCat, yzCat, outcome, sub); results.add(outcome); _headCats.add(xySlash.isModifier() ? yzCat : xyCat); } } else if (xyOuter instanceof SetArg) { // e.g. s/{s,n} Y/Z Category yzTarget = yzCC.getTarget(); SetArg xyOuterSet = (SetArg) xyOuter; int targetIndex = xyOuterSet.indexOf(yzTarget); if (targetIndex > -1) { Slash xySlash = xyOuterSet.get(targetIndex).getSlash(); xySlash.unifyCheck(_functorSlash); if (eisner() && xySlash.isHarmonicCompositionResult()) throw new UnifyFailure(); Substitution sub = new GSubstitution(); GUnifier.unify(xyOuterSet.getCat(targetIndex), yzTarget, sub); Category result = xyCC.copy(); ((ComplexCat) result).setOuterArgument(xyOuterSet.copyWithout(targetIndex)); ArgStack zStack = yzCC.getArgStack(); zStack.slashesUnify(_argSlash); Category outcome = createResult(result, zStack, xySlash, sub); appendLFs(xyCat, yzCat, outcome, sub); results.add(outcome); _headCats.add(xySlash.isModifier() ? yzCat : xyCat); } else { boolean success = false; for (int i = 0; i < xyOuterSet.size(); i++) { BasicArg yInSet = xyOuterSet.get(i); if (yInSet.getCat() instanceof ComplexCat) { Slash xySlash = yInSet.getSlash(); xySlash.unifyCheck(_functorSlash); if (eisner() && xySlash.isHarmonicCompositionResult()) throw new UnifyFailure(); ComplexCat yCat = (ComplexCat) yInSet.getCat(); Substitution sub = new GSubstitution(); ArgStack zStack = composeComplexY((ComplexCat) yCat, xySlash, yzCC, sub); xySlash = (Slash) xySlash.fill(sub); xySlash.unifyCheck(_functorSlash); Category result = xyCC.copy(); ((ComplexCat) result).setOuterArgument(xyOuterSet.copyWithout(i)); Category outcome = createResult(result, zStack, xySlash, sub); appendLFs(xyCat, yzCat, outcome, sub); results.add(outcome); _headCats.add(xySlash.isModifier() ? yzCat : xyCat); success = true; } } if (!success) { throw new UnifyFailure(); } } } else { throw new UnifyFailure(); } return results; } else { throw new UnifyFailure(); } } private Category createResult(Category result, ArgStack zStack, Slash xySlash, Substitution sub) throws UnifyFailure { ((GSubstitution) sub).condense(); result = (Category) result.fill(sub); ArgStack newStack = zStack.fill(sub); if (!_isHarmonic && (!xySlash.sameDirAsModality() || zStack .containsContrarySlash())) { newStack.deepMap(INERTIZER_FCN); } newStack.get(0).setSlashModifier(false); if (_isHarmonic && useEisnerConstraints) newStack.setSlashHarmonicCompositionResult(true); if (result instanceof ComplexCat) { ((ComplexCat) result).add(newStack); } else { result = new ComplexCat((TargetCat) result, newStack); } return result; } /** * A function that tries to unify the value ant=+ into feature structures. 
*/ private static ModFcn INERTIZER_FCN = new ModFcn() { public void modify(Mutable m) { if (m instanceof Slash) { ((Slash) m).setAbility("inert"); } } }; private ArgStack composeComplexY(ComplexCat xyOuterCC, Slash xySlash, ComplexCat yzCC, Substitution sub) throws UnifyFailure { GUnifier.unify(xyOuterCC.getTarget(), yzCC.getTarget(), sub); ArgStack zStack = yzCC.getArgStack(); if (xyOuterCC.containsDollarArg()) { // e.g. s$/(s$\n) s\n/n xyOuterCC.getArgStack().unifyPrefix(zStack, zStack.size() - 1, sub); zStack = zStack.subList(zStack.size() - 1); zStack.slashesUnify(_argSlash); xySlash = (Slash) xySlash.fill(sub); xySlash.unifyCheck(_functorSlash); return zStack; } else if (xyOuterCC.arity() == 1) { ArgStack yzStack = yzCC.getArgStack(); if (!(xyOuterCC.getArg(0) instanceof BasicArg)) { throw new UnifyFailure(); } BasicArg xyOuterOuter = (BasicArg) xyOuterCC.getArg(0); Arg yzStackInner = yzStack.get(0); if (yzStackInner instanceof SetArg) { // e.g. s/(s/n) s/{s,n} SetArg yzSetArg = (SetArg) yzStackInner; int iaIndex = yzSetArg.indexOf(xyOuterOuter); if (iaIndex == -1) throw new UnifyFailure(); xyOuterOuter.unify(yzSetArg.get(iaIndex), sub); xySlash = (Slash) xySlash.fill(sub); xySlash.unifyCheck(_functorSlash); zStack = yzStack.copy(); zStack.set(0, yzSetArg.copyWithout(iaIndex)); zStack.slashesUnify(_argSlash); return zStack; } else { // e.g. s/(s/n) s/n/s if (yzStack.size() < 2) { throw new UnifyFailure(); } xyOuterOuter.unify(yzStackInner, sub); zStack = yzStack.subList(1).copy(); zStack.slashesUnify(_argSlash); xySlash = (Slash) xySlash.fill(sub); xySlash.unifyCheck(_functorSlash); return zStack; } } else if (xyOuterCC.arity() == 2) { // e.g. s\np/(s\np) s\np/(s\np)/n // nb: not dealing with set args ArgStack yzStack = yzCC.getArgStack(); if (!(xyOuterCC.getArg(0) instanceof BasicArg) || !(xyOuterCC.getArg(1) instanceof BasicArg) || yzStack.size() < 3) { throw new UnifyFailure(); } BasicArg xyOuterOuter1 = (BasicArg) xyOuterCC.getArg(0); BasicArg xyOuterOuter2 = (BasicArg) xyOuterCC.getArg(1); Arg yzStackInner1 = yzStack.get(0); Arg yzStackInner2 = yzStack.get(1); xyOuterOuter1.unify(yzStackInner1, sub); xyOuterOuter2.unify(yzStackInner2, sub); zStack = yzStack.subList(2).copy(); zStack.slashesUnify(_argSlash); xySlash = (Slash) xySlash.fill(sub); xySlash.unifyCheck(_functorSlash); return zStack; } else { // nb: not dealing with xyOuterCC arity > 2 throw new UnifyFailure(); } } } ================================================ FILE: src/opennlp/ccg/grammar/AbstractRule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.*; import java.io.Serializable; import java.util.*; import org.jdom.Element; /** * Implements some default behavior for Rule objects. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.18 $, $Date: 2009/12/21 03:27:18 $ */ public abstract class AbstractRule implements Rule, Serializable { private static final long serialVersionUID = 1L; /** The interned name of this rule. */ protected String _name; /** The rule group which contains this rule. */ protected RuleGroup _ruleGroup; /** Reusable list of head cats, one for each result. */ protected List _headCats = new ArrayList(); /** Returns an XML element representing the rule. */ abstract public Element toXml(); /** Applies the rule to the given input signs, adding to the given list of results. */ public void applyRule(Sign[] inputs, List results) { if (inputs.length != arity()) { // shouldn't happen throw new RuntimeException("Inputs must have length " + arity()); } Category[] cats = new Category[inputs.length]; for (int i=0; i < cats.length; i++) { cats[i] = inputs[i].getCategory(); } try { List resultCats = applyRule(cats); if (resultCats.isEmpty()) return; for (int i=0; i < resultCats.size(); i++) { Category catResult = resultCats.get(i); distributeTargetFeatures(catResult); Category headCat = _headCats.get(i); Sign lexHead = inputs[0].getLexHead(); for (int j=0; j < inputs.length; j++) { if (inputs[j].getCategory() == headCat) lexHead = inputs[j].getLexHead(); } Sign sign = Sign.createDerivedSign(catResult, inputs, this, lexHead); results.add(sign); } } catch (UnifyFailure uf) {} } /** Propagates distributive features from target cat to the rest. */ // nb: it would be nicer to combine inheritsFrom with $, but // this would be complicated, as inheritsFrom is compiled out protected void distributeTargetFeatures(Category cat) { if (_ruleGroup == null) return; if (_ruleGroup.grammar.lexicon.getDistributiveAttrs() == null) return; if (!(cat instanceof ComplexCat)) return; ComplexCat complexCat = (ComplexCat) cat; Category targetCat = (Category) complexCat.getTarget(); targetFS = (GFeatStruc) targetCat.getFeatureStructure(); if (targetFS == null) return; cat.forall(distributeTargetFeaturesFcn); } // target cat's feature structure private GFeatStruc targetFS = null; // copies ground distributive features from _targetFS to the rest private CategoryFcn distributeTargetFeaturesFcn = new DistributeTargetFeaturesFcn(); private class DistributeTargetFeaturesFcn extends CategoryFcnAdapter implements Serializable { private static final long serialVersionUID = 5247861522003485434L; public void forall(Category c) { if (!(c instanceof AtomCat)) return; FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; if (fs == targetFS) return; String[] distrAttrs = _ruleGroup.grammar.lexicon.getDistributiveAttrs(); for (int i = 0; i < distrAttrs.length; i++) { Object targetVal = targetFS.getValue(distrAttrs[i]); if (targetVal != null && !(targetVal instanceof Variable)) { fs.setFeature(distrAttrs[i], UnifyControl.copy(targetVal)); } } } } /** * The number of arguments this rule takes. For example, the arity of the * forward application rule of categorial grammar (X/Y Y => Y) is 2. * * @return the number of arguments this rule takes **/ public abstract int arity(); /** * Apply this rule to some input categories. 
* * @param inputs the input categories to try to combine * @return the categories resulting from using this rule to combine the * inputs * @exception UnifyFailure if the inputs cannot be combined by this rule **/ public abstract List applyRule(Category[] inputs) throws UnifyFailure; /** Prints an apply instance for the given categories to System.out. */ protected void showApplyInstance(Category[] inputs) { StringBuffer sb = new StringBuffer(); sb.append(_name).append(": "); for (int i=0; i < inputs.length; i++) { sb.append(inputs[i]).append(' '); } System.out.println(sb); } /** Prints an apply instance for the given categories to System.out. */ protected void showApplyInstance(Category first, Category second) { Category[] ca = {first,second}; showApplyInstance(ca); } /** * Returns the interned name of this rule. */ public String name() { return _name; } /** * Returns the rule group which contains this rule. */ public RuleGroup getRuleGroup() { return _ruleGroup; } /** * Sets this rule's rule group. */ public void setRuleGroup(RuleGroup ruleGroup) { _ruleGroup = ruleGroup; } /** Appends, fills, sorts and checks the LFs from cats 1 and 2 into the result cat. */ protected void appendLFs(Category cat1, Category cat2, Category result, Substitution sub) throws UnifyFailure { LF lf = HyloHelper.append(cat1.getLF(), cat2.getLF()); if (lf != null) { lf = (LF) lf.fill(sub); HyloHelper.sort(lf); HyloHelper.check(lf); } result.setLF(lf); } } ================================================ FILE: src/opennlp/ccg/grammar/AbstractSubstitutionRule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-3 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * Super class for substitution rules. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/12/21 03:27:18 $ */ public abstract class AbstractSubstitutionRule extends AbstractApplicationRule { private static final long serialVersionUID = 1L; protected boolean _isHarmonic; protected Slash _argSlash; /** Returns an XML element representing the rule. 
*/ public Element toXml(String dir) { Element retval = new Element("substitution"); retval.setAttribute("dir", dir); retval.setAttribute("harmonic", Boolean.toString(_isHarmonic)); return retval; } protected List apply (Category xyzCat, Category yzCat) throws UnifyFailure { if (xyzCat instanceof ComplexCat && yzCat instanceof ComplexCat) { ComplexCat xyzCC = (ComplexCat)xyzCat; ComplexCat yzCC = (ComplexCat)yzCat; if (xyzCC.arity() < 2 || xyzCC.containsDollarArg() || xyzCC.containsSetArg() || yzCC.containsSetArg() || yzCC.containsDollarArg()) { throw new UnifyFailure(); } ArgStack primaryStack = xyzCC.getArgStack(); int size = primaryStack.size(); BasicArg primaryArgY = (BasicArg)primaryStack.get(size-2); primaryArgY.unifySlash(_functorSlash); BasicArg primaryArgZ = (BasicArg)primaryStack.get(size-1); primaryArgZ.unifySlash(_argSlash); BasicArg secondaryArgZ = (BasicArg)yzCC.getOuterArg(); secondaryArgZ.unifySlash(_argSlash); Category secondaryY = yzCC.getResult(); GSubstitution sub = new GSubstitution(); GUnifier.unify(primaryArgZ.getCat(), secondaryArgZ.getCat(), sub); GUnifier.unify(primaryArgY.getCat(), secondaryY, sub); Category result = new ComplexCat(xyzCC.getTarget(), primaryStack.copyWithout(size-2)); ((GSubstitution)sub).condense(); result = (Category)result.fill(sub); ((ComplexCat)result).getOuterArg().setSlashModifier(false); appendLFs(xyzCat, yzCat, result, sub); List results = new ArrayList(1); _headCats.clear(); results.add(result); _headCats.add(primaryArgY.getSlash().isModifier() ? yzCat : xyzCat); return results; } else { throw new UnifyFailure(); } } } ================================================ FILE: src/opennlp/ccg/grammar/AbstractTypeRaisingRule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * Type-raising, e.g. np => s/(s\np). * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.9 $, $Date: 2009/12/21 03:27:18 $ */ // NB: It might make sense to eventually make this a subclass of TypeChangingRule, // but currently it's structured a bit differently. public abstract class AbstractTypeRaisingRule extends AbstractRule { private static final long serialVersionUID = 1L; /** The upper slash, eg the first slash in s/(s\np). */ protected Slash _upperSlash; /** The embedded slash, eg the second slash in s/(s\np). */ protected Slash _embeddedSlash; /** * The result of the type raised category, eg the "s" in s/(s\np). Defaults to "s". 
*/ protected Category _result; /** * The argument of the ComplexCat argument of the type raised category, * eg the "np" in s/(s\np). Defaults to "np". */ protected Category _arg; /** * Creates a new type raising rule with the given name; upper and lower slashes; * use dollar switch; arg category; and result category. Defaults are used * for the arg and result categories if null. */ protected AbstractTypeRaisingRule( String name, Slash uslash, Slash eslash, boolean useDollar, Category arg, Category result ) { _name = name; _upperSlash = uslash; _upperSlash.setAbility("active"); _upperSlash.setModifier(true); _embeddedSlash = eslash; _embeddedSlash.setAbility("active"); if (arg != null) { _arg = arg; } else { _arg = new AtomCat("np", new GFeatStruc()); } if (result != null) { _result = result; result.getFeatureStructure().setIndex(1); } else { GFeatStruc resfs = new GFeatStruc(); resfs.setIndex(1); _result = new AtomCat("s", resfs); } if (useDollar) { Dollar dol = new Dollar("1"); dol.setIndex(1); _result = new ComplexCat((AtomCat)_result, dol); } } /** Returns an XML element representing the rule. */ public Element toXml(String dir) { Element retval = new Element("typeraising"); retval.setAttribute("dir", dir); boolean usesDollar = (_result instanceof ComplexCat) && ((ComplexCat)_result).containsDollarArg(); retval.setAttribute("useDollar", Boolean.toString(usesDollar)); if (!(_arg instanceof AtomCat) || !((AtomCat)_arg).getType().equals("np")) { Element argElt = new Element("arg"); retval.addContent(argElt); argElt.addContent(_arg.toXml()); } if (!((AtomCat)_result.getTarget()).getType().equals("s")) { Element resultElt = new Element("result"); retval.addContent(resultElt); resultElt.addContent(_result.getTarget().toXml()); } return retval; } /** Returns 1. */ public int arity() { return 1; } /** Applies this rule to the given inputs. */ public List applyRule(Category[] inputs) throws UnifyFailure { if (inputs.length != 1) { throw new UnifyFailure(); } return apply(inputs[0]); } /** Applies this rule to the given input. 
*/ protected List apply(Category input) throws UnifyFailure { Substitution sub = new GSubstitution(); Category arg = (Category)_arg.unify(input, sub); ((GSubstitution)sub).condense(); Category result = _result.copy(); ComplexCat range; UnifyControl.reindex(result); if (result instanceof ComplexCat) { range = (ComplexCat)result.copy(); range.add(new BasicArg(_embeddedSlash, arg)); ((ComplexCat)result).add(new BasicArg(_upperSlash, range)); } else { range = new ComplexCat((TargetCat)result.copy(), new BasicArg(_embeddedSlash, arg)); result = new ComplexCat((TargetCat)result.copy(), new BasicArg(_upperSlash, range)); } // nb: with defined type changing rules, this step is done when the // rule is created; with type raising, it is done here, so that // the arg need not have its distributive features yet, and since // the full result category doesn't exist beforehand _ruleGroup.grammar.lexicon.propagateDistributiveAttrs(result); LF inputLF = input.getLF(); if (inputLF != null) { result.setLF((LF)inputLF.copy()); } List results = new ArrayList(1); _headCats.clear(); results.add(result); _headCats.add(input); return results; } } ================================================ FILE: src/opennlp/ccg/grammar/BackwardApplication.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * Forward application: X/Y Y => X * * @author Jason Baldridge * @version $Revision: 1.3 $, $Date: 2009/07/17 04:23:30 $ */ public class BackwardApplication extends AbstractApplicationRule { private static final long serialVersionUID = 6981288425455531650L; public BackwardApplication () { _name = "<"; _functorSlash = new Slash('\\'); _functorSlash.setAbility("active"); } /** Returns an XML element representing the rule. 
 */
    public Element toXml() { return super.toXml("backward"); }

    public List<Category> applyRule(Category[] inputs) throws UnifyFailure {
        if (inputs.length != 2) {
            throw new UnifyFailure();
        }
        return apply(inputs[1], inputs[0]);
    }

    public String toString() {
        return "Y X\\Y => X";
    }
}


================================================
FILE: src/opennlp/ccg/grammar/BackwardComposition.java
================================================
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2002 Jason Baldridge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////

package opennlp.ccg.grammar;

import opennlp.ccg.unify.*;
import opennlp.ccg.synsem.*;
import java.util.*;
import org.jdom.Element;

/**
 * Backward composition, e.g. Y\Z X\Y => X\Z
 *
 * @author Jason Baldridge
 * @version $Revision: 1.3 $, $Date: 2009/07/17 04:23:30 $
 */
public class BackwardComposition extends AbstractCompositionRule {

    private static final long serialVersionUID = -937944882697380690L;

    public BackwardComposition() {
        this(true);
    }

    public BackwardComposition(boolean isHarmonic) {
        _isHarmonic = isHarmonic;
        if (isHarmonic) {
            _name = "<B";
            _functorSlash = new Slash('\\', "^");
            _argSlash = new Slash('\\', "^");
        } else {
            _name = "<Bx";
            _functorSlash = new Slash('\\', "x");
            _argSlash = new Slash('/', "x");
        }
        _functorSlash.setAbility("active");
    }

    /** Returns an XML element representing the rule. */
    public Element toXml() { return super.toXml("backward"); }

    public List<Category> applyRule(Category[] inputs) throws UnifyFailure {
        if (inputs.length != 2) {
            throw new UnifyFailure();
        }
        return apply(inputs[1], inputs[0]);
    }

    public String toString() {
        StringBuffer sb = new StringBuffer();
        sb.append("Y").append(_argSlash.toString()).append("Z ").append(
            "X\\Y => X").append(_argSlash.toString()).append("Z");
        return sb.toString();
    }
}


================================================
FILE: src/opennlp/ccg/grammar/BackwardSubstitution.java
================================================
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2002 Jason Baldridge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////

package opennlp.ccg.grammar;

import opennlp.ccg.unify.*;
import opennlp.ccg.synsem.*;
import java.util.*;
import org.jdom.Element;

/**
 * Backward substitution, e.g. Y\Z X\Y\Z => X\Z
 *
 * @author Jason Baldridge
 * @version $Revision: 1.3 $, $Date: 2009/07/17 04:23:30 $
 */
public class BackwardSubstitution extends AbstractSubstitutionRule {

    private static final long serialVersionUID = -4597839433754132265L;

    public BackwardSubstitution() {
        this(true);
    }

    public BackwardSubstitution(boolean isHarmonic) {
        _isHarmonic = isHarmonic;
        if (isHarmonic) {
            _name = "<S";
            _functorSlash = new Slash('\\', "^");
            _argSlash = new Slash('\\', "^");
        } else {
            _name = "<Sx";
            _functorSlash = new Slash('\\', "x");
            _argSlash = new Slash('/', "x");
        }
        _functorSlash.setAbility("active");
    }

    /** Returns an XML element representing the rule. */
    public Element toXml() { return super.toXml("backward"); }

    public List<Category> applyRule(Category[] inputs) throws UnifyFailure {
        if (inputs.length != 2) {
            throw new UnifyFailure();
        }
        return apply(inputs[1], inputs[0]);
    }

    public String toString() {
        StringBuffer sb = new StringBuffer();
        sb.append("Y").append(_argSlash.toString()).append("Z ").append(
            "X\\Y => X").append(_argSlash.toString()).append("Z");
        return sb.toString();
    }
}


================================================
FILE: src/opennlp/ccg/grammar/BackwardTypeRaising.java
================================================
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White)
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////

package opennlp.ccg.grammar;

import org.jdom.Element;
import opennlp.ccg.synsem.*;

/**
 * Backward type-raising: X => Y\(Y/X).
 *
 * @author Jason Baldridge
 * @author Michael White
 * @version $Revision: 1.5 $, $Date: 2009/07/17 04:23:30 $
 */
public class BackwardTypeRaising extends AbstractTypeRaisingRule {

    private static final long serialVersionUID = 4334502669369098203L;

    /** Creates a backward type raising rule with the given parameters. */
    public BackwardTypeRaising(boolean useDollar, Category arg, Category result) {
        super("<T", new Slash('\\', new VarModality("i")),
              new Slash('/', new VarModality("i")), useDollar, arg, result);
    }

    /** Returns an XML element representing the rule. */
    public Element toXml() { return super.toXml("backward"); }

    public String toString() {
        return "X => Y\\(Y/X)";
    }
}


================================================
FILE: src/opennlp/ccg/grammar/ForwardApplication.java
================================================
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2002 Jason Baldridge
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * Forward application: X/Y Y => X * * @author Jason Baldridge * @version $Revision: 1.3 $, $Date: 2009/07/17 04:23:30 $ */ public class ForwardApplication extends AbstractApplicationRule { private static final long serialVersionUID = 1336124476870410093L; public ForwardApplication() { _name = ">"; _functorSlash = new Slash('/'); _functorSlash.setAbility("active"); } /** Returns an XML element representing the rule. */ public Element toXml() { return super.toXml("forward"); } public List applyRule(Category[] inputs) throws UnifyFailure { if (inputs.length != 2) { throw new UnifyFailure(); } return apply(inputs[0], inputs[1]); } public String toString() { return "X/Y Y => X"; } } ================================================ FILE: src/opennlp/ccg/grammar/ForwardComposition.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * Forward composition, e.g. X/Y Y/Z => X/Z * * @author Jason Baldridge * @version $Revision: 1.3 $, $Date: 2009/07/17 04:23:30 $ */ public class ForwardComposition extends AbstractCompositionRule { private static final long serialVersionUID = -5029901211362928251L; public ForwardComposition() { this(true); } public ForwardComposition(boolean isHarmonic) { _isHarmonic = isHarmonic; if (isHarmonic) { _name = ">B"; _functorSlash = new Slash('/', "^"); _argSlash = new Slash('/', "^"); } else { _name = ">Bx"; _functorSlash = new Slash('/', "x"); _argSlash = new Slash('\\', "x"); } _functorSlash.setAbility("active"); } /** Returns an XML element representing the rule. 
*/ public Element toXml() { return super.toXml("forward"); } public List applyRule(Category[] inputs) throws UnifyFailure { if (inputs.length != 2) { throw new UnifyFailure(); } return apply(inputs[0], inputs[1]); } public String toString() { StringBuffer sb = new StringBuffer(); sb.append("X").append(_functorSlash.toString()).append("Y Y").append( _argSlash.toString()).append("Z => X").append( _argSlash.toString()).append("Z"); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/grammar/ForwardSubstitution.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * Forward substitution, e.g. X/Y/Z Y/Z => X/Z * * @author Jason Baldridge * @version $Revision: 1.3 $, $Date: 2009/07/17 04:23:30 $ */ public class ForwardSubstitution extends AbstractSubstitutionRule { private static final long serialVersionUID = 7324585108055853456L; public ForwardSubstitution() { this(true); } public ForwardSubstitution(boolean isHarmonic) { _isHarmonic = isHarmonic; if (isHarmonic) { _name = ">S"; _functorSlash = new Slash('/', "^"); _argSlash = new Slash('/', "^"); } else { _name = ">Sx"; _functorSlash = new Slash('/', "x"); _argSlash = new Slash('\\', "x"); } _functorSlash.setAbility("active"); } /** Returns an XML element representing the rule. */ public Element toXml() { return super.toXml("forward"); } public List applyRule(Category[] inputs) throws UnifyFailure { if (inputs.length != 2) { throw new UnifyFailure(); } return apply(inputs[0], inputs[1]); } public String toString() { StringBuffer sb = new StringBuffer(); sb.append("X").append(_functorSlash.toString()).append("Y Y").append( _argSlash.toString()).append("Z => X").append( _argSlash.toString()).append("Z"); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/grammar/ForwardTypeRaising.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import org.jdom.Element; import opennlp.ccg.synsem.*; /** * Forward type-raising: X => Y/(Y\X). * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/07/17 04:23:30 $ */ public class ForwardTypeRaising extends AbstractTypeRaisingRule { private static final long serialVersionUID = 1417585756957436141L; /** Creates a forward type raising rule with the given parameters. */ public ForwardTypeRaising (boolean useDollar, Category arg, Category result) { super(">T", new Slash('/', new VarModality("i")), new Slash('\\', new VarModality("i")), useDollar, arg, result); } /** Returns an XML element representing the rule. */ public Element toXml() { return super.toXml("forward"); } public String toString() { return "X => Y/(Y\\X)"; } } ================================================ FILE: src/opennlp/ccg/grammar/FragmentJoining.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2007 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import java.util.*; import org.jdom.Element; /** * Rule for joining fragments that don't fit together, when all else fails. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/07/17 04:23:30 $ */ public class FragmentJoining extends AbstractRule { private static final long serialVersionUID = 7451163798607652012L; /** Constructor. */ public FragmentJoining() { _name = "*"; } /** Returns an XML element representing the rule (not supported). */ public Element toXml() { throw new RuntimeException("toXml not supported for FragmentJoining rules"); } /** * Returns the result of applying this rule to two input signs. */ public Sign applyRule(Sign sign1, Sign sign2) { List results = new ArrayList(1); Sign[] inputs = new Sign[] { sign1, sign2 }; applyRule(inputs, results); return results.get(0); } /** * Apply this rule to two input categories. * Returns a copy of the first cat with the LFs appended. 
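The classes above cover the standard CCG combinators: application, composition, substitution and type raising. As a quick orientation aid, here is a small hypothetical driver (not part of the library) that instantiates each rule and prints the interned name and schema exposed through name() and toString():

import opennlp.ccg.grammar.*;

// Hypothetical driver: list the combinatory rules defined above with their schemas.
public class RuleInventory {
    public static void main(String[] args) {
        Rule[] rules = {
            new ForwardApplication(),                   // >   : X/Y Y => X
            new BackwardApplication(),                  // <   : Y X\Y => X
            new ForwardComposition(true),               // >B  : harmonic composition
            new ForwardComposition(false),              // >Bx : crossed composition
            new BackwardComposition(true),              // <B
            new BackwardComposition(false),             // <Bx
            new ForwardSubstitution(true),              // >S
            new BackwardSubstitution(true),             // <S
            new ForwardTypeRaising(false, null, null),  // >T  : X => Y/(Y\X)
            new BackwardTypeRaising(false, null, null)  // <T  : X => Y\(Y/X)
        };
        for (Rule r : rules) System.out.println(r.name() + "\t" + r);
    }
}

Passing null for the type-raising arg and result categories falls back to the np and s defaults set up in AbstractTypeRaisingRule.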
**/ public List applyRule(Category[] inputs) throws UnifyFailure { if (inputs.length != 2) { throw new UnifyFailure(); } List results = new ArrayList(1); _headCats.clear(); Category result = inputs[0].shallowCopy(); try { appendLFs(inputs[0], inputs[1], result, new EmptySubstitution()); } catch (UnifyFailure uf) { // not expected // System.err.println("Unexpected unify failure in appending LFs when joining fragments:"); // System.err.println("cat0: " + inputs[0] + " lf: " + inputs[0].getLF()); // System.err.println("cat1: " + inputs[1] + " lf: " + inputs[1].getLF()); } results.add(result); _headCats.add(inputs[0]); return results; } /** * The number of arguments this rule takes. **/ public int arity() { return 2; } /** Returns a string for this rule. */ public String toString() { return "X Y *=> X"; } } ================================================ FILE: src/opennlp/ccg/grammar/GlueRule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import java.util.*; import org.jdom.Element; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; /** * Implements a glue rule for combining a sequence of fragments. * The rule is frag|cat cat => frag, allowing only the first input * to itself be a fragment, unless the second input has the * frag completion flag set, meaning that it completes a chunk/alt. * * @author Michael White * @version $Revision: 1.3 $, $Date: 2011/06/07 05:12:01 $ */ public class GlueRule extends AbstractRule { private static final long serialVersionUID = 4867141181941895272L; // empty subst for combining LFs private static final Substitution emptySubst = new SimpleSubstitution(); /** Fragment result type. */ public static final String resultType = "frag"; /** Constructor. */ public GlueRule() { _name = "glue"; } /** Returns an XML element representing the rule (not supported). */ public Element toXml() { throw new RuntimeException("toXml not supported for GlueRule rules"); } /** Arity. */ public int arity() { return 2; } /** Glues cats into fragments. 
*/ public List applyRule(Category[] inputs) throws UnifyFailure { // check num inputs if (inputs.length != 2) { throw new UnifyFailure(); } // check for frag as second input with completion false if (inputs[1] instanceof AtomCat) { AtomCat ac2 = (AtomCat) inputs[1]; if (ac2.isFragment() && !ac2.fragCompletion) throw new UnifyFailure(); } // make result cat List results = new ArrayList(1); _headCats.clear(); AtomCat ac = new AtomCat(resultType); appendLFs(inputs[0], inputs[1], ac, emptySubst); results.add(ac); // guess head, with left as default boolean leftHead = true; boolean leftMod = isModifier(inputs[0]); boolean rightMod = isModifier(inputs[1]); if ((inputs[0] instanceof AtomCat && inputs[1] instanceof ComplexCat && !rightMod) || (leftMod && !rightMod)) { leftHead = false; } // return result cat with guessed head _headCats.add(leftHead ? inputs[0] : inputs[1]); return results; } // modifier check private static boolean isModifier(Category cat) { if (cat instanceof ComplexCat) { ComplexCat xyCat = (ComplexCat) cat; Arg arg = xyCat.getOuterArg(); if (arg instanceof BasicArg) { return ((BasicArg)arg).getSlash().isModifier(); } } return false; } /** toString. */ public String toString() { return "frag|cat cat => frag"; } } ================================================ FILE: src/opennlp/ccg/grammar/Grammar.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 University of Edinburgh (Michael White) and Gunes Erkan // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.lexicon.*; import opennlp.ccg.util.*; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.*; import opennlp.ccg.parse.Parser; import opennlp.ccg.parse.ParseException; import opennlp.ccg.realize.Realizer; import org.jdom.*; import org.jdom.input.*; import org.jdom.output.*; import org.jdom.transform.*; import org.xml.sax.*; import javax.xml.parsers.*; import javax.xml.transform.*; import javax.xml.transform.stream.*; import javax.xml.transform.sax.*; import java.io.*; import java.net.URL; import java.util.*; /** * A CCG grammar is essentially a lexicon plus a rule group. * A grammar may also have sequences of transformations to use in * loading/saving LFs from/to XML. * * @author Michael White * @author Gunes Erkan * @version $Revision: 1.45 $, $Date: 2010/12/06 02:39:35 $ */ public class Grammar { /** The lexicon. */ public final Lexicon lexicon; /** The rule group. */ public final RuleGroup rules; /** The type hierarchy. */ public final Types types; /** The features to include in supertags. */ public final Set supertagFeatures = new HashSet(); /** The sequence of transformations to use when loading LFs from XML. 
*/ public final URL[] fromXmlTransforms; /** The sequence of transformations to use when saving LFs to XML. */ public final URL[] toXmlTransforms; /** Preferences for displaying elements in this grammar. */ public DisplayPrefs prefs = new DisplayPrefs(); /** For access to the current grammar; should be generalized eventually. */ public static Grammar theGrammar; // name of the grammar private String grammarName = null; // parser, for getting parsed words private Parser parser = null; // XML factories private SAXParserFactory spf = null; private static SAXTransformerFactory stf = null; // transformer for loading/saving LFs from/to XML private Transformer transformer = null; // transformations for loading/saving LFs from/to XML private Templates[] fromXmlTemplates = null; private Templates[] toXmlTemplates = null; // transformer for saving strings to APML private Transformer apmlTransformer = null; /** The pitch accents recognized as underscored suffixes for translation to APML. */ public static final String[] pitchAccents = { "H*", "L*", "L+H*", "L*+H", "H*+L", "H+L*" }; // set of pitch accents private static Set pitchAccentsSet = null; /** The boundary tones recognized as separate tokens for translation to APML. */ public static final String[] boundaryTones = { "L", "H", "LL%", "HH%", "LH%", "HL%" }; // set of boundary tones private static Set boundaryTonesSet = null; /** Loads a grammar from the given filename. */ public Grammar(String filename) throws IOException { this(new File(filename).toURI().toURL()); } /** Loads a grammar from the given URL. */ public Grammar(URL url) throws IOException { this(url, false); } /** Loads a grammar from the given URL, with the given flag for whether to ignore rule combos. */ @SuppressWarnings("unchecked") public Grammar(URL url, boolean ignoreCombos) throws IOException { theGrammar = this; // read XML SAXBuilder builder = new SAXBuilder(); Document doc; try { doc = builder.build(url); } catch (JDOMException jde) { throw (IOException) new IOException().initCause(jde); } Element root = doc.getRootElement(); // root corresponds to grammarName = root.getAttributeValue("name"); Element supertagsElt = root.getChild("supertags"); if (supertagsElt != null) { String feats = supertagsElt.getAttributeValue("feats"); if (feats != null) { String[] names = feats.split("\\s+"); for (int i = 0; i < names.length; i++) { supertagFeatures.add(names[i]); } } } if (supertagFeatures.isEmpty()) { // default is "form" and "lex" supertagFeatures.add("form"); supertagFeatures.add("lex"); } Tokenizer tokenizer = null; Element tokenizerElt = root.getChild("tokenizer"); if (tokenizerElt != null) { String tokenizerClass = tokenizerElt.getAttributeValue("classname"); if (tokenizerClass != null) { try { tokenizer = (Tokenizer) Class.forName(tokenizerClass).newInstance(); } catch (Exception exc) { throw (IOException) new IOException().initCause(exc); } } else tokenizer = new DefaultTokenizer(); String replacementSemClasses = tokenizerElt.getAttributeValue("replacement-sem-classes"); if (replacementSemClasses != null) { String[] semClasses = replacementSemClasses.split("\\s+"); for (int i = 0; i < semClasses.length; i++) { tokenizer.addReplacementSemClass(semClasses[i]); } } } Element typesElt = root.getChild("types"); URL typesUrl; if (typesElt != null) { typesUrl = new URL(url, typesElt.getAttributeValue("file")); } else typesUrl = null; Element lexiconElt = root.getChild("lexicon"); boolean openlex = "true".equals(lexiconElt.getAttributeValue("openlex")); URL lexiconUrl = new 
URL(url, lexiconElt.getAttributeValue("file")); Element morphElt = root.getChild("morphology"); URL morphUrl = new URL(url, morphElt.getAttributeValue("file")); Element rulesElt = root.getChild("rules"); URL rulesUrl = new URL(url, rulesElt.getAttributeValue("file")); Element fromXmlElt = root.getChild("LF-from-XML"); if (fromXmlElt != null) { List children = fromXmlElt.getChildren(); fromXmlTransforms = new URL[children.size()]; for (int i = 0; i < children.size(); i++) { Element transformElt = (Element) children.get(i); fromXmlTransforms[i] = new URL(url, transformElt.getAttributeValue("file")); } } else { fromXmlTransforms = new URL[0]; } Element toXmlElt = root.getChild("LF-to-XML"); if (toXmlElt != null) { List children = toXmlElt.getChildren(); toXmlTransforms = new URL[children.size()]; for (int i = 0; i < children.size(); i++) { Element transformElt = (Element) children.get(i); toXmlTransforms[i] = new URL(url, transformElt.getAttributeValue("file")); } } else { toXmlTransforms = new URL[0]; } // load type hierarchy, lexicon and rules if (typesUrl != null) types = new Types(typesUrl, this); else types = new Types(this); if (tokenizer != null) lexicon = new Lexicon(this, tokenizer); else lexicon = new Lexicon(this); lexicon.openlex = openlex; lexicon.init(lexiconUrl, morphUrl); rules = new RuleGroup(rulesUrl, this); // add observed supertag-rule combos for filtering, if any, unless ignoring combos if (!ignoreCombos) { String combosfile = rulesElt.getAttributeValue("combosfile"); if (combosfile != null) { URL combosUrl = new URL(url, combosfile); rules.loadSupercatRuleCombos(combosUrl); } // set dynamic combos: defaults to true with a combosfile, otherwise defaults to false boolean dynamic = (combosfile != null); String dynamicCombos = rulesElt.getAttributeValue("dynamic-combos"); if (dynamicCombos != null) dynamic = Boolean.parseBoolean(dynamicCombos); rules.setDynamicCombos(dynamic); } } /** * Returns a file url string relative to the user's current directory * for the given filename. 
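The constructor above resolves every component file relative to the URL of the top-level grammar file. For reference, a hypothetical top-level file using the element and attribute names read here might look as follows; the root element name, the <transform> child name, and all file names are illustrative assumptions rather than something fixed by this code:

<grammar name="sample">
  <supertags feats="form lex"/>
  <lexicon file="lexicon.xml" openlex="false"/>
  <morphology file="morph.xml"/>
  <types file="types.xml"/>
  <rules file="rules.xml" combosfile="combos.txt" dynamic-combos="true"/>
  <LF-from-XML> <transform file="from-xml.xsl"/> </LF-from-XML>
  <LF-to-XML> <transform file="to-xml.xsl"/> </LF-to-XML>
</grammar>

Such a file would then be loaded with new Grammar("path/to/grammar.xml") or new Grammar(url).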
*/ public static String convertToFileUrl(String filename) { try { return new File(filename).toURI().toURL().toString(); } catch (java.net.MalformedURLException exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } // return "file:"+System.getProperty("user.dir")+"/"+filename; } // initializes factories and transformers private void initializeTransformers() throws TransformerConfigurationException { // init factories if (spf == null) { spf = SAXParserFactory.newInstance(); spf.setNamespaceAware(true); } if (stf == null) { stf = (SAXTransformerFactory) TransformerFactory.newInstance(); try { // try setting indent at factory level stf.setAttribute("indent-number", new Integer(2)); } catch (IllegalArgumentException exc) {} // ignore } // set up transformer with indenting // nb: with some JVMs (eg JDK 1.4.1 on Windows), // the transformer needs to be reinitialized each time, in order to // run multiple :r FN commands in tccg if (transformer == null) { transformer = stf.newTransformer(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); try { // also try setting indent as a xalan property transformer.setOutputProperty("{http://xml.apache.org/xalan}indent-amount", "2"); } catch (IllegalArgumentException exc) {} // ignore } // set up apml transformer if (apmlTransformer == null) { InputStream toApmlStr = ClassLoader.getSystemResourceAsStream("opennlp/ccg/grammar/to-apml.xsl"); apmlTransformer = stf.newTransformer(new StreamSource(toApmlStr)); // nb: DOCTYPE SYSTEM also specified in to-apml.xsl; including // redundant specification here to workaround omission of DOCTYPE with Linux 1.5 JVM apmlTransformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "apml.dtd"); } } // does setup for LF from XML transformation, and returns a SAXSource for the given input stream // nb: need a new filter chain one for each use (perhaps due to an underyling bug) private SAXSource fromXmlSetup(InputStream istream) throws IOException { try { // initialize transformer initializeTransformers(); // load transformations if (fromXmlTemplates == null) { fromXmlTemplates = new Templates[fromXmlTransforms.length]; for (int i = 0; i < fromXmlTemplates.length; i++) { String url = fromXmlTransforms[i].toString(); fromXmlTemplates[i] = stf.newTemplates(new StreamSource(url)); } } // set up initial reader SAXParser parser = spf.newSAXParser(); XMLReader reader = parser.getXMLReader(); // set up chain of filters XMLFilter[] filters = new XMLFilter[fromXmlTransforms.length]; for (int i = 0; i < filters.length; i++) { // create filter filters[i] = stf.newXMLFilter(fromXmlTemplates[i]); // set parent if (i == 0) { filters[0].setParent(reader); } else { filters[i].setParent(filters[i-1]); } } // set final reader/filter XMLReader finalReader = (filters.length == 0) ? reader : filters[filters.length-1]; // set up and return LF from XML SAX source with final reader/filter return new SAXSource(finalReader, new InputSource(istream)); } catch (ParserConfigurationException pce) { throw (IOException) new IOException().initCause(pce); } catch (SAXException se) { throw (IOException) new IOException().initCause(se); } catch (TransformerConfigurationException tce) { throw (IOException) new IOException().initCause(tce); } } /** * Loads a document from the XML in the given input stream, * applying the configured from-XML transformations. 
*/ public synchronized Document loadFromXml(InputStream istream) throws IOException { try { // do setup and get source Source source = fromXmlSetup(istream); // do transformation JDOMResult result = new JDOMResult(); transformer.transform(source, result); // return result doc return result.getDocument(); } catch (TransformerException exc) { throw (IOException) new IOException().initCause(exc); } } /** * Loads a document from the XML file with the given filename, * applying the configured from-XML transformations. */ public synchronized Document loadFromXml(String filename) throws IOException { BufferedInputStream bis = new BufferedInputStream(new FileInputStream(filename)); Document retval = loadFromXml(bis); bis.close(); return retval; } // does setup for LF to XML transformation, and returns a SAXSource for the given source // nb: need a new filter chain one for each use (perhaps due to an underyling bug) private SAXSource toXmlSetup(Source source) throws IOException { try { // initialize transformer initializeTransformers(); // load transformations if (toXmlTemplates == null) { toXmlTemplates = new Templates[toXmlTransforms.length]; for (int i = 0; i < toXmlTemplates.length; i++) { // File file = new File(toXmlTransforms[i]); // toXmlTemplates[i] = stf.newTemplates(new StreamSource(file)); String url = toXmlTransforms[i].toString(); toXmlTemplates[i] = stf.newTemplates(new StreamSource(url)); } } // set up initial reader SAXParser parser = spf.newSAXParser(); XMLReader reader = parser.getXMLReader(); // set up chain of filters XMLFilter[] filters = new XMLFilter[toXmlTransforms.length]; for (int i = 0; i < filters.length; i++) { // create filter filters[i] = stf.newXMLFilter(toXmlTemplates[i]); // set parent if (i == 0) { filters[0].setParent(reader); } else { filters[i].setParent(filters[i-1]); } } // set final reader/filter XMLReader finalReader = (filters.length == 0) ? reader : filters[filters.length-1]; // set up and return LF to XML SAX source with final reader/filter return new SAXSource(finalReader, SAXSource.sourceToInputSource(source)); } catch (ParserConfigurationException pce) { throw (IOException) new IOException().initCause(pce); } catch (SAXException se) { throw (IOException) new IOException().initCause(se); } catch (TransformerConfigurationException tce) { throw (IOException) new IOException().initCause(tce); } } /** * Saves the given LF with the given target string to an XML file * with the given filename, applying the configured to-XML * transformations. */ public synchronized void saveToXml(LF lf, String target, String filename) throws IOException { // ensure dirs exist for filename File file = new File(filename); File parent = file.getParentFile(); if (parent != null && !parent.exists()) { parent.mkdirs(); } FileOutputStream out = new FileOutputStream(file); saveToXml(lf, target, out); out.close(); } /** * Saves the given LF with the given target string as XML to the * given output stream, applying the configured to-XML * transformations. 
*/ public synchronized void saveToXml(LF lf, String target, OutputStream out) throws IOException { // make doc with XML for LF and target Document doc = new Document(); Element root = new Element("xml"); doc.setRootElement(root); root.addContent(HyloHelper.toXml(lf)); Element targetElt = new Element("target"); targetElt.addContent(target); root.addContent(targetElt); // write transformed doc to file try { // do setup and get source Source source = toXmlSetup(new JDOMSource(doc)); // do transformation transformer.transform(source, new StreamResult(new OutputStreamWriter(out))); } catch (TransformerException exc) { throw (IOException) new IOException().initCause(exc); } } /** * Transforms an LF by applying the configured to-XML and from-XML transformations, * then loading the LF from the resulting doc. */ public synchronized LF transformLF(LF lf) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); saveToXml(lf, "", out); ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); Document doc = loadFromXml(in); return Realizer.getLfFromDoc(doc); } /** * Loads an LF by applying the configured from-XML transformations, * then loading the LF from the resulting doc. */ public synchronized LF loadLF(Document doc) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); serializeXml(doc, out); ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); Document doc2 = loadFromXml(in); return Realizer.getLfFromDoc(doc2); } /** * Convenience method to serialize XML. */ public synchronized void serializeXml(Document doc, OutputStream out) throws IOException { try { initializeTransformers(); JDOMResult result = new JDOMResult(); // as suggested by Amy Isard, for better java/xml version compatibility transformer.transform(new JDOMSource(doc), result); XMLOutputter outputter = new XMLOutputter(); outputter.setFormat(Format.getPrettyFormat()); outputter.output(result.getDocument(), new OutputStreamWriter(out)); // end of A.I. suggestion } catch (TransformerException exc) { throw (IOException) new IOException().initCause(exc); } } /** * Makes an element for the given LF, applying the configured to-XML transformations. */ public synchronized Element makeLfElt(LF lf) throws IOException { // make doc with LF in it Document lfDoc = new Document(); lfDoc.setRootElement(HyloHelper.toXml(lf)); // apply to-XML transformations try { // do setup and get source Source source = toXmlSetup(new JDOMSource(lfDoc)); // do transformation and get resulting doc JDOMResult result = new JDOMResult(); transformer.transform(source, result); lfDoc = result.getDocument(); } catch (TransformerException exc) { throw (IOException) new IOException().initCause(exc); } return lfDoc.detachRootElement(); } /** * Returns whether the given string is a recognized pitch accent. */ public static boolean isPitchAccent(String s) { if (pitchAccentsSet == null) { pitchAccentsSet = new HashSet(); for (int i = 0; i < pitchAccents.length; i++) { pitchAccentsSet.add(pitchAccents[i]); } } return pitchAccentsSet.contains(s); } /** * Returns whether the given string is a recognized boundary tone. */ public static boolean isBoundaryTone(String s) { if (boundaryTonesSet == null) { boundaryTonesSet = new HashSet(); for (int i = 0; i < boundaryTones.length; i++) { boundaryTonesSet.add(boundaryTones[i]); } } return boundaryTonesSet.contains(s); } /** * Saves the given sign's words, pitch accents and boundary tones * to an APML file with the given filename. 
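The methods above form the LF/XML plumbing: loadFromXml applies the LF-from-XML transform chain, saveToXml applies the LF-to-XML chain, and transformLF/loadLF compose the two. A minimal, untested sketch of a round trip, with the file names and the target sentence invented for illustration:

import org.jdom.Document;
import opennlp.ccg.grammar.Grammar;
import opennlp.ccg.synsem.*;     // LF (wildcard imports, mirroring Grammar's own imports)
import opennlp.ccg.hylo.*;
import opennlp.ccg.realize.Realizer;

// Hypothetical sketch: round-trip an LF through the configured XML transforms.
public class LfRoundTrip {
    public static void main(String[] args) throws java.io.IOException {
        Grammar grammar = new Grammar("grammar.xml");               // assumed grammar file
        Document doc = grammar.loadFromXml("item.xml");             // applies LF-from-XML transforms
        LF lf = Realizer.getLfFromDoc(doc);                         // as done in transformLF()/loadLF() above
        grammar.saveToXml(lf, "the dog barks", "out/item-lf.xml");  // applies LF-to-XML transforms
    }
}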
*/ public synchronized void saveToApml(Sign sign, String filename) throws IOException { // ensure dirs exist for filename File file = new File(filename); File parent = file.getParentFile(); if (parent != null && !parent.exists()) { parent.mkdirs(); } // do transformation FileWriter fw = new FileWriter(file); saveToApml(sign, fw); fw.close(); } /** * Saves the given sign's words, pitch accents and boundary tones * as APML to the given writer. * The orthography is first converted to XML using Sign.getWordsInXml, * and then converted to APML using opennlp/ccg/grammar/to-apml.xsl. * The string is assumed to be a single performative. */ public synchronized void saveToApml(Sign sign, Writer writer) throws IOException { // convert words Document doc = sign.getWordsInXml(); // write transformed doc to file try { // do setup and get source initializeTransformers(); Source source = new JDOMSource(doc); // do transformation apmlTransformer.transform(source, new StreamResult(writer)); } catch (TransformerException exc) { throw (IOException) new IOException().initCause(exc); } } /** * Returns the words for the given string, as determined by its * first parse, or an empty list, if it cannot be parsed. */ // NB: Could try to extend this to find the parse with the intended LF. public List getParsedWords(String s) { // ensure parser instantiated if (parser == null) parser = new Parser(this); // get parses try { parser.parse(s); } catch (ParseException pe) { return new ArrayList(0); } List parses = parser.getResult(); // return words of first parse Sign sign = parses.get(0); return sign.getWords(); } /** * Returns the name of the loaded grammar. Null if no name given. */ public final String getName() { return grammarName; } /** * Writes the list of words to a basic morph file. * @throws IOException */ public void toMorphXml(List words, String filename) throws IOException { Collections.sort(words); XMLOutputter xout = new XMLOutputter(); xout.setFormat(Format.getPrettyFormat()); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename))); out.println(""); out.println(""); for (Word w : words) { Element e = new Element("entry"); e.setAttribute("word", w.getForm()); if (w.getForm() != w.getStem() && w.getStem() != null) e.setAttribute("stem", w.getStem()); if (w.getPOS() != null) e.setAttribute("pos", w.getPOS()); if (w.getSemClass() != null) e.setAttribute("class", w.getSemClass()); xout.output(e, out); out.println(); } out.println(""); out.flush(); out.close(); } /** * Writes the list of categories and associated POS tags to a basic lexicon file. * Note that the LFs are expected to have a [*DEFAULT*] proposition in the * desired location for predicate insertion. 
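toMorphXml above offers a quick way to bootstrap a morphology file from a word list. A hypothetical sketch that dumps the words of a single parsed sentence; the grammar path, sentence and output file are invented, and the words are copied into a fresh ArrayList because toMorphXml sorts the list in place:

import java.util.*;
import opennlp.ccg.grammar.Grammar;
import opennlp.ccg.lexicon.*;   // Word (wildcard import, mirroring Grammar's own imports)

// Hypothetical sketch: write the words of one parsed sentence to a morph XML file.
public class MorphDump {
    public static void main(String[] args) throws java.io.IOException {
        Grammar grammar = new Grammar("grammar.xml");
        List<Word> words = new ArrayList<Word>(grammar.getParsedWords("the dog barks"));
        grammar.toMorphXml(words, "out/morph.xml");   // one <entry .../> element per word
    }
}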
* @throws IOException */ public void toLexiconXml(List cats, List POSs, String filename) throws IOException { // create map from supertags with unique suffixes to cat/pos pairs Map> stagMap = new HashMap>(); for (int i=0; i < cats.size(); i++) { Category cat = cats.get(i); String pos = POSs.get(i); String stag = cat.getSupertag(); if (stagMap.containsKey(stag)) { int j = 1; while (stagMap.containsKey(stag+"-"+j)) j++; stag += "-"+j; } stagMap.put(stag, new Pair(cat, pos)); } List keys = new ArrayList(stagMap.keySet()); Collections.sort(keys); XMLOutputter xout = new XMLOutputter(); xout.setFormat(Format.getPrettyFormat()); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename))); out.println(""); out.println(""); for (String key : keys) { Pair p = stagMap.get(key); Category cat = p.a; String pos = p.b; Element fam = new Element("family"); fam.setAttribute("name", key); fam.setAttribute("pos", pos); Element ent = new Element("entry"); ent.setAttribute("name", "1"); fam.addContent(ent); ent.addContent(cat.toXml()); xout.output(fam, out); out.println(); } out.println(""); out.flush(); out.close(); } } ================================================ FILE: src/opennlp/ccg/grammar/Rule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import java.util.*; import org.jdom.Element; /** * Interface for categorial rules. * * @author Gann Bierner * @author Jason Baldridge * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/11/30 20:36:15 $ */ public interface Rule { /** * Apply this rule to some input categories. * * @param inputs the input categories to try to combine * @return the Category (or categories) resulting from using this Rule to combine the * inputs * @exception UnifyFailure if the inputs cannot be combined by this Rule **/ public List applyRule(Category[] inputs) throws UnifyFailure; /** * The number of arguments this rule takes. For example, the arity of the * forward application rule of categorial grammar (X/Y Y => Y) is 2. * * @return the number of arguments this rule takes **/ public int arity(); /** * Returns the interned name of this rule. */ public String name(); /** * Returns the rule group which contains this rule. */ public RuleGroup getRuleGroup(); /** * Sets this rule's rule group. */ public void setRuleGroup(RuleGroup ruleGroup); /** Returns an XML element representing the rule. 
*/ public Element toXml(); } ================================================ FILE: src/opennlp/ccg/grammar/RuleGroup.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-6 Jason Baldridge, Gann Bierner and // Michael White (University of Edinburgh, The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.*; import opennlp.ccg.unify.*; import opennlp.ccg.util.*; import org.jdom.*; import org.jdom.output.*; import gnu.trove.*; import java.io.*; import java.net.*; import java.util.*; /** * A set of rules for combining categories. * Observed rule combos can be cached, either statically or dynamically. * * During deserialization, the grammar is set to the current grammar, * and supercat rule combos are borrowed from the current grammar's rule group. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.32 $, $Date: 2011/06/07 05:12:01 $ */ public class RuleGroup implements Serializable { private static final long serialVersionUID = -6240266013357142289L; /** The grammar that this rule group is part of. */ public transient Grammar grammar; // rules private List unaryRules = new ArrayList(); private List binaryRules = new ArrayList(); // maps of type changing rules by their semantics private GroupMap predsToRules = new GroupMap(); private GroupMap relsToRules = new GroupMap(); // rule for use in applying coarticulations private BackwardApplication bapp = new BackwardApplication(); // glue rule private GlueRule glueRule = new GlueRule(); // supercat-rule combos, to support filtering on observed ones private class SupercatRuleCombo { // NB: strings must be interned private String supercat; private String supercat2; private String rule; // unary rule constructor public SupercatRuleCombo(String supercat, String rule) { setCombo(supercat.intern(), (rule != null) ? rule.intern() : null); } // binary rule constructor public SupercatRuleCombo(String supercat, String supercat2, String rule) { setCombo(supercat.intern(), supercat2.intern(), (rule != null) ? rule.intern() : null); } // setters // NB: assume interned strings! 
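The combo keys above are interned precisely so that the identity-based hashCode and == comparisons in the methods below behave like content comparisons. A tiny, library-independent illustration of that invariant:

// Interned strings share one canonical instance, so == (and identityHashCode)
// agrees with content equality -- the invariant the combo lookups rely on.
public class InternDemo {
    public static void main(String[] args) {
        String a = "s\\np".intern();
        String b = new String("s\\np").intern();
        System.out.println(a == b);   // true: same canonical instance
    }
}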
public void setCombo(String supercat, String rule) { this.supercat = supercat; this.supercat2 = null; this.rule = rule; } public void setCombo(String supercat, String supercat2, String rule) { this.supercat = supercat; this.supercat2 = supercat2; this.rule = rule; } // hashcode public int hashCode() { return 31*System.identityHashCode(supercat) + 17*System.identityHashCode(rule) + System.identityHashCode(supercat2); } // equals public boolean equals(Object obj) { if (!(obj instanceof SupercatRuleCombo)) return false; SupercatRuleCombo combo = (SupercatRuleCombo) obj; return supercat == combo.supercat && supercat2 == combo.supercat2 && rule == combo.rule; } // supercat hashcode, excluding rule public int supercatHashCode() { return 31*System.identityHashCode(supercat) + System.identityHashCode(supercat2); } // supercat equals public boolean supercatEquals(Object obj) { if (!(obj instanceof SupercatRuleCombo)) return false; SupercatRuleCombo combo = (SupercatRuleCombo) obj; return supercat == combo.supercat && supercat2 == combo.supercat2; } // toString public String toString() { StringBuffer sb = new StringBuffer(supercat); if (supercat2 != null) sb.append(' ').append(supercat2); sb.append(' ').append(rule); return sb.toString(); } } // class for seen combos when determined dynamically // nb: for space efficiency, allows representative to be retrieved from set private static class SupercatComboSet extends THashSet { private static final long serialVersionUID = 1L; SupercatComboSet() { super( new TObjectHashingStrategy() { private static final long serialVersionUID = 1L; public int computeHashCode(Object o) { return (o instanceof SupercatRuleCombo) ? ((SupercatRuleCombo)o).supercatHashCode() : 0; } public boolean equals(Object o1, Object o2) { return (o1 instanceof SupercatRuleCombo) ? ((SupercatRuleCombo)o1).supercatEquals(o2) : false; } } ); } // return the seen combo, or null if none SupercatRuleCombo get(SupercatRuleCombo combo) { int index = index(combo); if (index < 0) return null; return (SupercatRuleCombo) this._set[index]; } } // observed supercat-rule combos private transient Set supercatRuleCombos = null; // observed supercat combos (for which complete rule combos are known) private transient SupercatComboSet supercatCombosSeen = null; // reusable combo for checking presence private transient SupercatRuleCombo combo = new SupercatRuleCombo("dummy", "dummy"); // flag for whether observed supercat combos is determined dynamically private boolean dynamicCombos = false; /** * Constructs an empty rule group for the given grammar. */ public RuleGroup(Grammar grammar) { this.grammar = grammar; bapp.setRuleGroup(this); } /** * Constructs a rule group from the given URL, for * the given grammar. 
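A rule group is normally read from a rules file via the URL-based constructor nearby, but it can also be assembled programmatically with addRule, which sorts rules into the unary or binary list by arity. A hypothetical sketch:

import opennlp.ccg.grammar.*;

// Hypothetical sketch: build a small rule group in code rather than from rules.xml.
public class ManualRules {
    public static void main(String[] args) throws java.io.IOException {
        Grammar grammar = new Grammar("grammar.xml");            // assumed path
        RuleGroup rg = new RuleGroup(grammar);                   // empty rule group
        rg.addRule(new ForwardApplication());                    // arity 2 -> binary rules
        rg.addRule(new BackwardApplication());
        rg.addRule(new ForwardTypeRaising(false, null, null));   // arity 1 -> unary rules
        System.out.println(rg.getBinaryRules().size() + " binary, "
                           + rg.getUnaryRules().size() + " unary");
    }
}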
*/ public RuleGroup(URL url, Grammar grammar) throws IOException { this.grammar = grammar; bapp.setRuleGroup(this); XmlScanner ruleScanner = new XmlScanner() { public void handleElement(Element ruleEl) { String active = ruleEl.getAttributeValue("active"); if (active == null || active.equals("true")) { try { addRule(readRule(ruleEl)); } catch (RuntimeException exc) { System.err.println("Skipping rule: " + ruleEl.getAttributeValue("name")); System.err.println(exc.toString()); } } } }; ruleScanner.parse(url); } // during deserialization, sets grammar to the current grammar private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); grammar = Grammar.theGrammar; borrowSupercatRuleCombos(grammar.rules); } // reads in a rule private Rule readRule(Element ruleEl) { Rule r; String type = ruleEl.getName(); if (type.equals("application")) { String dir = ruleEl.getAttributeValue("dir"); if (dir.equals("forward")) { r = new ForwardApplication(); } else { r = new BackwardApplication(); } } else if (type.equals("composition")) { String dir = ruleEl.getAttributeValue("dir"); String harmonic = ruleEl.getAttributeValue("harmonic"); boolean isHarmonic = new Boolean(harmonic).booleanValue(); if (dir.equals("forward")) { r = new ForwardComposition(isHarmonic); } else { r = new BackwardComposition(isHarmonic); } } else if (type.equals("substitution")) { String dir = ruleEl.getAttributeValue("dir"); String harmonic = ruleEl.getAttributeValue("harmonic"); boolean isHarmonic = new Boolean(harmonic).booleanValue(); if (dir.equals("forward")) { r = new ForwardSubstitution(isHarmonic); } else { r = new BackwardSubstitution(isHarmonic); } } else if (type.equals("typeraising")) { String dir = ruleEl.getAttributeValue("dir"); String useDollar = ruleEl.getAttributeValue("useDollar"); boolean addDollar = new Boolean(useDollar).booleanValue(); Category arg = null; Element argElt = ruleEl.getChild("arg"); if (argElt != null) { arg = CatReader.getCat((Element)argElt.getChildren().get(0)); } Category result = null; Element resultElt = ruleEl.getChild("result"); if (resultElt != null) { result = CatReader.getCat((Element)resultElt.getChildren().get(0)); } if (dir.equals("forward")) { r = new ForwardTypeRaising(addDollar, arg, result); } else { r = new BackwardTypeRaising(addDollar, arg, result); } } else if (type.equals("typechanging")) { r = readTypeChangingRule(ruleEl); } else { throw new RuntimeException("Invalid element in rules: " + type); } return r; } // reads in a type changing rule private Rule readTypeChangingRule(Element ruleEl) { String rname = ruleEl.getAttributeValue("name"); Element argCatElt = (Element)ruleEl.getChild("arg").getChildren().get(0); Category arg = CatReader.getCat(argCatElt); Element resultCatElt = (Element)ruleEl.getChild("result").getChildren().get(0); Element lfElt = resultCatElt.getChild("lf"); Category result = CatReader.getCat(resultCatElt); LF firstEP = null; if (lfElt != null) { firstEP = HyloHelper.firstEP(HyloHelper.getLF(lfElt)); } grammar.lexicon.propagateTypes(result, arg); grammar.lexicon.propagateDistributiveAttrs(result, arg); grammar.lexicon.expandInheritsFrom(result, arg); return new TypeChangingRule(arg, result, rname, firstEP); } /** * Writes the rules to an XML file with the given name. 
* @throws IOException */ public void toXml(String filename) throws IOException { XMLOutputter xout = new XMLOutputter(); xout.setFormat(Format.getPrettyFormat()); PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename))); out.println(""); out.println(""); for (Rule r : binaryRules) { xout.output(r.toXml(), out); out.println(); } for (Rule r : unaryRules) { xout.output(r.toXml(), out); out.println(); } out.println(""); out.flush(); out.close(); } /** * Sets the dynamic combos flag to the given value, controlling whether the * observed supercat combos is determined dynamically. */ public void setDynamicCombos(boolean dynamic) { this.dynamicCombos = dynamic; if (!dynamicCombos) supercatCombosSeen = null; else if (dynamicCombos) { if (supercatCombosSeen == null) supercatCombosSeen = new SupercatComboSet(); if (supercatRuleCombos == null) supercatRuleCombos = new HashSet(); } } /** * Returns the dynamic combos flag. */ public boolean getDynamicCombos() { return dynamicCombos; } /** * Loads the observed supercat-rule combos, for filtering. * Only file URLs are supported at present. * Missing files are ignored. **/ public void loadSupercatRuleCombos(URL url) throws IOException { supercatRuleCombos = new HashSet(); File combosFile = new File(url.getFile()); if (!combosFile.exists()) return; System.out.println("Loading supercat combos from " + url.getFile()); BufferedReader in = new BufferedReader(new FileReader(combosFile)); String line; while ((line = in.readLine()) != null) { String[] tokens = line.split("\\s"); if (tokens.length < 2) { System.err.println("Warning: skipping supercat-rule combo with fewer than two tokens: " + line); continue; } if (tokens.length == 2) { supercatRuleCombos.add(new SupercatRuleCombo(tokens[0], tokens[1])); } else { if (tokens.length > 3) { System.err.println("Warning: ignoring extra tokens (beyond 3rd) in supercat-rule combo: " + line); } supercatRuleCombos.add(new SupercatRuleCombo(tokens[0], tokens[1], tokens[2])); } } in.close(); } /** Borrows the observed supercat-rule combos from the given rule group. */ public void borrowSupercatRuleCombos(RuleGroup ruleGroup) { supercatRuleCombos = ruleGroup.supercatRuleCombos; supercatCombosSeen = ruleGroup.supercatCombosSeen; } /** Adds the given rule. */ public void addRule(Rule r) { r.setRuleGroup(this); if (r instanceof TypeChangingRule) { unaryRules.add(r); index((TypeChangingRule)r); } else if (r.arity() == 1) { unaryRules.add(r); } else if (r.arity() == 2) { binaryRules.add(r); } else { // shouldn't happen throw new RuntimeException("Can't determine arity of rule: " + r); } } // indexes type changing rules by preds and rels private void index(TypeChangingRule rule) { LF firstEP = rule.getFirstEP(); if (firstEP == null) { return; } String pred = HyloHelper.getLexPred(firstEP); if (pred != null) { predsToRules.put(pred, rule); return; } String rel = HyloHelper.getRel(firstEP); if (rel != null) { relsToRules.put(rel, rule); } } /** Returns the unary rules. */ public List getUnaryRules() { return unaryRules; } /** Returns the binary rules. */ public List getBinaryRules() { return binaryRules; } /** Returns the type changing rule with the given name, or null if none. 
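 *
 * A lookup-and-apply sketch (the rule name "n2np" is made up for illustration;
 * TypeChangingRule.applyRule may throw UnifyFailure):
 * <pre>
 *   TypeChangingRule tcr = ruleGroup.getTypeChangingRule("n2np");
 *   if (tcr != null) {
 *       List results = tcr.applyRule(new Category[] { inputCat });
 *   }
 * </pre>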
*/ public TypeChangingRule getTypeChangingRule(String name) { for (Iterator it = unaryRules.iterator(); it.hasNext(); ) { Object rule = it.next(); if (rule instanceof TypeChangingRule) { TypeChangingRule tcr = (TypeChangingRule) rule; if (tcr.name().equals(name)) return tcr; } } return null; } /** * Returns the type changing rules indexed by the given lexical predicate. * The type changing rules are indexed by their first elementary predication. */ public Collection getRulesForPred(String pred) { return predsToRules.get(pred); } /** * Returns the type changing rules indexed by the given relation. * The type changing rules are indexed by their first elementary predication. */ public Collection getRulesForRel(String rel) { return relsToRules.get(rel); } /** Applies the unary rules to the given input sign, returning the list of results. */ public List applyUnaryRules(Sign input) { Sign[] inputs = { input }; List results = new ArrayList(2); String supertag = input.getCategory().getSupertag(); // check whether dynamic combos update required, or whether rules can be skipped boolean dynamicCombosUpdate = false; boolean skip = false; if (dynamicCombos) { combo.setCombo(supertag, null); SupercatRuleCombo rep = supercatCombosSeen.get(combo); if (rep == null) dynamicCombosUpdate = true; else if (rep.rule == null) skip = true; } // skip if possible if (skip) return results; // try each rule for (Rule r : unaryRules) { // filter on observed supercat-rule combos, if any, if not updating if (!dynamicCombosUpdate && supercatRuleCombos != null) { combo.setCombo(supertag, r.name()); if (!supercatRuleCombos.contains(combo)) { continue; } } // if updating combos, apply rule and record results if (dynamicCombosUpdate) { int prevsize = results.size(); ((AbstractRule)r).applyRule(inputs, results); // update upon success if (results.size() > prevsize) { SupercatRuleCombo newCombo = null; combo.setCombo(supertag, r.name()); if (!supercatRuleCombos.contains(combo)) { newCombo = new SupercatRuleCombo(supertag, r.name()); supercatRuleCombos.add(newCombo); } if (!supercatCombosSeen.contains(combo)) { if (newCombo == null) newCombo = new SupercatRuleCombo(supertag, r.name()); supercatCombosSeen.add(newCombo); } } } // otherwise just apply rule else ((AbstractRule)r).applyRule(inputs, results); } // if updating combos and none succeeded, add one with null rule if (dynamicCombosUpdate) { combo.setCombo(supertag, null); if (!supercatCombosSeen.contains(combo)) { SupercatRuleCombo newCombo = new SupercatRuleCombo(supertag, null); supercatCombosSeen.add(newCombo); } } // done return results; } /** Applies the binary rules to the given input signs, returning the list of results. 
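 *
 * A minimal usage sketch (illustrative only; the two input Signs are assumed to come from
 * elsewhere, e.g. lexical lookup or earlier chart entries):
 * <pre>
 *   rules.setDynamicCombos(true);                  // optionally cache observed combos on the fly
 *   List results = rules.applyBinaryRules(left, right);
 *   for (Object obj : results) {
 *       Sign derived = (Sign) obj;                 // each result is a derived Sign
 *   }
 * </pre>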
*/ public List applyBinaryRules(Sign input1, Sign input2) { Sign[] inputs = { input1, input2 }; List results = new ArrayList(2); String supertag1 = input1.getCategory().getSupertag(); String supertag2 = input2.getCategory().getSupertag(); // check whether dynamic combos update required, or whether rules can be skipped boolean dynamicCombosUpdate = false; boolean skip = false; if (dynamicCombos) { combo.setCombo(supertag1, supertag2, null); SupercatRuleCombo rep = supercatCombosSeen.get(combo); if (rep == null) dynamicCombosUpdate = true; else if (rep.rule == null) skip = true; } // skip if possible if (skip) return results; // try each rule for (Rule r : binaryRules) { // filter on observed supercat-rule combos, if any, if not updating if (!dynamicCombosUpdate && supercatRuleCombos != null) { combo.setCombo(supertag1, supertag2, r.name()); if (!supercatRuleCombos.contains(combo)) { continue; } } // if updating combos, apply rule and record results if (dynamicCombosUpdate) { int prevsize = results.size(); ((AbstractRule)r).applyRule(inputs, results); // update upon success if (results.size() > prevsize) { SupercatRuleCombo newCombo = null; combo.setCombo(supertag1, supertag2, r.name()); if (!supercatRuleCombos.contains(combo)) { newCombo = new SupercatRuleCombo(supertag1, supertag2, r.name()); supercatRuleCombos.add(newCombo); } if (!supercatCombosSeen.contains(combo)) { if (newCombo == null) newCombo = new SupercatRuleCombo(supertag1, supertag2, r.name()); supercatCombosSeen.add(newCombo); } } } // otherwise just apply rule else ((AbstractRule)r).applyRule(inputs, results); } // if updating combos and none succeeded, add one with null rule if (dynamicCombosUpdate) { combo.setCombo(supertag1, supertag2, null); if (!supercatCombosSeen.contains(combo)) { SupercatRuleCombo newCombo = new SupercatRuleCombo(supertag1, supertag2, null); supercatCombosSeen.add(newCombo); } } // done return results; } /** Applies the glue rule to the given input signs, returning the list of results. */ public List applyGlueRule(Sign input1, Sign input2) { Sign[] inputs = { input1, input2 }; List results = new ArrayList(1); glueRule.applyRule(inputs, results); return results; } /** Applies the coarticulation to the given sign, adding the result (if any) to the given ones. */ public void applyCoart(Sign lexSign, Sign coartSign, List results) { Category[] cats = new Category[] { lexSign.getCategory(), coartSign.getCategory() }; try { List resultCats = bapp.applyRule(cats); if (resultCats.isEmpty()) return; for (Iterator it = resultCats.iterator(); it.hasNext();) { Category catResult = it.next(); bapp.distributeTargetFeatures(catResult); Sign sign = Sign.createCoartSign(catResult, lexSign, coartSign); results.add(sign); } } catch (UnifyFailure uf) {} } } ================================================ FILE: src/opennlp/ccg/grammar/TypeChangingRule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.unify.*; import opennlp.ccg.hylo.HyloHelper; import opennlp.ccg.synsem.*; import java.util.*; import org.jdom.Element; /** * A CCG unary type changing rule. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.12 $, $Date: 2009/11/30 20:36:16 $ **/ public class TypeChangingRule extends AbstractRule implements LexSemOrigin { private static final long serialVersionUID = -2654945192870162776L; /** * String used as the POS for all type changing rules, * to satisfy the LexSemOrigin interface. * Defaults to "URULE". */ public static String POS_STRING = "URULE"; /** The argument category. */ protected Category _arg; /** The result category. */ protected Category _result; /** The first elementary predication in the result LF (or null), before sorting. */ protected LF _firstEP; /** Constructor. */ public TypeChangingRule(Category arg, Category result, String name, LF firstEP) { _arg = arg; _result = result; _name = name.intern(); _firstEP = firstEP; setOrigin(); } /** Returns an XML element representing the rule. */ public Element toXml() { Element retval = new Element("typechanging"); retval.setAttribute("name", _name); Element argElt = new Element("arg"); retval.addContent(argElt); argElt.addContent(_arg.toXml()); Element resultElt = new Element("result"); retval.addContent(resultElt); resultElt.addContent(_result.toXml()); return retval; } /** Returns 1. */ public int arity() { return 1; } /** Returns the arg. */ public Category getArg() { return _arg; } /** Returns the result. */ public Category getResult() { return _result; } /** Returns the first elementary predication in the result LF (or null), before sorting. */ public LF getFirstEP() { return _firstEP; } /** Applies this rule to the given inputs. */ public List applyRule(Category[] inputs) throws UnifyFailure { // check arity if (inputs.length != 1) { throw new UnifyFailure(); } return apply(inputs[0]); } /** Applies this rule to the given input. */ protected List apply(Category input) throws UnifyFailure { // unify quick check _arg.unifyCheck(input); // copy arg and result Category arg = _arg.copy(); Category result = _result.copy(); // make variables unique UnifyControl.reindex(result, arg); // unify Substitution sub = new GSubstitution(); GUnifier.unify(input, arg, sub); ((GSubstitution)sub).condense(); // fill in result Category $result = (Category)result.fill(sub); appendLFs(input, result, $result, sub); // return List results = new ArrayList(1); _headCats.clear(); results.add($result); _headCats.add(input); return results; } /** Returns 'name: arg => result'. */ public String toString() { StringBuffer sb = new StringBuffer(); sb.append(_name).append(": "); sb.append(_arg).append(' '); sb.append("=> ").append(_result); return sb.toString(); } /** Returns 'arg_=>_result' as the supertag. */ public String getSupertag() { StringBuffer sb = new StringBuffer(); sb.append(_arg.getSupertag()).append("_=>_").append(_result.getSupertag()); return sb.toString(); } /** * Always returns POS_STRING. */ public String getPOS() { return POS_STRING; } /** * Sets the origin of the elementary predications. 
*/ public void setOrigin() { HyloHelper.setOrigin(_result.getLF(), this); } } ================================================ FILE: src/opennlp/ccg/grammar/Types.java ================================================ /////////////////////////////////////////////////////////////////////////////// //// Copyright (C) 2003-4 Gunes Erkan and University of Edinburgh (Michael White) //// //// This library is free software; you can redistribute it and/or //// modify it under the terms of the GNU Lesser General Public //// License as published by the Free Software Foundation; either //// version 2.1 of the License, or (at your option) any later version. //// //// This library is distributed in the hope that it will be useful, //// but WITHOUT ANY WARRANTY; without even the implied warranty of //// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //// GNU Lesser General Public License for more details. //// //// You should have received a copy of the GNU Lesser General Public //// License along with this program; if not, write to the Free Software //// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. //////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.grammar; import opennlp.ccg.util.*; import opennlp.ccg.unify.*; import org.jdom.*; import org.jdom.input.*; import java.io.*; import java.net.*; import java.util.*; import gnu.trove.*; /** * Class for constructing and holding the hierarchical simple type maps. * * @author Gunes Erkan * @author Michael White * @version $Revision: 1.13 $, $Date: 2009/12/21 03:27:18 $ */ public class Types { public final Grammar grammar; private final HashMap nameToType = new HashMap(); private final ArrayList indexToType = new ArrayList(); private int maxTypeIndex = 0; public static final String TOP_TYPE = "top"; public static final String BOT_TYPE = "bottom"; /** Constructor for an empty hierarchy (with just the top type). */ public Types(Grammar grammar) { getSimpleType(TOP_TYPE); this.grammar = grammar; } /** * Constructs the type hierarchy from the given URL, for * the given grammar. */ @SuppressWarnings("unchecked") public Types(URL url, Grammar grammar) throws IOException { this.grammar = grammar; SAXBuilder builder = new SAXBuilder(); Document doc; try { doc = builder.build(url); } catch (JDOMException exc) { getSimpleType(TOP_TYPE); throw (IOException) new IOException().initCause(exc); } List entries = doc.getRootElement().getChildren(); readTypes(entries); // for debugging: print the indexToType list //printTypes(); } /** Returns the simple type with the given name, or a new one if none yet exists. */ public SimpleType getSimpleType(String typeName) { SimpleType type = nameToType.get(typeName); if (type == null) { BitSet bs = new BitSet(); bs.set(maxTypeIndex); SimpleType newtype = new SimpleType(maxTypeIndex, typeName, bs, this); nameToType.put(typeName, newtype); indexToType.add(newtype); nameToType.get(TOP_TYPE).getBitSet().set(maxTypeIndex++); return newtype; } else return type; } /** Returns whether there is a simple type with the given name. */ public boolean containsSimpleType(String typeName) { return nameToType.containsKey(typeName); } /** Returns the list of types, with parents preceding children in the hierarchy. */ public ArrayList getIndexMap() { return indexToType; } /** Reads the rules and constructs the nameToType and indexToType maps. 
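 *
 * Each child element is expected to carry a "name" attribute and, optionally, a
 * whitespace-separated "parents" attribute; a type with no parents is attached directly
 * under the top type. A minimal usage sketch against the public API (the URL and the type
 * names are illustrative):
 * <pre>
 *   Types types = new Types(typesUrl, grammar);
 *   SimpleType animType = types.getSimpleType("animate-being");
 *   boolean known = types.containsSimpleType("sem-obj");
 * </pre>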
*/ private void readTypes(List _types) { GroupMap hierarchy = new GroupMap(); // map from types to all subtypes GroupMap parents = new GroupMap(); // map from types to parents TObjectIntHashMap depthMap = new TObjectIntHashMap(); // map from types to max depth // Construct the initial hierarchy of types without // taking transitive closure. // Also store parents. for (int i=0; i < _types.size(); i++) { Element typeEl = _types.get(i); String typeName = typeEl.getAttributeValue("name"); String _parents = typeEl.getAttributeValue("parents"); hierarchy.put(typeName, BOT_TYPE); if (_parents == null) { hierarchy.put(TOP_TYPE, typeName); parents.put(typeName, TOP_TYPE); } else { String[] parentsArray = _parents.split("\\s+"); for (int j = 0; j < parentsArray.length; j++) { hierarchy.put(parentsArray[j], typeName); parents.put(typeName, parentsArray[j]); } } } // Compute depth from parents. for (String type : parents.keySet()) { int depth = computeDepth(type, parents, type); depthMap.put(type, depth); } // Compute ALL subtypes of each type and insert into the hierarchy. for (String type : hierarchy.keySet()) { hierarchy.putAll(type, findAllSubtypes(hierarchy, type)); } // Assign a unique int to each type in breadth-first order. // Then create the string -> SimpleType map. createSimpleTypes(hierarchy, depthMap); } /** Returns the max depth of the given type, checking for cycles. */ private static int computeDepth(String type, GroupMap parents, String startType) { if (type.equals(TOP_TYPE)) return 0; int maxParentDepth = 0; Set parentSet = parents.get(type); if (parentSet != null) { for (String parent : parentSet) { if (parent.equals(startType)) { throw new RuntimeException("Error, type hierarchy contains cycle from/to: " + startType); } int parentDepth = computeDepth(parent, parents, startType); maxParentDepth = Math.max(maxParentDepth, parentDepth); } } return maxParentDepth + 1; } /** * Computes the list of all sub-types of a given type (key) * in depth-first order. */ private Collection findAllSubtypes(GroupMap hierarchy, String key) { ArrayList subs = new ArrayList(); if (hierarchy.get(key) != null) { Stack look = new Stack(); for (String type : hierarchy.get(key)) { look.push(type); } for (; !look.empty() ; ) { String new_sub = look.pop(); subs.add(new_sub); if (hierarchy.get(new_sub) != null) { for (String type : hierarchy.get(new_sub)) { look.push(type); } } } } return subs; } /** * Creates the SimpleType objects and constructs the nameToType and indexToType maps. 
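 *
 * Types are assigned indices in order of increasing depth in the hierarchy (ties broken
 * alphabetically), with the top type at index 0; each type's BitSet then has its own index
 * set along with the index of every one of its subtypes, so subtype relationships can be
 * checked with bit operations on these sets.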
*/ private void createSimpleTypes(GroupMap hierarchy, TObjectIntHashMap depthMap) { // find max depth int maxDepth = 0; int[] depths = depthMap.getValues(); for (int i = 0; i < depths.length; i++) { maxDepth = Math.max(maxDepth, depths[i]); } // add types in order of increasing depth ArrayList typesVisited = new ArrayList(); typesVisited.add(TOP_TYPE); Object[] types = depthMap.keys(); ArrayList typesAtSameDepth = new ArrayList(); for (int i = 1; i <= maxDepth; i++) { typesAtSameDepth.clear(); for (int j = 0; j < types.length; j++) { if (depthMap.get(types[j]) == i) typesAtSameDepth.add((String)types[j]); } Collections.sort(typesAtSameDepth); typesVisited.addAll(typesAtSameDepth); } // construct the maps for (int i=0; i < typesVisited.size(); i++) { String typeName = typesVisited.get(i); BitSet bitset = new BitSet(); bitset.set(i); if (hierarchy.get(typeName) != null) { for (String type : hierarchy.get(typeName)) { int indexToSet = typesVisited.indexOf(type); if (indexToSet != -1) bitset.set(indexToSet); } } SimpleType st = new SimpleType(i, typeName, bitset, this); nameToType.put(typeName, st); indexToType.add(st); } maxTypeIndex = typesVisited.size(); } /** * Prints the types and their subtypes to System.out. */ public void printTypes() { System.out.println("types:"); for (int i=0; i < indexToType.size(); i++) { SimpleType st = indexToType.get(i); System.out.println(i + ": " + st.getName() + " subtypes: " + st.getBitSet()); } System.out.println(); } /** Tests serialization of simple types, including resolution. */ public void debugSerialization() throws IOException, ClassNotFoundException { // test serialization SimpleType st = indexToType.get(1); String filename = "tmp.ser"; ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(filename)); System.out.println("Writing st: " + st.getIndex() + ": " + st + " " + st.getBitSet()); out.writeObject(st); out.close(); ObjectInputStream in = new ObjectInputStream(new FileInputStream(filename)); System.out.print("Reading st2: "); SimpleType st2 = (SimpleType) in.readObject(); System.out.println(st2.getIndex() + ": " + st2 + " " + st2.getBitSet()); in.close(); // test identity (and thus readResolve) System.out.println("st == st2?: " + (st == st2)); } } ================================================ FILE: src/opennlp/ccg/grammar/to-apml.xsl ================================================ ' n't Hstar Lstar LplusHstar LstarplusH HstarplusL HplusLstar aamm ppmm ================================================ FILE: src/opennlp/ccg/grammardoc/AbstractDocumenter.java ================================================ /* * $Id: AbstractDocumenter.java,v 1.2 2006/11/01 02:53:20 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; /** * Convenience base class for documenters to extend * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.2 $ */ public abstract class AbstractDocumenter implements Documenter { protected DocumenterContext documenterContext; protected String name; /** * Default constructor. All documenters must have a no-argument constructor. */ protected AbstractDocumenter() {} /** * Creates a new documenter with the specified name. */ protected AbstractDocumenter(String name) { this.name = name; } /** * Sets this documenter's context. */ public void setDocumenterContext(DocumenterContext documenterContext) { this.documenterContext = documenterContext; } /** * Gets this documenter's name. 
*/ public String getName() { return name; } } ================================================ FILE: src/opennlp/ccg/grammardoc/Documenter.java ================================================ /* * $Id: Documenter.java,v 1.2 2006/11/01 02:53:20 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; /** * A documenter that produces documentation for an OpenCCG grammar. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.2 $ */ public interface Documenter { /** * Gets the name of this documenter. * * @return A string like SGML Documenter. */ public String getName(); /** * Sets this documenter's context. * * @param documenterContext The context this documenter should use. */ public void setDocumenterContext(DocumenterContext documenterContext); /** * Causes a documenter to produce documentation for the provided grammar. * Before any calls to this method are made, the documenter will first be * configured with a (single) call to * {@link #setDocumenterContext(DocumenterContext)}. * * @param grammar The grammar to document. * @throws DocumenterException If any problems occur during the process of * generating documentation. */ public void document(SourceGrammar grammar) throws DocumenterException; } ================================================ FILE: src/opennlp/ccg/grammardoc/DocumenterContext.java ================================================ /* * $Id: DocumenterContext.java,v 1.2 2006/11/01 02:53:20 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; import java.io.File; /** * Provides a context inside which a {@link Documenter} will execute. This class * gives the documenter access to objects it will need to produce its * documentation. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.2 $ */ public interface DocumenterContext { /** * Logs a message from a documenter. */ public void log(String message); /** * Gets the target location where the documenter should generate its * documentation files. * * @return A directory that exists in a filesystem. */ public File getDestinationDirectory(); } ================================================ FILE: src/opennlp/ccg/grammardoc/DocumenterException.java ================================================ /* * $Id: DocumenterException.java,v 1.2 2006/11/01 02:53:20 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; /** * An exception thrown by a {@link Documenter}. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.2 $ */ public class DocumenterException extends GrammarDocException { private static final long serialVersionUID = 1L; /** * Creates a new documenter exception. */ public DocumenterException() { super(); } /** * Creates a new exception with the specified message. */ public DocumenterException(String message) { super(message); } /** * Creates a new exception with the specified message and underlying cause. */ public DocumenterException(String message, Throwable cause) { super(message, cause); } /** * Creates a new exception with the specified underlying cause. */ public DocumenterException(Throwable cause) { super(cause); } } ================================================ FILE: src/opennlp/ccg/grammardoc/DocumenterFactory.java ================================================ /* * $Id: DocumenterFactory.java,v 1.4 2006/12/11 18:19:24 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; import java.util.EnumMap; import java.util.Map; /** * Factory class for creating documenters based on a predefined name. 
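 *
 * A typical programmatic use (sketch only; the documenter context and source grammar are
 * assumed to be supplied by the caller, as the GrammarDoc ant task does):
 * <pre>
 *   Documenter documenter = DocumenterFactory.newInstance().newDocumenter(DocumenterName.HTML);
 *   documenter.setDocumenterContext(context);
 *   documenter.document(sourceGrammar);
 * </pre>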
* * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.4 $ */ public class DocumenterFactory { private static final Map documenters = new EnumMap(DocumenterName.class); static final DocumenterFactory documenterFactory = new DocumenterFactory(); private DocumenterFactory() {} /** * Gets a new documenter factory. */ public static DocumenterFactory newInstance() { return documenterFactory; } /** * Gets a new instance of the default documenter. * * @return Calls {@link #newDocumenter(DocumenterName)} with * {@link DocumenterName#DEFAULT} as its argument. * @throws DocumenterNotFoundException Does not throw this exception. This * is included for binary compatibility with * {@link #newDocumenter(DocumenterName)}. */ public Documenter newDocumenter() throws DocumenterNotFoundException { return newDocumenter(DocumenterName.DEFAULT); } /** * Gets a new instance of the named documenter. * * @param name Used to look up the documenter {@link DocumenterName name}. * @return A documenter that corresponds to the specified name. * @throws DocumenterNotFoundException If no documenter can be created for * the specified name. */ public Documenter newDocumenter(String name) throws DocumenterNotFoundException { try { DocumenterName nm = DocumenterName.valueOf(name); return newDocumenter(nm); } catch(IllegalArgumentException iae) { throw new DocumenterNotFoundException(name); } } /** * Gets a new instance of the named documenter. * * @param name The {@link DocumenterName name} of the documenter to create. * @return A documenter that corresponds to the specified name. * @throws DocumenterNotFoundException If no documenter can be created for * the specified name. */ public synchronized Documenter newDocumenter(DocumenterName name) throws DocumenterNotFoundException { Documenter d = documenters.get(name); if(d == null) { try { d = name.documenterClass.newInstance(); } catch(InstantiationException ie) { throw new DocumenterNotFoundException(name, ie); } catch(IllegalAccessException iae) { throw new DocumenterNotFoundException(name, iae); } documenters.put(name, d); } return d; } } ================================================ FILE: src/opennlp/ccg/grammardoc/DocumenterName.java ================================================ /* * $Id: DocumenterName.java,v 1.3 2006/12/11 18:19:24 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; import opennlp.ccg.grammardoc.html.HTMLDocumenter; /** * The known documenters that the {@link DocumenterFactory documenter factory} * is aware of. Calling {@link DocumenterFactory#newDocumenter(DocumenterName)} * with any of these values will return a valid documenter without throwing an * exception. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.3 $ */ public enum DocumenterName { /** * The default documenter. */ DEFAULT(HTMLDocumenter.class), /** * A documenter that produces HTML output. */ HTML(HTMLDocumenter.class); Class documenterClass; private DocumenterName(Class documenterClass) { this.documenterClass = documenterClass; } } ================================================ FILE: src/opennlp/ccg/grammardoc/DocumenterNotFoundException.java ================================================ /* * $Id: DocumenterNotFoundException.java,v 1.3 2006/12/11 18:19:24 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; /** * Signals that no documenter could be found for a given name. 
* * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.3 $ */ public class DocumenterNotFoundException extends DocumenterException { private static final long serialVersionUID = 1L; /** * Creates a new exception indicating that a documenter with the specified * name was not found. */ public DocumenterNotFoundException(DocumenterName name) { this(name.name()); } /** * Creates a new exception indicating that a documenter with the specified * name was not found. */ public DocumenterNotFoundException(String name) { super(name); } /** * Creates a new exception indicating that a documenter with the specified * name was not found for the specified underlying reason. */ DocumenterNotFoundException(DocumenterName name, Throwable cause) { super(name.name(), cause); } } ================================================ FILE: src/opennlp/ccg/grammardoc/DocumenterSourceException.java ================================================ /* * $Id: DocumenterSourceException.java,v 1.2 2006/11/01 02:53:20 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; /** * An exception thrown by a {@link Documenter} because of a problem in the * source grammar. This exception tracks the * {@link SourceGrammarFile source grammar file} where the problem occurred. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.2 $ */ public class DocumenterSourceException extends DocumenterException { private static final long serialVersionUID = 1L; protected SourceGrammarFile sourceGrammarFile; /** * Creates a new exception signaling a problem in the specified source * grammar file. */ public DocumenterSourceException(SourceGrammarFile sourceGrammarFile) { this.sourceGrammarFile = sourceGrammarFile; } /** * Creates a new exception with the specified message, signaling a problem * in the specified source grammar file. */ public DocumenterSourceException(String message, SourceGrammarFile sourceGrammarFile) { super(message); this.sourceGrammarFile = sourceGrammarFile; } /** * Creates a new exception with the specified message and underlying cause, * signaling a problem in the specified source grammar file. */ public DocumenterSourceException(String message, Throwable cause, SourceGrammarFile sourceGrammarFile) { super(message, cause); this.sourceGrammarFile = sourceGrammarFile; } /** * Creates a new exception with the specified underlying cause, signaling a * problem in the specified source grammar file. */ public DocumenterSourceException(Throwable cause, SourceGrammarFile sourceGrammarFile) { super(cause); this.sourceGrammarFile = sourceGrammarFile; } /** * Gets the source grammar file where this problem occurred. 
*/ public SourceGrammarFile getSourceGrammarFile() { return sourceGrammarFile; } } ================================================ FILE: src/opennlp/ccg/grammardoc/GrammarDoc.java ================================================ /* * $Id: GrammarDoc.java,v 1.7 2007/05/30 22:53:17 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; import java.io.File; import java.io.IOException; import java.io.PrintStream; import java.util.Arrays; import java.util.Iterator; import java.util.List; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.apache.tools.ant.BuildException; import org.apache.tools.ant.Project; import org.apache.tools.ant.Target; import org.apache.tools.ant.Task; import org.w3c.dom.Document; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** * Class that implements the grammardoc ant task. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.7 $ */ public class GrammarDoc extends Task implements DocumenterContext { private File srcDir, destDir; /** * Executes the grammardoc task. This method finds the specified * srcDir and destDir, then invokes a * {@link Documenter documenter} with those parameters. * * @throws BuildException If a source directory or destination directory is * not specified, or the source directory does not exist. */ @Override public void execute() throws BuildException { if(srcDir == null) { srcDir = new File(System.getProperty("user.dir")); } if(!srcDir.exists()) { throw new BuildException("Source directory does not exist"); } if(destDir == null) { destDir = new File(srcDir, "docs"); } if(!destDir.exists()) { log("Creating directory " + destDir); destDir.mkdirs(); } try { Documenter documenter = DocumenterFactory.newInstance() .newDocumenter(); log("Using " + documenter.getName()); log("Documenting " + srcDir.getAbsolutePath()); log("Generating documentation to " + destDir); documenter.setDocumenterContext(this); documenter.document(loadSourceGrammar()); log("Done"); } catch(DocumenterNotFoundException dnfe) { throw new BuildException("documenter not found: " + dnfe.getMessage(), getLocation()); } catch(DocumenterSourceException dse) { throw new BuildException("problem in source file " + dse.getSourceGrammarFile() + ": " + dse.getMessage(), getLocation()); } catch(DocumenterException de) { throw new BuildException("problem documenting: " + de.getMessage(), de); } catch(GrammarDocException gde) { throw new BuildException(gde.getMessage(), getLocation()); } } SourceGrammar loadSourceGrammar() throws GrammarDocException { SourceGrammar sourceGrammar = new SourceGrammar(srcDir); try { SourceGrammarFile grammar = loadGrammarFile( SourceGrammarFileType.GRAMMAR, new File(srcDir, SourceGrammarFileType.GRAMMAR.fileName + ".xml")); sourceGrammar.addSourceGrammarFile(SourceGrammarFileType.GRAMMAR, grammar); //TODO make these StreamSource instead File gd = grammar.sourceFile; for(SourceGrammarFileType fileType : SourceGrammarFileType.values()) { if(!fileType.equals(SourceGrammarFileType.GRAMMAR)) { // already DocumentBuilder db = DocumentBuilderFactory.newInstance() .newDocumentBuilder(); Document doc = db.parse(gd); NodeList fileEls = doc.getElementsByTagName( fileType.name().toLowerCase()); if(fileEls.getLength() == 0) { if(fileType.isRequired()) { throw new GrammarDocException( "file type required but missing: " + fileType); } } else { sourceGrammar.addSourceGrammarFile(fileType, loadGrammarFile(fileType, new File(srcDir, 
fileType.fileName + ".xml"))); } } } } catch(ParserConfigurationException pce) { throw new GrammarDocException("parser configuration problem: " + pce.getMessage(), pce); } catch(SAXException se) { throw new GrammarDocException("problem parsing source files: " + se.getMessage(), se); } catch(IOException io) { throw new GrammarDocException("io problem with source files: " + io.getMessage(), io); } return sourceGrammar; } SourceGrammarFile loadGrammarFile(SourceGrammarFileType fileType, File file) throws GrammarDocException { if(!file.exists()) { throw new GrammarDocException("file " + file.getName() + " does not exist in " + srcDir); } else if(file.isDirectory()) { throw new GrammarDocException(file.getName() + " refers to a directory in " + srcDir); } else { log("Loading " + file.getName()); try { return new SourceGrammarFile(fileType, file); } catch(Exception e) { throw new GrammarDocException("problem parsing " + file + ": " + e.getMessage(), e); } } } /** * For conformance with {@link DocumenterContext}. */ public File getDestinationDirectory() { return getDestDir(); } /** * @return Returns the destDir. */ public File getDestDir() { return destDir; } /** * @param destDir The destDir to set. */ public void setDestDir(File destDir) { this.destDir = destDir.getAbsoluteFile(); } /** * @return Returns the sourceDirectory. */ public File getSrcDir() { return srcDir; } /** * @param srcDir The sourceDirectory to set. */ public void setSrcDir(File srcDir) { this.srcDir = srcDir.getAbsoluteFile(); } public static void main(String[] args) { List arguments = Arrays.asList(args); PrintStream out = System.out; GrammarDoc gd = new CommandGrammarDoc(out); if(arguments.contains("-h") || arguments.contains("--help")) { out.println("usage: ccg-grammardoc [-s|--source sourceDir] " + "[-d|--dest destDir]"); } else { Iterator i = arguments.iterator(); try { while(i.hasNext()) { String s = i.next(); if(s.equals("-s") || s.equals("--source")) { if(gd.srcDir != null) { throw new IllegalArgumentException( "source directory already specified"); } if(!i.hasNext()) { throw new IllegalArgumentException( "encountered flag " + s + ", but no directory specified"); } gd.setSrcDir(new File(i.next())); } else if(s.equals("-d") || s.equals("--dest")) { if(gd.destDir != null) { throw new IllegalArgumentException( "destination directory already specified"); } if(!i.hasNext()) { throw new IllegalArgumentException( "encountered flag " + s + ", but no directory specified"); } gd.setDestDir(new File(i.next())); } } gd.execute(); } catch(Exception e) { gd.log("Error: " + e.getMessage()); } } } static final class CommandGrammarDoc extends GrammarDoc { PrintStream out; static final String logPrefix = "[grammardoc] "; CommandGrammarDoc(PrintStream out) { this.out = out; setProject(new Project()); setOwningTarget(new Target()); } @Override public void log(String s) { out.print(logPrefix); out.println(s); } } } ================================================ FILE: src/opennlp/ccg/grammardoc/GrammarDocException.java ================================================ /* * $Id: GrammarDocException.java,v 1.2 2006/11/01 02:53:20 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; /** * An exception that occurs during the execution of {@link GrammarDoc}. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.2 $ */ public class GrammarDocException extends Exception { private static final long serialVersionUID = 1L; /** * Creates a new exception. 
*/ public GrammarDocException() { super(); } /** * Creates a new exception with the specified message. */ public GrammarDocException(String message) { super(message); } /** * Creates a new exception with the specified message and underlying cause. */ public GrammarDocException(String message, Throwable cause) { super(message, cause); } /** * Creates a new exception with the specified underlying cause. */ public GrammarDocException(Throwable cause) { super(cause); } } ================================================ FILE: src/opennlp/ccg/grammardoc/SourceGrammar.java ================================================ /* * $Id: SourceGrammar.java,v 1.4 2007/03/19 17:45:35 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; import java.io.File; import java.util.EnumMap; import java.util.Map; import java.util.Set; /** * Represents an OpenCCG grammar that is specified in a series of XML files. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.4 $ */ public class SourceGrammar { File sourceDirectory; Map sourceFiles; SourceGrammar(File sourceDirectory) { this.sourceDirectory = sourceDirectory; sourceFiles = new EnumMap (SourceGrammarFileType.class); } /** * @return Returns the sourceDirectory. */ public File getSourceDirectory() { return sourceDirectory; } /** * Gets the {@link SourceGrammarFileType file names} found in this source * grammar. */ public Set getSourceGrammarFileTypes() { return sourceFiles.keySet(); } /** * Gets a source grammar file based on a specified * {@link SourceGrammarFileType file name}. * * @param fileType The file name identifying the desired * {@link SourceGrammarFile source grammar file}. */ public SourceGrammarFile getSourceGrammarFile( SourceGrammarFileType fileType) { return sourceFiles.get(fileType); } void addSourceGrammarFile(SourceGrammarFileType fileType, SourceGrammarFile sourceGrammarFile) { sourceFiles.put(fileType, sourceGrammarFile); } } ================================================ FILE: src/opennlp/ccg/grammardoc/SourceGrammarFile.java ================================================ /* * $Id: SourceGrammarFile.java,v 1.4 2007/05/30 22:53:17 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; import java.io.File; import org.w3c.dom.Document; /** * A file found in a source grammar. This class encapsulates the predefined * {@link SourceGrammarFileType file name} and {@link Document DOM document}. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.4 $ */ public class SourceGrammarFile { SourceGrammarFileType fileType; File sourceFile; /** * Creates a new source grammar file. */ SourceGrammarFile(SourceGrammarFileType fileName, File sourceFile) { this.fileType = fileName; this.sourceFile = sourceFile; } /** * @return Returns the fileType. */ public SourceGrammarFileType getFileType() { return fileType; } /** * @return Returns the source file. */ public File getSourceFile() { return sourceFile; } /** * Gets a string version of this source grammar file. * * @return The value of this grammar file's * {@link SourceGrammarFileType#toString() file type}. */ @Override public String toString() { return fileType.toString(); } } ================================================ FILE: src/opennlp/ccg/grammardoc/SourceGrammarFileType.java ================================================ /* * $Id: SourceGrammarFileType.java,v 1.3 2007/05/02 21:51:35 coffeeblack Exp $ */ package opennlp.ccg.grammardoc; /** * File types for source grammars. 
* * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.3 $ */ public enum SourceGrammarFileType { GRAMMAR("grammar"), LEXICON("lexicon"), MORPHOLOGY("morph"), RULES("rules"), TYPES("types", false), DOCUMENTATION("documentation", false); final String fileName; final boolean required; private SourceGrammarFileType(String fileName) { this(fileName, true); } private SourceGrammarFileType(String fileName, boolean required) { this.fileName = fileName; this.required = required; } /** * Gets the file name associated with this file type. */ public String getFileName() { return fileName; } /** * Tests whether or not this grammar file name is required. * @return true iff this grammar file type is required to be present in a * grammar. */ public boolean isRequired() { return required; } } ================================================ FILE: src/opennlp/ccg/grammardoc/html/HTMLDocumenter.java ================================================ /* * $Id: HTMLDocumenter.java,v 1.9 2009/12/21 04:18:31 mwhite14850 Exp $ */ package opennlp.ccg.grammardoc.html; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.HashMap; import java.util.Map; import javax.xml.transform.Source; import javax.xml.transform.Templates; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.URIResolver; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import opennlp.ccg.grammardoc.AbstractDocumenter; import opennlp.ccg.grammardoc.DocumenterException; import opennlp.ccg.grammardoc.DocumenterSourceException; import opennlp.ccg.grammardoc.SourceGrammar; import opennlp.ccg.grammardoc.SourceGrammarFile; import opennlp.ccg.grammardoc.SourceGrammarFileType; /** * A grammardoc documenter that produces HTML documentation. * * @author Scott Martin (http://www.ling.osu.edu/~scott/) * @version $Revision: 1.9 $ */ public class HTMLDocumenter extends AbstractDocumenter implements URIResolver { private static final int FILE_BUFFER_SIZE = 256; private SourceGrammar sourceGrammar; private Map templateCache; final TransformerFactory factory = TransformerFactory.newInstance(); static enum FileName { STYLESHEET("grammardoc.css"), LEXICON_SCRIPT("lexicon.js"); final String name; private FileName(String name) { this.name = name; } } /** * Creates a new HTML Documenter. */ public HTMLDocumenter() { super("GrammarDoc HTML Documenter"); templateCache = new HashMap(); factory.setURIResolver(this); } /** * Documents a source grammar, producing linked HTML files for its source * files. */ public void document(SourceGrammar grammar) throws DocumenterException { this.sourceGrammar = grammar; File destDir = documenterContext.getDestinationDirectory(); copyFiles(destDir); String sections; StringBuilder sb = new StringBuilder(); for(SourceGrammarFileType fileType : grammar.getSourceGrammarFileTypes()) { if(sb.length() > 0) { sb.append('|'); } sb.append(fileType.getFileName()); } sections = sb.toString(); for(SourceGrammarFileType fileType : grammar .getSourceGrammarFileTypes()) { String baseName = fileType.getFileName(); StringBuilder fb = new StringBuilder(); fb.append(baseName.equals( SourceGrammarFileType.GRAMMAR.getFileName()) ? 
"index" : baseName); fb.append(".html"); String targetName = fb.toString(); SourceGrammarFile sourceFile = grammar.getSourceGrammarFile(fileType); Templates templates = loadTemplates(baseName); if(templates != null) { documenterContext.log("Generating " + targetName); try { File f = new File(destDir, targetName); StreamResult res = new StreamResult( new BufferedOutputStream(new FileOutputStream(f))); res.setSystemId(f); Transformer transformer = templates.newTransformer(); transformer.setURIResolver(this); transformer.setParameter("sections", sections); transformer.transform( new StreamSource(sourceFile.getSourceFile()), res); } catch(TransformerException te) { throw new DocumenterSourceException( "problem transforming output: " + te.getMessageAndLocation(), te, sourceFile); } catch(IOException ioe) { throw new DocumenterException(ioe); } } } } /** * Resolves URIs to sources. Used by the XSLT files in this package to * resolve xsl:import and document() URIs. */ public Source resolve(String href, String base) throws TransformerException { StreamSource ss = null; if(href != null && href.length() > 0) { if(href.endsWith(".xsl")) { ss = new StreamSource(getResource(href)); } else { File f = new File(sourceGrammar.getSourceDirectory(), href); if(!f.exists()) { throw new TransformerException("file does not exist: " + f); } if(f.isDirectory()) { throw new TransformerException("file is a directory: " + f); } ss = new StreamSource(f); ss.setSystemId(f); } } return ss; } private Templates loadTemplates(String baseName) throws DocumenterException { StringBuilder tb = new StringBuilder(baseName); tb.append(".xsl"); String templateName = tb.toString(); if(!templateCache.containsKey(templateName)) { InputStream is = getResource(templateName.toString()); if(is == null) { return null; } try { // cache for later templateCache.put(templateName, factory.newTemplates(new StreamSource(is))); } catch(TransformerConfigurationException tce) { throw new DocumenterException("problem loading template " + templateName.toString() + ": " + tce.getMessageAndLocation(), tce); } } return templateCache.get(templateName); } private void copyFiles(File destDir) throws DocumenterException { for(FileName fileName : FileName.values()) { doCopyFile(fileName, destDir); } } private void doCopyFile(FileName fileName, File destDir) throws DocumenterException { InputStream in = getResource(fileName.name); if(in == null) { throw new DocumenterException("Could not find " + fileName.name); } File f = new File(destDir, fileName.name); documenterContext.log("Writing " + f.getAbsolutePath()); try { FileOutputStream fileOut = new FileOutputStream(f); byte[] buffer = new byte[HTMLDocumenter.FILE_BUFFER_SIZE]; int i; while((i = in.read(buffer)) != -1) { fileOut.write(buffer, 0, i); } in.close(); fileOut.close(); } catch(IOException ioe) { throw new DocumenterException("problem copying file: " + ioe.getMessage(), ioe); } } private InputStream getResource(String resourceName) { Class cl = getClass(); String cn = cl.getName(); String pkg = cn.substring(0, cn.lastIndexOf('.')); StringBuilder sb = new StringBuilder(); sb.append(pkg.replace('.', '/')); sb.append('/'); sb.append(resourceName); return cl.getClassLoader().getResourceAsStream(sb.toString()); } } ================================================ FILE: src/opennlp/ccg/grammardoc/html/base.xsl ================================================ grammar lexicon morph types rules <xsl:text>Documentation for CCG grammar </xsl:text> <xsl:value-of select="$grammar-name"/> <xsl:if 
test="not($page-name='grammar')"> <xsl:text> : </xsl:text> <xsl:call-template name="capitalize"> <xsl:with-param name="word" select="$page-name"/> </xsl:call-template> </xsl:if> Generated documentation for the CCG grammar

================================================ FILE: src/opennlp/ccg/grammardoc/html/categories.xsl ================================================
[categories.xsl: XSLT markup not preserved in this extract; the surviving text nodes are the punctuation used to render categories and the slash-mode labels ("slash", "redundant", "application only", "associative", "permutative", "permutative right", "permutative left", "associative permutative right", "associative permutative left", "all rules").]
================================================ FILE: src/opennlp/ccg/grammardoc/html/comments.xsl ================================================
================================================ FILE: src/opennlp/ccg/grammardoc/html/grammar.xsl ================================================ ================================================ FILE: src/opennlp/ccg/grammardoc/html/grammardoc.css ================================================ /* * $Id: grammardoc.css,v 1.8 2007/04/18 22:54:13 coffeeblack Exp $ * Author: Scott Martin (http://www.ling.osu.edu/~scott/) */ body { margin: 0 0 2em 0; padding: 0; background-color: rgb(250,250,250); color: rgb(25,25,25); font-family: Verdana, Arial, Helvetica, sans-serif; font-size: small; } a:hover { text-decoration: none; } acronym { border-bottom: 1px dotted; cursor: help; } div.back { text-align: right; } div#container { margin: 0 20px; clear: left; } h1, h2, h3, h4, h5 { font-family: Trebuchet MS, Arial, Helvetica, sans-serif; } h1 { margin: 0 0 1em 0; border-bottom: 1px dotted rgb(150,150,150); padding: 2em 0 .5em 20px; background-color: rgb(153,0,0); color: rgb(250,250,250); font-size: 1.8em; } h2 { margin: 2em 0 1em 0; border-bottom: 1px dotted rgb(160,160,160); padding-bottom: .25em; color: rgb(153,0,0); font-size: 1.6em; } h3 { color: rgb(80,0,0); font-size: 1.4em; } h4 { font-size: 1.2em; } h5 { font-size: 1.1em; } ul { padding-left: 40px; } dl.box { border: 1px solid rgb(150,150,150); background-color: rgb(235,235,235); padding: 10px; } dl.box dt { color: rgb(153,51,0); font-family: Trebuchet MS, Arial, Helvetica, sans-serif; font-size: 1.1em; font-weight: bold; } dl.box dd, dl.attributes dd { margin-left: 0; } dl.box dd li, ul.members li { color: rgb(153,51,0); } ol.categories li { /*font-size: 1.2em;*/ } dl.category dt { font-size: 1.1em; } dl.category, dl.category dt, dl.category dd, dl.category dd dl { display: inline; } dl.attributes dt { float: left; margin-right: .25em; color: rgb(153,51,0); font-weight: bold; } dl.attributes dd { color: rgb(153,85,50); } dl.category dt { color: rgb(80,0,0); } dl.feature-structure { font-size: .85em; } dl.category dd, dl.feature-structure dt, span.ruleLabel { color: rgb(153,85,50); font-family: Georgia, "Times New Roman", serif; } dl.feature-structure dt { font-family: Verdana, Arial, Helvetica, sans-serif; font-weight: normal; } dl.feature-structure dt, dl.feature-structure dd { display: none; } span.enclosure, span.divider { font-size: 1.1em; font-weight: bold; } dl.feature-structure dd span.enclosure { font-style: normal; } ol.categories li a { margin-right: 10px; /*font-size: .85em;*/ text-decoration: none; } .expanded dl.feature-structure dt, .expanded dl.feature-structure dd { display: inline; } dl.category dd, span.ruleLabel { vertical-align: sub; } dl.category dd.category-container { vertical-align: baseline; } dd.category-container dl.category { /*font-size: 1.1em;*/ } dl.category dd.feat { font-style: italic; } dt.slash span.mode { /*font-size: .7em;*/ } dl.category dt.redundant { display: none; } dl.lf { display: inline; } dl.lf dd.nomvar, dl.lf span.prop { font-weight: normal; } dl.lf dd.nomvar { font-style: italic; } dl.category span.prop { font-weight: bold; } dl.lf span.diamond, dl.lf span.prop { color: rgb(153,85,50); } dl.entries dl dt { margin-top: 1em; } dl.entries dl dd { margin-left: 20px; } ul.rules { padding-left: 20px; list-style-type: none; } ul.rules li { padding-top: .5em; } ul#navigation { margin-left: 20px; padding-bottom: 2em; padding-left: 0; list-style-type: none; } ul#navigation li a { float: left; margin-right: 10px; border: 1px solid rgb(150,150,150); background-color: rgb(235,235,235); color: 
rgb(153,0,0); padding: 5px; text-decoration: none; font-weight: bold; } ul#navigation li a.current, ul#navigation li a:hover, ul#navigation li a:active { background-color: rgb(153,0,0); border: 1px dotted rgb(150,150,150); color: rgb(255,255,255); } a.unaryRuleExpander { float: left; margin-right: 5px; text-decoration: none; } dd.unaryResult { display: inline; } dl.unaryRule { /*font-size: 1.1em !important;*/ color: rgb(153,51,0); } dl.unaryRule span.arrow { color: rgb(153,85,50); font-weight: bold; } ul.short { list-style-type: none; } ul.short li { display: inline; } dl.shortcuts ul.short li { color: rgb(25,25,25); } ul.short li + li:before { content: ","; } ================================================ FILE: src/opennlp/ccg/grammardoc/html/lexicon.js ================================================ /* * $Id: lexicon.js,v 1.2 2006/12/13 19:25:22 coffeeblack Exp $ * Author: Scott Martin (http://www.ling.osu.edu/~scott/) */ function toggleFeatures(elem) { elem.className = (elem.className == "expanded") ? "" : "expanded"; var anchors = elem.getElementsByTagName("a"); anchors[0].innerHTML = (elem.className == "expanded") ? "[-]" : "[+]"; anchors[0].setAttribute("title", ((elem.className == "expanded") ? "collapse" : "expand") + " feature structures"); } ================================================ FILE: src/opennlp/ccg/grammardoc/html/lexicon.xsl ================================================
Lexical Families
Closed: false
Part of Speech:
Lexical Items
Members
Categories
================================================ FILE: src/opennlp/ccg/grammardoc/html/morph.xsl ================================================
Entries
Macros
Member of:
Feature Structures
Logical Form
================================================ FILE: src/opennlp/ccg/grammardoc/html/navigation.xsl ================================================
grammar|lexicon|morph|rules|types
current index .html
================================================ FILE: src/opennlp/ccg/grammardoc/html/rules.xsl ================================================
List of Rules
Application Rules
Composition Rules
Substitution Rules
Typeraising Rules
Typechanging
================================================ FILE: src/opennlp/ccg/grammardoc/html/types.xsl ================================================
Type Hierarchy
  • ================================================ FILE: src/opennlp/ccg/hylo/Alt.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-6 Michael White (University of Edinburgh, The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import java.io.Serializable; import java.util.BitSet; /** * Class for representing alts. * LF alts are used during realization to represent * exclusive disjunctions in the input. * The alts are represented as pairs of ints, * one for the alt set and one for the alt within the set. * The alts are numbered starting with 0. * An alt has a bitset for the elementary predications * within the alt. * * @author Michael White * @version $Revision: 1.6 $, $Date: 2009/07/17 04:23:30 $ */ public final class Alt implements Comparable, Serializable { private static final long serialVersionUID = 7241395629445814238L; /** The alt set number. */ public final int altSet; /** The alt within the set. */ public final int numInSet; /** The bitset. */ public final BitSet bitset = new BitSet(); /** Constructor. */ public Alt(int altSet, int numInSet) { this.altSet = altSet; this.numInSet = numInSet; } /** Equals. */ public boolean equals(Object o) { if (!(o instanceof Alt)) return false; Alt a = (Alt) o; return altSet == a.altSet && numInSet == a.numInSet; } /** Comparison. */ public int compareTo(Alt a) { if (altSet < a.altSet) return -1; if (altSet == a.altSet && numInSet < a.numInSet) return -1; if (altSet == a.altSet && numInSet == a.numInSet) return 0; return 1; } } ================================================ FILE: src/opennlp/ccg/hylo/Box.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; /** * A modal box operator, such as [F]q. * * @author Jason Baldridge * @version $Revision: 1.5 $, $Date: 2009/07/17 04:23:30 $ **/ public final class Box extends ModalOp { private static final long serialVersionUID = 1575311851235814524L; public Box(Element e) { super(e); } private Box(Mode mode, LF arg) { super(mode, arg); } public LF copy() { return new Box ((Mode)_mode.copy(), _arg.copy()); } public boolean equals(Object o) { if (o instanceof Box) { return super.equals((Box)o); } else { return false; } } public void unifyCheck(Object u) throws UnifyFailure { if (u instanceof Box) { super.unifyCheck((Box)u); } else { throw new UnifyFailure(); } } public Object fill(Substitution sub) throws UnifyFailure { return new Box((Mode)_mode.fill(sub), (LF)_arg.fill(sub)); } /** Returns the string form of this modal op, without the arg. */ public String modalOpString() { return new StringBuffer().append('[').append(_mode.toString()).append(']').toString(); } /** * Returns an XML representation of this LF (not currently supported). * Throws a runtime exception. */ public Element toXml() { throw new RuntimeException("toXml() not currently supported for Box."); } } ================================================ FILE: src/opennlp/ccg/hylo/Compacter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-6 Michael White (University of Edinburgh, The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.util.*; import java.util.*; /** * A class implementing compaction of flattened LFs. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2006/09/04 14:09:10 $ **/ public class Compacter { /** * Returns a compacted LF from the given flattened one. * A root nominal may also be given (otherwise null). * Nominals with multiple parents are kept separate. * If there are any duplicate predications, an attempt * is made to attach them in different locations. 
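* For example, flattened preds such as @x1(want), @x1(<Arg1>x2) and @x2(book) are recombined into the single term @x1(want ^ <Arg1>(x2 ^ book)).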
*/ static LF compact(LF lf, Nominal root) { // get preds, make copies List preds = HyloHelper.getPreds(lf); for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); preds.set(i, (SatOp) pred.copy()); } // check for single pred if (preds.size() == 1) return preds.get(0); // find unique parents and multiple parents Map parents = new HashMap(); GroupMap multipleParents = new GroupMap(); for (int i = 0; i < preds.size(); i++) { SatOp pred = preds.get(i); // get principal nominal as nom1 Nominal nom1 = HyloHelper.getPrincipalNominal(pred); // get secondary nominal Nominal nom2 = HyloHelper.getSecondaryNominal(pred); // skip if none or nom2 equal to root if (nom2 == null) continue; if (root != null && nom2.equals(root)) continue; // if nom2 already in group map, add nom1 as another parent if (multipleParents.containsKey(nom2)) { multipleParents.put(nom2, nom1); } // if nom2 already in parent map, add existing parent and nom1 to group map, // record pred, then remove nom2 from parent map else if (parents.containsKey(nom2)) { multipleParents.put(nom2, parents.get(nom2)); multipleParents.put(nom2, nom1); parents.remove(nom2); } // otherwise put in nom1 as parent else { parents.put(nom2, nom1); } } // check multiple parent nominals for cycles int prevSize = -1; List history = new ArrayList(); while (multipleParents.size() != prevSize) { prevSize = multipleParents.size(); for (Iterator it = multipleParents.keySet().iterator(); it.hasNext(); ) { Nominal nom = it.next(); Set nomParents = multipleParents.get(nom); for (Iterator it2 = nomParents.iterator(); it2.hasNext(); ) { Nominal parent = it2.next(); history.clear(); history.add(nom); while (parent != null && !history.contains(parent)) { history.add(parent); parent = parents.get(parent); } // remove if cycle found if (parent != null) it2.remove(); } // switch to single parent if others removed if (nomParents.size() == 1) { Nominal parent = nomParents.iterator().next(); parents.put(nom, parent); it.remove(); } } } // break any remaining cycles in parent relationships for (Iterator it = parents.keySet().iterator(); it.hasNext(); ) { Nominal nom = it.next(); Nominal parent = parents.get(nom); history.clear(); history.add(nom); while (parent != null && !history.contains(parent)) { history.add(parent); parent = parents.get(parent); } if (parent != null) { it.remove(); } } // ensure sorted HyloHelper.sort(preds); // combine preds on same nominal // also: gather any duplicate preds List combinedPreds = new ArrayList(preds.size()); List dupPreds = new ArrayList(preds.size()); SatOp currentSatOp = preds.get(0); Nominal currentNominal = currentSatOp.getNominal(); combinedPreds.add(currentSatOp); for (int i = 1; i < preds.size(); i++) { SatOp satOp = preds.get(i); // skip if equal to previous, saving in dupPreds if (satOp.equals(preds.get(i-1))) { dupPreds.add(satOp); continue; } // check for different nominal Nominal nominal = satOp.getNominal(); if (!nominal.equals(currentNominal)) { // add to combined preds, update current refs currentSatOp = satOp; currentNominal = nominal; combinedPreds.add(currentSatOp); } // otherwise combine else { combine(currentSatOp, satOp); } } // compact preds with unique parent for (int i = 0; i < combinedPreds.size(); i++) { SatOp satOp1 = combinedPreds.get(i); Nominal nom1 = satOp1.getNominal(); if (!parents.containsValue(nom1)) continue; for (int j = 0; j < combinedPreds.size(); j++) { SatOp satOp2 = combinedPreds.get(j); Nominal nom2 = satOp2.getNominal(); if (nom1.equals(nom2)) continue; if 
(!parents.containsKey(nom2)) continue; if (nom1.equals(parents.get(nom2))) { subst(satOp1, satOp2, nom2, null); } } } // get root nominals, root preds, and multiple parent preds List roots = new ArrayList(); List rootPreds = new ArrayList(); List multipleParentPreds = new ArrayList(); for (int i = 0; i < combinedPreds.size(); i++) { SatOp pred = combinedPreds.get(i); Nominal nom = pred.getNominal(); if (!parents.containsKey(nom) && !multipleParents.containsKey(nom)) { roots.add(nom); rootPreds.add(pred); } if (multipleParents.containsKey(nom)) { multipleParentPreds.add(pred); } } // compact preds with multiple parents, using parent that is closest to a root prevSize = -1; while (multipleParentPreds.size() != prevSize) { prevSize = multipleParentPreds.size(); // for each nominal with multiple parents for (Iterator it = multipleParentPreds.iterator(); it.hasNext(); ) { SatOp pred = it.next(); Nominal nom = pred.getNominal(); // find parent closest to root, but checking for a parent not below a root Set nomParents = multipleParents.get(nom); Nominal parentClosestToRoot = null; int closestDist = 0; int closestRootIndex = -1; for (Iterator it2 = nomParents.iterator(); it2.hasNext(); ) { Nominal parent = it2.next(); int dist = 0; // trace parents to top ancestor Nominal topAncestor = parent; while (parents.containsKey(topAncestor)) { topAncestor = parents.get(topAncestor); dist++; } // if top ancestor a root, update closest parent if (roots.contains(topAncestor)) { if (parentClosestToRoot == null || dist < closestDist) { parentClosestToRoot = parent; closestDist = dist; closestRootIndex = roots.indexOf(topAncestor); } } // otherwise set closest dist to -1, to indicate that not all ancestors are roots else { closestDist = -1; } } // check for a parent not below a root, or no closest root, and skip this nom if so if (closestDist == -1 || closestRootIndex == -1) { continue; } // otherwise compact under root pred of parent closest to root SatOp closestRootPred = rootPreds.get(closestRootIndex); subst(closestRootPred, pred, nom, parentClosestToRoot); // update parents map parents.put(nom, parentClosestToRoot); // and remove from iterator it.remove(); } } // set retval to single remaining pred or conjunction of remaining ones LF retval; List retPreds = new ArrayList(); retPreds.addAll(rootPreds); retPreds.addAll(multipleParentPreds); if (retPreds.size() == 1) { retval = retPreds.get(0); } else { retval = new Op(Op.CONJ, retPreds); } // tmp for (SatOp dup : dupPreds) { Nominal nom = dup.getNominal(); Nominal dupParent = findDupParent(retval, dup, nom); subst(retval, dup, nom, dupParent); } // return return retval; } // combines two preds for the same nominal into the first pred, // where either both preds are elementary, // or the first is the result of an earlier combination private static void combine(SatOp satOp1, SatOp satOp2) { // get args LF arg1 = satOp1.getArg(); LF arg2 = satOp2.getArg(); // check if arg1 already conj op if (arg1 instanceof Op && ((Op)arg1).getName().equals(Op.CONJ)) { List args = ((Op)arg1).getArguments(); args.add(arg2); } // or make it one else { List args = new ArrayList(2); args.add(arg1); args.add(arg2); satOp1.setArg(new Op(Op.CONJ, args)); } } // substitutes the second satop into the first lf at nom2, optionally // respecting the given parent constraint (if non-null) // returns whether the substitution has been made private static boolean subst(LF lf, SatOp satOp2, Nominal nom2, Nominal requiredParent) { return subst(lf, null, satOp2, nom2, requiredParent); } // 
recursive implementation that tracks the current parent and // returns whether the substitution has been made private static boolean subst(LF lf, Nominal currentParent, SatOp satOp2, Nominal nom2, Nominal requiredParent) { // recurse to nom2, then append if requiredParent constraint met if (lf instanceof SatOp) { SatOp satOp = (SatOp) lf; return subst(satOp.getArg(), satOp.getNominal(), satOp2, nom2, requiredParent); } else if (lf instanceof Diamond) { Diamond d = (Diamond) lf; LF arg = d.getArg(); // check for nom2, and that requiredParent constraint met if (arg.equals(nom2) && (requiredParent == null || requiredParent.equals(currentParent))) { // make substitution d.setArg(HyloHelper.append(arg, satOp2.getArg())); return true; } else { return subst(arg, currentParent, satOp2, nom2, requiredParent); } } else if (lf instanceof Op) { Op op = (Op) lf; List args = op.getArguments(); for (int i = 0; i < args.size(); i++) { LF arg = args.get(i); if (arg instanceof Nominal) { // check for nom2, and that requiredParent constraint met if (arg.equals(nom2) && (requiredParent == null || requiredParent.equals(currentParent))) { // make substitution // nb: this (rarely used) operation doesn't nec. preserve the sort order, unfortunately op.appendArgs(satOp2.getArg()); return true; } // otherwise, set current parent and continue else { currentParent = (Nominal) arg; continue; } } boolean madeSubst = subst(arg, currentParent, satOp2, nom2, requiredParent); if (madeSubst) return true; } } return false; } // returns a parent nominal where the duplicate pred can be substituted // if there is no equivalent pred there already; otherwise returns null private static Nominal findDupParent(LF lf, SatOp dup, Nominal dupNom) { return findDupParent(lf, null, dup, dupNom); } // recursive implementation that tracks the current parent private static Nominal findDupParent(LF lf, Nominal currentParent, SatOp dup, Nominal dupNom) { // recurse to dupNom, then return parent if apropos if (lf instanceof SatOp) { SatOp satOp = (SatOp) lf; return findDupParent(satOp.getArg(), satOp.getNominal(), dup, dupNom); } else if (lf instanceof Diamond) { Diamond d = (Diamond) lf; LF arg = d.getArg(); // check for dupNom by itself, and return parent if (arg.equals(dupNom)) return currentParent; else return findDupParent(arg, currentParent, dup, dupNom); } else if (lf instanceof Op) { Op op = (Op) lf; List args = op.getArguments(); for (int i = 0; i < args.size(); i++) { LF arg = args.get(i); if (arg instanceof Nominal) { // check for dupNom, and that no equiv pred constraint met if (arg.equals(dupNom) && !args.contains(dup.getArg())) // return parent return currentParent; // otherwise, set current parent and continue else { currentParent = (Nominal) arg; continue; } } Nominal retval = findDupParent(arg, currentParent, dup, dupNom); if (retval != null) return retval; } } return null; } } ================================================ FILE: src/opennlp/ccg/hylo/Converter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-9 Michael White (University of Edinburgh, The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.TextCCG; import opennlp.ccg.synsem.*; import java.util.*; import java.util.prefs.Preferences; /** * A class implementing conversion of nominal variables to nominal atoms. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2010/08/31 16:20:43 $ **/ public class Converter { /** Preference key for using word positions to name atoms. */ public static final String USE_WORD_POSITIONS_FOR_ATOM_CONVERSION = "Use Word Positions To Convert Atoms"; // map to already converted nominals private Map nominalMap = new HashMap(); // map to int for names private Map nameMap = new HashMap(); // flag for whether to skip absent props private boolean skipAbsentProp = true; /** Converts nominal vars to atoms, renaming them based on lexical propositions. */ static void convertNominals(LF lf) { convertNominals(lf, null, null); } /** * Converts nominal vars to atoms, renaming them based on word position, if * a root sign is given, otherwise using lexical propositions; * returns the converted nominal root. */ static Nominal convertNominals(LF lf, Sign root, Nominal nominalRoot) { // check preference for naming with word positions; set root to null if false Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); boolean useWordPositions = prefs.getBoolean(USE_WORD_POSITIONS_FOR_ATOM_CONVERSION, true); if (!useWordPositions) root = null; // traverse twice, skipping absent props the first time Converter converter = new Converter(); converter.convertNoms(lf, root); converter.skipAbsentProp = false; converter.convertNoms(lf, root); // return converted nominal root, if any Nominal retval = null; if (nominalRoot != null) { retval = converter.nominalMap.get(nominalRoot); } return retval; } // recurse through lf, converting nominals private void convertNoms(LF lf, Sign root) { if (lf instanceof SatOp) { SatOp satOp = (SatOp) lf; // try finding word index of lex origin in root sign int wordIndex = -1; if (root != null) { LexSemOrigin origin = satOp.getOrigin(); if (origin instanceof Sign) { Sign lexSign = (Sign) origin; // make sure it's not dominated by another lex pred // nb: also need to check for special pred 'elem', which isn't // dominated in sample flights grammar String lexPred = HyloHelper.getLexPred(satOp); if (lexPred != null && !lexPred.equals("elem")) { if (!lexDominated(lexPred, lexSign)) wordIndex = root.wordIndex(lexSign); } } } Nominal oldNom = satOp.getNominal(); Proposition prop = null; LF arg = satOp.getArg(); if (arg instanceof Proposition) { prop = (Proposition) arg; } else if (arg instanceof Op) { Op op = (Op) arg; LF first = (LF) op.getArguments().get(0); if (first instanceof Proposition) { prop = (Proposition) first; } } Nominal convertedNom = convertNominal(oldNom, prop, wordIndex); satOp.setNominal(convertedNom); convertNoms(arg, root); } else if (lf instanceof Diamond) { Diamond d = (Diamond) lf; LF arg = d.getArg(); if (arg instanceof Nominal) { Nominal oldNom = (Nominal) arg; Nominal 
convertedNom = convertNominal(oldNom, null, -1); d.setArg(convertedNom); } else if (arg instanceof Op) { Op op = (Op) arg; List args = op.getArguments(); LF first = args.get(0); if (first instanceof Nominal) { Nominal oldNom = (Nominal) first; LF second = args.get(1); Proposition prop = null; if (second instanceof Proposition) { prop = (Proposition) second; } Nominal convertedNom = convertNominal(oldNom, prop, -1); args.set(0, convertedNom); } convertNoms(arg, root); } } else if (lf instanceof Op) { List args = ((Op)lf).getArguments(); for (int i = 0; i < args.size(); i++) { convertNoms(args.get(i), root); } } } // returns a nominal atom based on the old nominal, prop and maps, // which are updated accordingly; // wordIndex is used instead if non-negative; // the skipAbsentProp flag controls whether to skip a null prop, // so that a meaningful name might be created later private Nominal convertNominal(Nominal oldNom, Proposition prop, int wordIndex) { // check for an atom if (oldNom instanceof NominalAtom) return oldNom; // handle word index case if (wordIndex >= 0) return convertNominal(oldNom, "w" + wordIndex); // skip absent props according to flag if (prop == null && skipAbsentProp) return oldNom; // check if already converted, and return copy Nominal alreadyConvertedNom = nominalMap.get(oldNom); if (alreadyConvertedNom != null) { return (Nominal) alreadyConvertedNom.copy(); } // otherwise create new atom, with name based on prop (if possible) String nameBase = "x"; if (prop != null) { nameBase = prop.toString().toLowerCase().substring(0,1); // use "n" if not a letter if (!Character.isLetter(nameBase.charAt(0))) nameBase = "n"; } int ext = 1; Integer baseCount = nameMap.get(nameBase); if (baseCount != null) { ext = baseCount.intValue() + 1; } nameMap.put(nameBase, new Integer(ext)); String name = nameBase + ext; return convertNominal(oldNom, name); } // returns the converted nominal using the given name, updating the map private Nominal convertNominal(Nominal oldNom, String name) { Nominal retval = new NominalAtom(name, oldNom.getType()); nominalMap.put(oldNom, retval); return retval; } //--------------------------------------------------------------------------- // check for dominating lex pred // // returns true if the EP for the lexPred is dominated by another lex pred private static boolean lexDominated(String lexPred, Sign lexSign) { Category cat = lexSign.getCategory(); LF lf = cat.getLF(); Nominal index = cat.getIndexNominal(); List preds = HyloHelper.getPreds(lf); // find EP with lexPred, other lex preds SatOp lexEP = null; List otherLexPreds = new ArrayList(); for (SatOp pred : preds) { if (HyloHelper.isLexPred(pred)) { if (lexPred.equals(HyloHelper.getLexPred(pred))) lexEP = pred; else otherLexPreds.add(pred); } } if (lexEP == null) { throw new RuntimeException("Couldn't find lexPred: " + lexPred); } // check domination Nominal lexNom = HyloHelper.getPrincipalNominal(lexEP); for (SatOp pred : otherLexPreds) { Nominal otherNom = HyloHelper.getPrincipalNominal(pred); Stack seen = new Stack(); seen.push(index); // don't recurse through index nominal if (dominates(otherNom, lexNom, preds, seen)) return true; } // otherwise false return false; } // returns true if a dominates b in preds, using seen stack to avoid looping private static boolean dominates(Nominal a, Nominal b, List preds, Stack seen) { // check for identity if (a.equals(b)) return false; // push a to seen noms seen.push(a); // check relations for (SatOp pred : preds) { if 
(a.equals(HyloHelper.getPrincipalNominal(pred))) { Nominal c = HyloHelper.getSecondaryNominal(pred); if (c == null) continue; // check immed dominance if (b.equals(c)) return true; // found dominance! // check seen if (seen.contains(c)) continue; // recurse if (dominates(c, b, preds, seen)) return true; } } // otherwise not; pop a and return seen.pop(); return false; } //--------------------------------------------------------------------------- // convert nominal atoms back to vars // /** Converts nominal atoms back to vars. */ static void convertNominalsToVars(List preds) { convertNominalsToVars(preds, null); } /** * Converts nominal atoms back to vars, returning the converted nominal root. */ static Nominal convertNominalsToVars(List preds, Nominal nominalRoot) { Nominal retval = null; for (SatOp pred : preds) { Nominal nom = pred._nominal; Nominal nv = convertNominalToVar(nom); if (nom.equals(nominalRoot)) retval = nv; pred.setNominal(nv); LF arg = pred.getArg(); if (arg instanceof Diamond) { Diamond dArg = (Diamond) arg; LF arg2 = dArg.getArg(); if (arg2 instanceof Nominal) { Nominal nv2 = convertNominalToVar((Nominal)arg2); dArg.setArg(nv2); } } } return retval; } // returns a nominal var with the same name as the given nominal static Nominal convertNominalToVar(Nominal nom) { return new NominalVar(nom.getName().toUpperCase(), nom.getType()); } } ================================================ FILE: src/opennlp/ccg/hylo/Diamond.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; /** * A modal diamond operator, such as <P>p. 
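* The argument of a diamond is typically a proposition (a semantic feature value) or a nominal (the target of a dependency relation).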
* * @author Jason Baldridge * @author Michael White * @version $Revision: 1.6 $, $Date: 2009/07/17 04:23:30 $ **/ public final class Diamond extends ModalOp { private static final long serialVersionUID = 543211908001651361L; public Diamond(Element e) { super(e); } public Diamond(Mode mode, LF arg) { super(mode, arg); } public LF copy() { return new Diamond ((Mode)_mode.copy(), _arg.copy()); } public boolean equals(Object o) { if (o instanceof Diamond) { return super.equals((Diamond)o); } else { return false; } } public void unifyCheck(Object u) throws UnifyFailure { if (u instanceof Diamond) { super.unifyCheck((Diamond)u); } else { throw new UnifyFailure(); } } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof HyloFormula) { if (u instanceof Diamond) { Mode $mode = (Mode) Unifier.unify(_mode, ((Diamond)u)._mode, sub); LF $arg = (LF) Unifier.unify(_arg,((Diamond)u)._arg, sub); return new Diamond($mode, $arg); } else return super.unify(u,sub); } else { throw new UnifyFailure(); } } public Object fill(Substitution sub) throws UnifyFailure { return new Diamond((Mode)_mode.fill(sub), (LF)_arg.fill(sub)); } /** Returns the string form of this modal op, without the arg. */ public String modalOpString() { return new StringBuffer().append('<').append(_mode.toString()).append('>').toString(); } /** * Returns an XML representation of this LF. */ public Element toXml() { Element retval = new Element("diamond"); retval.setAttribute("mode", _mode.toString()); Element argElt = _arg.toXml(); retval.addContent(argElt); return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/EPsScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Michael White (The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.util.Pair; import java.text.NumberFormat; import java.util.*; /** * A class implementing LF scoring in terms of recall and precision * of elementary predications. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2010/11/15 03:21:12 $ **/ public class EPsScorer { /** * Class for scoring results. 
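* Holds recall, precision and f-score over all elementary predications, plus the same measures restricted to labeled and to unlabeled dependencies.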
*/ public static class Results { // results public double recall = 0.0; public double precision = 0.0; public double fscore = 0.0; public double depsRecall = 0.0; public double depsPrecision = 0.0; public double depsFscore = 0.0; public double unlabeledDepsRecall = 0.0; public double unlabeledDepsPrecision = 0.0; public double unlabeledDepsFscore = 0.0; // display public String toString() { return "fscore: " + nf.format(fscore) + " recall: " + nf.format(recall) + " precision: " + nf.format(precision) + " deps fscore: " + nf.format(depsFscore) + " deps recall: " + nf.format(depsRecall) + " deps precision: " + nf.format(depsPrecision) + " unlabeled deps fscore: " + nf.format(unlabeledDepsFscore) + " unlabeled deps recall: " + nf.format(unlabeledDepsRecall) + " unlabeled deps precision: " + nf.format(unlabeledDepsPrecision); } // formats to four decimal places private static final NumberFormat nf = initNF(); private static NumberFormat initNF() { NumberFormat f = NumberFormat.getInstance(); f.setMinimumIntegerDigits(1); f.setMinimumFractionDigits(1); f.setMaximumFractionDigits(4); return f; } } /** * Returns the results of scoring an LF against a gold LF. */ public static Results score(LF lf, LF goldLF) { // get EPs List eps = HyloHelper.flatten(lf); List goldEPs = HyloHelper.flatten(goldLF); Set epsSet = new HashSet(eps); Set goldEPsSet = new HashSet(goldEPs); // get unlabeled deps Set> unlabeledDepsSet = new HashSet>(); Set> goldUnlabeledDepsSet = new HashSet>(); for (SatOp ep : eps) { Pair dep = getDep(ep); if (dep != null) unlabeledDepsSet.add(dep); } for (SatOp ep : goldEPs) { Pair dep = getDep(ep); if (dep != null) goldUnlabeledDepsSet.add(dep); } // calc recall Results retval = new Results(); int recalled = 0, depsRecalled = 0, unlabeledDepsRecalled = 0; int goldDeps = goldUnlabeledDepsSet.size(); for (SatOp ep : goldEPs) { boolean isdep = HyloHelper.isRelPred(ep); if (epsSet.contains(ep)) { recalled++; if (isdep) depsRecalled++; } if (isdep && unlabeledDepsSet.contains(getDep(ep))) unlabeledDepsRecalled++; } retval.recall = 1.0 * recalled / goldEPs.size(); retval.depsRecall = (goldDeps > 0) ? 1.0 * depsRecalled / goldDeps : 1.0; retval.unlabeledDepsRecall = (goldDeps > 0) ? 1.0 * unlabeledDepsRecalled / goldDeps : 1.0; // calc precision int precise = 0, depsPrecise = 0, unlabeledDepsPrecise = 0; int lfDeps = unlabeledDepsSet.size(); for (SatOp ep : eps) { boolean isdep = HyloHelper.isRelPred(ep); if (goldEPsSet.contains(ep)) { precise++; if (isdep) depsPrecise++; } if (isdep && goldUnlabeledDepsSet.contains(getDep(ep))) unlabeledDepsPrecise++; } retval.precision = 1.0 * precise / eps.size(); retval.depsPrecision = (lfDeps > 0) ? 1.0 * depsPrecise / lfDeps : 1.0; retval.unlabeledDepsPrecision = (lfDeps > 0) ? 
1.0 * unlabeledDepsPrecise / lfDeps : 1.0; // calc f-score retval.fscore = fscore(retval.recall, retval.precision); retval.depsFscore = fscore(retval.depsRecall, retval.depsPrecision); retval.unlabeledDepsFscore = fscore(retval.unlabeledDepsRecall, retval.unlabeledDepsPrecision); // done return retval; } // returns an unlabeled dependency as a pair of nominals, or null if the ep is not relational private static Pair getDep(SatOp ep) { if (HyloHelper.isRelPred(ep)) { // put nominals in canonical order, so that direction of dependency doesn't matter Nominal n1 = HyloHelper.getPrincipalNominal(ep); Nominal n2 = HyloHelper.getSecondaryNominal(ep); if (n1.compareTo(n2) <= 0) return new Pair(n1, n2); else return new Pair(n2, n1); } else return null; } /** Calculates f-score as balanced harmonic mean of recall and precision. */ public static double fscore(double recall, double precision) { if (recall + precision == 0.0) return 0.0; return 2.0 * recall * precision / (recall + precision); } } ================================================ FILE: src/opennlp/ccg/hylo/EnglishAgreementExtractor.java ================================================ package opennlp.ccg.hylo; /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Rajakrishnan Rajkumar // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// import opennlp.ccg.lexicon.Word; import opennlp.ccg.perceptron.Alphabet; import opennlp.ccg.perceptron.FeatureExtractor; import opennlp.ccg.perceptron.FeatureMap; import opennlp.ccg.perceptron.FeatureVector; import opennlp.ccg.synsem.AtomCat; import opennlp.ccg.synsem.Category; import opennlp.ccg.synsem.ComplexCat; import opennlp.ccg.synsem.Sign; import opennlp.ccg.unify.FeatureStructure; import opennlp.ccg.unify.SimpleType; import opennlp.ccg.util.TrieMap; import opennlp.ccg.hylo.LexDependency; import java.util.*; /** * Class which extracts subject verb and relative pronoun agreement features for * English (described in): * * @InProceedings{rajkumar-white:2010:POSTERS, * author = {Rajkumar, Rajakrishnan and White, Michael}, * title = {Designing Agreement Features for Realization Ranking}, * booktitle = {Coling 2010: Posters}, * month = {August}, * year = {2010}, * address = {Beijing, China}, * publisher = {Coling 2010 Organizing Committee}, * pages = {1032--1040}, * url = {http://www.aclweb.org/anthology/C10-2119} * } * * The class extracts features based on the OpenCCG HLDS specific LF rels: ArgN (subject rel), whApposRel, GenRel, First, Next * * @author raja * @version $Revision: 1.11 $, $Date: 2011/11/25 18:18:33 $ */ public class EnglishAgreementExtractor implements FeatureExtractor{ /** Feature map wrapper, for unique retrieval from a sign's data objects. 
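* Wrapping the map in its own class gives it a unique class key, so it can be stored and fetched via the sign's addData/getData mechanism without clashing with other data objects.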
*/ public static class FeatureMapWrapper { public FeatureMap featureMap; public FeatureMapWrapper(FeatureMap featureMap) { this.featureMap = featureMap;} } /** Inner class to store specific properties of signs (right now for unbalanced punctuation status). */ private class SignProps{ //Store comma/dash unbalanced punctuation private String unbalPunct=null; /** Constructor to specify unbalanced punctuation. */ public SignProps(String unbalPunct){ this.unbalPunct=unbalPunct; } public String getUnbalancedPunct(){ return unbalPunct; } } /** The alphabet. */ protected Alphabet alphabet = null; /** Current feature map. */ protected FeatureMap currentMap = null; /** Head and dependent signs (For feature extraction) .*/ protected Sign headSign=null; protected Sign depSign=null; /** Error analysis related. */ //Sentence id String sentId=null; //Instance num int INSTANCENUM=0; /** Subject-verb agreement feature extractors. */ protected List>> agrExtractors = new ArrayList>>(); protected List>> agrConjExtractors = new ArrayList>>(); protected List>> agrOfComplementExtractors = new ArrayList>>(); /** WH-pronoun agreement feature extractors. */ protected List>> whExtractors = new ArrayList>>(); protected List>> whConjExtractors = new ArrayList>>(); /** Punctuation agreement feature extractors. */ protected List>> punctExtractor = new ArrayList>>(); /** Constructors. */ //Constructor used during actual perceptron training and testing public EnglishAgreementExtractor() { // init lazy feature extractors this.init(); } //Constructor used during error analysis using serialized signs public EnglishAgreementExtractor(String sentId) { //init lazy feature extractors this.sentId=sentId; this.INSTANCENUM=0; this.init(); } /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet) { this.alphabet = alphabet; } /** Initializes lazy feature extractors .*/ public void init() { //Agreement: Simple subj-verb feature extractors this.agrExtractors.add(dep_word_head_word(1)); this.agrExtractors.add(dep_word_head_pos(1)); this.agrExtractors.add(dep_pos_head_word(1)); this.agrExtractors.add(dep_pos_head_pos(1)); //Agreement: Disjunct subj feature extractors this.agrConjExtractors.add(dep_word_head_word(2)); this.agrConjExtractors.add(dep_word_head_pos(2)); this.agrConjExtractors.add(dep_pos_head_word(2)); this.agrConjExtractors.add(dep_pos_head_pos(2)); //Agreement: Of-complement feature extractors this.agrOfComplementExtractors.add(dep_word_head_word(3)); this.agrOfComplementExtractors.add(dep_word_head_pos(3)); this.agrOfComplementExtractors.add(dep_pos_head_word(3)); this.agrOfComplementExtractors.add(dep_pos_head_pos(3)); //WH-pronoun: Simple relative pronoun feature extractors this.whExtractors.add(dep_word_head_stem(4)); this.whExtractors.add(dep_word_head_pos(4)); this.whExtractors.add(dep_word_head_class(4)); //WH-pronoun: Conjunct/Disjunct subj feature extractors this.whConjExtractors.add(dep_word_head_stem(5)); this.whConjExtractors.add(dep_word_head_pos(5)); this.whConjExtractors.add(dep_word_head_class(5)); //Unbalanced punctuation this.punctExtractor.add(unbal_punct()); } /** Returns the features for the given sign and completeness flag. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { addFeatures(sign, complete); return getFeatureMap(sign); } /** Recursively adds features to the feature map for the given sign, if not already present. 
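* Agreement features are extracted from each newly filled dependency (subject-verb, disjunct-subject, of-complement and relative-pronoun cases), and a punctuation feature fires when unbalanced sentence-medial punctuation is followed by a non-punctuation word.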
*/ //TODO: Lazier feature extraction involving conditional feature extractors protected void addFeatures(Sign sign, boolean complete) { // check for existing map, otherwise make one if (getFeatureMap(sign) != null) return; // lex case if (sign.isLexical()) { currentMap = new FeatureMap(0); } // non-terminal else { Sign[] inputs = sign.getDerivationHistory().getInputs(); // first recurse for (Sign child : inputs) addFeatures(child, false); // use input maps in making current map if (inputs.length == 1) { currentMap = new FeatureMap(getFeatureMap(inputs[0])); } else if (inputs.length == 2) { currentMap = new FeatureMap(getFeatureMap(inputs[0]), getFeatureMap(inputs[1])); } String subjArg=null; //do each newly filled dep for (LexDependency dep : sign.getFilledDeps()) { this.headSign=dep.lexHead; this.depSign=dep.lexDep; //System.out.println("DEP: "+dep); //Find value of the subject feature if(subjArg==null){ subjArg=getSubjectFeature(dep.lexHead.getCategory()); //Back-off to Arg0 heuristic if subject feature not available for this verb if(subjArg==null)subjArg="Arg0"; } //Subject-verb agr features if(subjArg.equals(dep.rel) && (dep.lexHead.getOrthography().equals("was") || dep.lexHead.getOrthography().equals("were") || dep.lexHead.getPOS().equals("VBZ") || dep.lexHead.getPOS().equals("VBP"))){ //Simple subj-verb feats //Increment instance number if in error analysis mode if(sentId!=null)INSTANCENUM++; inc(agrExtractors); //Disjunct features if(dep.lexDep.getOrthography().equals("or")){ ArrayListrels=new ArrayList(2); rels.add("First");rels.add("Next"); Hashtablecdeps=this.getLowerSiblingDeps(inputs,dep.lexDep,rels,null); if(cdeps!=null){ for(Enumeratione=cdeps.keys();e.hasMoreElements();){ LexDependency cdep=e.nextElement(); this.depSign=cdep.lexDep; inc(agrConjExtractors); } } } //Of-complement subjects (for non-numeral, non-%-sign subjs) String subjClass=dep.lexDep.getWords().get(0).getSemClass(); String subjPOS=dep.lexDep.getPOS(); if(subjClass==null)subjClass="NULL"; if(!subjClass.equals("PERCENT") && !subjPOS.startsWith("CD")){ ArrayListrels=new ArrayList(1); rels.add("Mod"); ArrayListdepPreds=new ArrayList(1); depPreds.add("of"); HashtableofComplDeps=this.getLowerSiblingDeps(inputs,dep.lexDep,rels,depPreds); if(ofComplDeps!=null){ for(Enumeratione1=ofComplDeps.keys();e1.hasMoreElements();){ LexDependency ofComplDep=e1.nextElement(); Sign[] ofComplSigns=ofComplDeps.get(ofComplDep).getDerivationHistory().getInputs(); rels=new ArrayList(1); rels.add("Arg1"); HashtableofDeps=this.getLowerSiblingDeps(ofComplSigns,ofComplDep.lexDep,rels,null); if(ofDeps!=null){ for(Enumeratione2=ofDeps.keys();e2.hasMoreElements();){ LexDependency ofDep=e2.nextElement(); this.depSign=ofDep.lexDep; inc(agrOfComplementExtractors); } } } } } } //Relative clause features String whPrn=dep.lexDep.getOrthography(); if((dep.rel.equals("GenRel")||dep.rel.equals("whApposRel")) && (whPrn.equals("that")||whPrn.equals("who")||whPrn.equals("which")||whPrn.equals("whose"))){ //Make sure relative clause is linked to head of the quoted NP //(and not the quotation mark itself) Sign sib=this.getSibling(sign.getSiblingFilledDeps(),"Arg"); if(sib!=null){ this.headSign=sib; } //Simple WH-pronoun features //Increment instance number if in error analysis mode if(sentId!=null)INSTANCENUM++; inc(whExtractors); //Proximal conjunct features if(dep.lexDep.getPOS().equals("CC") || dep.lexDep.getOrthography().equals(",") || dep.lexDep.getOrthography().equals(";")|| dep.lexDep.getOrthography().equals("or")|| 
dep.lexDep.getOrthography().equals("and")){ ArrayListrels=new ArrayList(1); rels.add("Next"); Hashtablecdeps=this.getLowerSiblingDeps(inputs,dep.lexDep,rels,null); if(cdeps!=null){ for(Enumeratione=cdeps.keys();e.hasMoreElements();){ LexDependency cdep=e.nextElement(); this.depSign=cdep.lexDep; inc(whConjExtractors); } } } } } //Punctuation feature extraction: Unbalanced sentence medial appositions are flagged if (sign!=null && inputs!=null) { //Pass up unbalanced punctuation indicator //Result cat of current has unbal feature Category target = sign.getCategory().getTarget(); FeatureStructure fs = target.getFeatureStructure(); String punctFeatVal=null; if ( (fs != null && fs.hasAttribute("unbal"))) { Object val = fs.getValue("unbal"); punctFeatVal = (val instanceof SimpleType) ? ((SimpleType)val).getName() : null; } //Right child (binary case) or only child (unary case) has unbalanced punct feature SignProps childProps=(SignProps)inputs[inputs.length-1].getData(SignProps.class); if(childProps!=null)punctFeatVal=childProps.getUnbalancedPunct(); if(punctFeatVal!=null){ SignProps currProps=new SignProps(punctFeatVal); sign.addData(currProps); } //Extract unbalanced punctuation feature for unbalanced sentence medial punctuation if (inputs.length == 2) { //Left child has unbalanced punct feature SignProps lchildProps=(SignProps)inputs[0].getData(SignProps.class); if(lchildProps!=null && lchildProps.getUnbalancedPunct()!=null){ Word nextWord = inputs[1].getWords().get(0); //Check whether right child begins with a punctuation mark; else fire feature if (!isPunct(nextWord)){ inc(punctExtractor); } } } } } // store it storeFeatureMap(sign); } public Sign getOfComplSign(){ Sign retval=null; return retval; } /** Stores the current feature map as a data object in the given sign. */ protected void storeFeatureMap(Sign sign) { sign.addData(new FeatureMapWrapper(currentMap)); } /** Returns the feature map for this extractor from the given sign (null if none). */ public FeatureMap getFeatureMap(Sign sign) { FeatureMapWrapper fmw = (FeatureMapWrapper)sign.getData(FeatureMapWrapper.class); return (fmw != null) ? fmw.featureMap : null; } /** * Increments the count of the given features, if relevant. 
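* A lazily extracted feature contributes a count to the current feature map only if the alphabet indexes it (indexLazy returns non-null).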
*/ protected void inc(List>> extractors) { for (List> lazyExtractor : extractors) { Alphabet.Feature f = alphabet.indexLazy(lazyExtractor); if (f != null)currentMap.inc(f); } } //------------------------------------ // utility functions //Get value of subject feature from verb's result cat public String getSubjectFeature(Category cat){ String retval=null; if (cat instanceof ComplexCat) { Category resCat = ((ComplexCat)cat).getResult(); retval=this.getSubjectFeature(resCat); } else if (cat instanceof AtomCat) { AtomCat ac = (AtomCat) cat; FeatureStructure fs = ac.getFeatureStructure(); for(String attr: fs.getAttributes()){ if(attr.equals("sbj")){ retval=fs.getValue(attr).toString(); break; } } } return retval; } //checks for punct private boolean isPunct(Word w) { String pos = w.getPOS(); boolean retval = pos.startsWith("PUNCT"); retval = retval || pos.equals(".") || pos.equals(",") || pos.equals(";") || pos.equals(":") || pos.equals("LRB") || pos.equals("RRB"); //if (retval) { //System.out.println("isPunct: " + w.getForm() + " pos: " + pos); //} return retval; } // Get siblings of a given head 1-step down the derivation, given the head-sibling relations and lexical preds of deps .*/ public Hashtable getLowerSiblingDeps(Sign[] inputs,Sign headSign,ArrayListrels,ArrayListdepPreds){ Hashtable retval=new Hashtable(); for(Sign sign: inputs){ if(retval.size()==rels.size())break; Listsdeps=sign.getSiblingFilledDeps(); sdeps.addAll(sign.getFilledDeps()); for(LexDependency sdep: sdeps){ if(sdep.lexHead==headSign && rels.contains(sdep.rel) && !retval.containsKey(sdep)){ if(depPreds==null || depPreds.contains(sdep.lexDep.getOrthography())){ retval.put(sdep,sign); } } } } if(retval.size()==0)retval=null; return retval; } //returns sibling sign of a given head given a relation label private Sign getSibling(List sdeps,String rel){ Sign retval=null; if(sdeps!=null){ for(LexDependency dep: sdeps){ if(dep.rel.equals(rel)){ retval=dep.lexDep; break; } } } return retval; } // returns acceptable paraphrases for words private String adjustWord(String word) { String retval=word; //Account for acceptable paraphrases if (word.equals("'ve")) retval="have"; else if (word.equals("'s")) retval="is"; else if (word.equals("'re")) retval="are"; return retval; } //adjusts POS tags private String adjustPOS(String word,String pos,String semClass) { String retval=pos; if(word.equals("has")) retval="VBZ"; else if (word.equals("have")) retval="VBP"; else if(word.equals("one") || word.equals("1")) pos="CD-1"; else if(semClass!=null && semClass.equals("PERCENT")) retval=semClass; else if(word.equals(",") || word.equals(";")) retval="CC"; return retval; } //adjust sem class info private String adjustSemClass(String semClass) { String retval="UNK"; if(semClass!=null){ String[]temp=semClass.split("\\|"); retval=temp[0].split(":")[0]; } return retval; } //main prefixes (AGR=Agr; CONJ=Conjn/Disjn; WH=wh-pronoun; OF=Of-complement) private void add_prefix_main1(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "AGR"; }}); } private void add_prefix_main2(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "AGRCONJ"; }}); } private void add_prefix_main3(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "AGROF"; }}); } private void add_prefix_main4(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "AGRWH"; }}); } private void add_prefix_main5(List> retval) { retval.add(new TrieMap.KeyExtractor(){public 
String getKey(){ return "AGRWHCONJ"; }}); } //instance # in error analysis mode private void add_instance_num(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return Integer.toString(INSTANCENUM); }}); } //sub-prefixes (W=Word; P=POS tag; S=Stem; C=SemClass) private void add_prefix_sub1(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "WW"; }}); } private void add_prefix_sub2(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "WP"; }}); } private void add_prefix_sub3(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "PW"; }}); } private void add_prefix_sub4(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "PP"; }}); } private void add_prefix_sub5(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "WS"; }}); } private void add_prefix_sub6(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "WC"; }}); } //select required feature prefix private void add_prefix(int prefix,List> retval) { switch (prefix) { case 1:add_prefix_main1(retval);break; case 2:add_prefix_main2(retval);break; case 3:add_prefix_main3(retval);break; case 4:add_prefix_main4(retval);break; case 5:add_prefix_main5(retval);break; } } // ------------------------------- // feature extractors // dep-word-head-word private List> dep_word_head_word(int prefix) { List> retval = new ArrayList>(3); if(this.sentId!=null)add_instance_num(retval); add_prefix(prefix,retval); add_prefix_sub1(retval); add_dep_word(retval); add_head_word(retval); return retval; } //dep-word head-pos private List> dep_word_head_pos(int prefix) { List> retval = new ArrayList>(3); if(this.sentId!=null)add_instance_num(retval); add_prefix(prefix,retval); add_prefix_sub2(retval); add_dep_word(retval); add_head_pos(retval); return retval; } //dep-pos head-word private List> dep_pos_head_word(int prefix) { List> retval = new ArrayList>(3); if(this.sentId!=null)add_instance_num(retval); add_prefix(prefix,retval); add_prefix_sub3(retval); add_dep_pos(retval); add_head_word(retval); return retval; } //dep-pos head-pos private List> dep_pos_head_pos(int prefix) { List> retval = new ArrayList>(3); if(this.sentId!=null)add_instance_num(retval); add_prefix(prefix,retval); add_prefix_sub4(retval); add_dep_pos(retval); add_head_pos(retval); return retval; } //dep-word head-stem private List> dep_word_head_stem(int prefix) { List> retval = new ArrayList>(3); if(this.sentId!=null)add_instance_num(retval); add_prefix(prefix,retval); add_prefix_sub5(retval); add_dep_word(retval); add_head_stem(retval); return retval; } //dep-word head-class private List> dep_word_head_class(int prefix) { List> retval = new ArrayList>(3); if(this.sentId!=null)add_instance_num(retval); add_prefix(prefix,retval); add_prefix_sub6(retval); add_dep_word(retval); add_head_class(retval); return retval; } //unbalanced punctuation private List> unbal_punct() { List> retval = new ArrayList>(1); retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return "$punct"; }}); return retval; } //head word private void add_head_word(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ String word=adjustWord(headSign.getWordForm());return word; }}); } //head stem private void add_head_stem(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return headSign.getWords().get(0).getStem();}}); } //head class private void 
add_head_class(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ String semClass=adjustSemClass(headSign.getWords().get(0).getSemClass());return semClass;}}); } // head pos private void add_head_pos(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ String pos=adjustPOS(headSign.getOrthography(),headSign.getPOS(),headSign.getWords().get(0).getSemClass());return pos; }}); } // dep word private void add_dep_word(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ String word=adjustWord(depSign.getWordForm());return word; }}); } // dep pos private void add_dep_pos(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ String pos=adjustPOS(depSign.getOrthography(),depSign.getPOS(),depSign.getWords().get(0).getSemClass());return pos; }}); } } ================================================ FILE: src/opennlp/ccg/hylo/Flattener.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-6 Michael White (University of Edinburgh, The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.util.*; import java.util.*; import gnu.trove.*; /** * A class for performing flattening operations on LFs. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2011/06/06 18:52:30 $ **/ public class Flattener { // the resulting preds private List preds = new ArrayList(); // counter for alts private int altCount = 0; // counter for opts private int optCount = 0; // root preds private List roots = new ArrayList(); // map from preds to children from original expression (identity keys); // includes dummy parents to preserve structure private ListMap childMap = new ListMap(true); // map from nominals to highest pred for that nominal from original expression private Map nomMap = new HashMap(); // map from pred to depth in original expression private Map depthMap = new HashMap(); // map from nominal to highest parent nominal in original expression, or null if a root private Map parentMap = new HashMap(); // null nominal for use in dummy parents during flattening private static Nominal nullNom = new NominalAtom("null"); // null prop for use in dummy parents during flattening private static Proposition nullProp = new Proposition("null"); /** * Returns a map from a nominal to its highest parent nominal in the original expression, * after flattening, or null if none. */ public Map getHighestParentMap() { return parentMap; } /** * Recursively flattens the given LF and returns a list of elementary preds. 
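* For example, @x1(want ^ <TENSE>pres ^ <Arg1>x2) is flattened into the elementary preds @x1(want), @x1(<TENSE>pres) and @x1(<Arg1>x2).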
* LF chunks are preserved on satops, as are alts (exclusive disjunctions) * and opts (optional parts). * Chunks, alts and opts are propagated through shared nominals. * A runtime exception is thrown if the LF cannot be flattened. */ public List flatten(LF lf) { flatten(lf, null, null, 0, new Stack(), new TIntArrayList()); if (altCount > 0 || optCount > 0) propAltsOptsChunks(); return preds; } // recursive flattening, with conversion of alts and opts private void flatten( LF lf, Nominal currentNominal, SatOp parent, int depth, Stack alts, TIntArrayList opts ) { if (lf instanceof SatOp) { // flatten arg with new current nominal SatOp satOp = (SatOp) lf; currentNominal = satOp.getNominal(); SatOp dummyParent = makeDummySatOp(currentNominal); addSatOp(dummyParent, parent, depth, alts, opts, lf); flatten(satOp.getArg(), currentNominal, dummyParent, depth, alts, opts); } else if (lf instanceof Op) { Op op = (Op) lf; SatOp dummyParent = makeDummySatOp(currentNominal); addSatOp(dummyParent, parent, depth, alts, opts, lf); if (op._name.equals(Op.XOR)) { // introduce new alt set; add alt for each item int altSet = altCount++; for (int i = 0; i < op._args.size(); i++) { alts.push(new Alt(altSet, i)); LF arg = op._args.get(i); flatten(arg, currentNominal, dummyParent, depth+1, alts, opts); alts.pop(); } } else if (op._name.equals(Op.OPT)) { // introduce new opt index for arg opts.add(optCount++); LF arg = op._args.get(0); flatten(arg, currentNominal, dummyParent, depth+1, alts, opts); opts.remove(opts.size()-1); } else { // otherwise just flatten each item for (Iterator it = op.getArguments().iterator(); it.hasNext(); ) { flatten(it.next(), currentNominal, dummyParent, depth+1, alts, opts); } } } else if (lf instanceof Proposition) { // add SatOp for lf if (currentNominal == null) { throw new RuntimeException("No current nominal in trying to flatten " + lf); } SatOp satOp = new SatOp(currentNominal, lf); addSatOp(satOp, parent, depth, alts, opts, lf); } else if (lf instanceof HyloVar) { // just skip for now } else if (lf instanceof Diamond) { Diamond diamond = (Diamond) lf; LF arg = diamond.getArg(); if (arg instanceof Proposition || arg instanceof Nominal || arg instanceof HyloVar) { // add SatOp for diamond SatOp satOp = new SatOp(currentNominal, lf); addSatOp(satOp, parent, depth, alts, opts, lf); } else if (arg instanceof Op && ((Op)arg)._name.equals(Op.CONJ)) { // add SatOp for diamond with first nominal arg, // and flatten the rest of the args with the first nominal arg as the // new current nominal Op argOp = (Op) arg; Iterator args = argOp._args.iterator(); LF firstArg = args.next(); if (!(firstArg instanceof Nominal)) { throw new RuntimeException("First arg of diamond is not a nominal: " + firstArg); } Nominal firstNominalArg = (Nominal) firstArg; // add SatOp for diamond SatOp satOp = new SatOp(currentNominal, new Diamond(diamond.getMode(), firstNominalArg)); addSatOp(satOp, parent, depth, alts, opts, lf); // flatten rest of list for (; args.hasNext(); ) { flatten(args.next(), firstNominalArg, satOp, depth+1, alts, opts); } } else if (arg instanceof Op && ((Op)arg)._name.equals(Op.XOR)) { Op argOp = (Op) arg; SatOp dummyParent = makeDummySatOp(currentNominal); addSatOp(dummyParent, parent, depth, alts, opts, lf); // as before, process xor by introducing new alt set and adding alt for each disjunct; // this time, also assume each disjunct is a conj op or nominal, and add a diamond satop // to the disjunct nominal int altSet = altCount++; for (int i = 0; i < argOp._args.size(); i++) { 
alts.push(new Alt(altSet, i)); LF disjunct = argOp._args.get(i); if (!(disjunct instanceof Op && ((Op)disjunct)._name.equals(Op.CONJ)) && !(disjunct instanceof Nominal)) { throw new RuntimeException("Disjunct of diamond is not a conj op or nominal: " + disjunct); } // conj op case if (disjunct instanceof Op) { Op disjunctOp = (Op) disjunct; Iterator args = disjunctOp._args.iterator(); LF firstArg = args.next(); if (!(firstArg instanceof Nominal)) { throw new RuntimeException("First arg of conj op under xor op is not a nominal: " + firstArg); } // add SatOp for diamond Nominal disjunctNominal = (Nominal) firstArg; SatOp satOp = new SatOp(currentNominal, new Diamond(diamond.getMode(), disjunctNominal)); addSatOp(satOp, dummyParent, depth+1, alts, opts, lf); // flatten rest of list for (; args.hasNext(); ) { flatten(args.next(), disjunctNominal, satOp, depth+2, alts, opts); } } // nominal case else { // just add SatOp for diamond Nominal disjunctNominal = (Nominal) disjunct; SatOp satOp = new SatOp(currentNominal, new Diamond(diamond.getMode(), disjunctNominal)); addSatOp(satOp, dummyParent, depth+1, alts, opts, lf); } alts.pop(); } } else { throw new RuntimeException("Arg of diamond is not a proposition, nominal or list: " + arg); } } else throw new RuntimeException("Unable to flatten " + lf); } // makes a dummy satop for the given nominal, if any; otherwise uses nullNom private static SatOp makeDummySatOp(Nominal nom) { return new SatOp((nom != null) ? nom : nullNom, nullProp); } // handles new preds private void addSatOp(SatOp satOp, SatOp parent, int depth, Stack alts, TIntArrayList opts, LF lf) { // add non-dummy satops to result if (satOp._arg != nullProp) preds.add(satOp); // update roots, maps if (parent == null) roots.add(satOp); else childMap.put(parent, satOp); Nominal nom = satOp._nominal; if (!nom.isShared()) { if (!nomMap.containsKey(nom) || depth < depthMap.get(nom)) { nomMap.put(nom, satOp); depthMap.put(nom, depth); parentMap.put(nom, (parent != null && parent._nominal != nullNom) ? parent._nominal : null); } } // set alts, opts, chunks if (!alts.empty()) satOp.alts = new ArrayList(alts); if (opts.size() > 0) satOp.opts = new TIntArrayList(opts.toNativeArray()); satOp.setChunks(lf.getChunks()); } // propagates alts, opts and chunks down from roots private void propAltsOptsChunks() { // propagate for each root nom List alts = Collections.emptyList(); TIntArrayList opts = new TIntArrayList(0); TIntArrayList chunks = new TIntArrayList(0); for (SatOp root : roots) { propAltsOptsChunks(root, alts, opts, chunks); } } // prop alts, opts & chunks, recursing through preds in child map and shared nom refs in nomMap private void propAltsOptsChunks(SatOp satOp, List alts, TIntArrayList opts, TIntArrayList chunks) { // prop alts and opts if (!alts.isEmpty()) { if (satOp.alts == null) satOp.alts = new ArrayList(3); for (Alt alt : alts) { if (!satOp.alts.contains(alt)) satOp.alts.add(alt); } Collections.sort(satOp.alts); } if (!opts.isEmpty()) { if (satOp.opts == null) satOp.opts = new TIntArrayList(3); for (int i=0; i < opts.size(); i++) { int opt = opts.get(i); if (!satOp.opts.contains(opt)) satOp.opts.add(opt); } satOp.opts.sort(); } if (!chunks.isEmpty()) { if (satOp.chunks == null) satOp.chunks = new TIntArrayList(3); for (int i=0; i < chunks.size(); i++) { int chunk = chunks.get(i); if (!satOp.chunks.contains(chunk)) satOp.chunks.add(chunk); } satOp.chunks.sort(); } // gather alts, opts & chunks for recursion List alts2 = (satOp.alts != null) ? 
satOp.alts : alts; TIntArrayList opts2 = (satOp.opts != null) ? satOp.opts : opts; TIntArrayList chunks2 = (satOp.chunks != null) ? satOp.chunks : chunks; // recurse through children, if any List children = childMap.get(satOp); if (children != null) { for (SatOp child : children) propAltsOptsChunks(child, alts2, opts2, chunks2); } // recurse through shared nominals, if apropos Nominal nom = satOp._nominal; if (nom.isShared()) { SatOp nomPred = nomMap.get(nom); if (nomPred != null) propAltsOptsChunks(nomPred, alts2, opts2, chunks2); } Nominal nom2 = HyloHelper.getSecondaryNominal(satOp); if (nom2 != null && nom2.isShared()) { SatOp nom2Pred = nomMap.get(nom2); if (nom2Pred != null) propAltsOptsChunks(nom2Pred, alts2, opts2, chunks2); } } } ================================================ FILE: src/opennlp/ccg/hylo/HyloAtom.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.unify.*; import gnu.trove.*; /** * A logical atomic formula. * The type is optional, so by default, it is not considered in determining equality. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.9 $, $Date: 2009/12/21 03:27:19 $ **/ public abstract class HyloAtom extends HyloFormula { private static final long serialVersionUID = 1L; protected String _name; protected SimpleType type; protected HyloAtom(String name) { this(name, null); } protected HyloAtom(String name, SimpleType st) { _name = name; type = st; } public void setAtomName(String name) { _name = name; } public String getName() { return _name; } public SimpleType getType() { return type; } public boolean occurs(Variable var) { return false; } public String toString() { return _name; } /** * Returns a pretty-printed string of this LF, with the given indent. */ public String prettyPrint(String indent) { return toString(); } public int compareTo(HyloAtom ha) { return _name.compareTo(ha._name); } /** Returns a hash code based on the atom name. */ public int hashCode() { return _name.hashCode(); } /** * Returns whether this atom equals the given object * based on the atom name. */ public boolean equals(Object obj) { if (this == obj) return true; if (obj == null || obj.getClass() != this.getClass()) { return false; } HyloAtom ha = (HyloAtom) obj; return _name.equals(ha._name); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { return hashCode(); } /** * Returns whether this atom equals the given object * up to variable names, using the given maps from vars to ints. 
*/ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { return equals(obj); } } ================================================ FILE: src/opennlp/ccg/hylo/HyloFormula.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-3 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import java.io.Serializable; import org.jdom.*; import gnu.trove.*; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; /** * A parent class to implement reasonable default behavior for classes * representing data structures for hybrid logic. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.13 $, $Date: 2009/12/21 03:27:19 $ **/ public abstract class HyloFormula implements LF, Serializable { private static final long serialVersionUID = 1L; /** * The LF chunks to which this LF belongs. */ protected TIntArrayList chunks = null; /** * Sets the LF chunks to which this LF belongs. * LF chunks are used during realization to ensure * that certain edges are semantically complete * before combination is attempted with edges * with semantics outside the chunk. * The chunks are numbered starting with 0, * and null represents no chunks. */ public void setChunks(TIntArrayList chunks) { this.chunks = chunks; } /** * Gets the LF chunks to which this LF belongs. */ public TIntArrayList getChunks() { return chunks; } /** Returns null as the default type. */ public SimpleType getType() { return null; } /** * Returns a copy of this LF. * (LF chunks are not copied.) */ public abstract LF copy(); /** * Applies a ModFcn to this LF and then applies it to all fields * which are themselves Mutables. * * @param mf a function to be applied */ public void deepMap(ModFcn mf) { mf.modify(this); } /** * Unify this Unfiable with another Object. * This default implementation will reverse the direction of unification * for a variable, otherwise it fails. * NB: The implementation of unification in the hylo package is not * complete; a particular limitation is that no attempt is made to unify lists of terms * connected by an Op instance. * * @param o object to unify with * @param s Substitution containing the variable resolutions * @exception UnifyFailure if this Unifiable cannot be unified with * the Object * @return an object which represents the unification of * this Unifiable with the Object */ public Object unify(Object u, Substitution s) throws UnifyFailure { if (u instanceof Variable) return ((Unifiable)u).unify(this, s); else throw new UnifyFailure(this.toString(), u.toString()); } /** * Check if this Unifiable can unify with another Object. 
This * should be implemented as a quick check to allow users of the * Unifiable to scan a group of Unifications to rapidly see if the * entire group is at least possible before descending into each * one with a full unification procedure. Thus, if a call to this * method does not result in a UnifyFailure exception being * thrown, it doesn't mean that the Object can definitely be * unified with this Unifiable -- what is important is that when a * call to this method throws a UnifyFailure exception, it permits * one to avoid calling the unify() method on other Unifiables in * a group because the quick check failed on this one. * * @param o object to check for unifiability * @exception UnifyFailure if this Unifiable cannot be unified with * the Object * @return the Object o, unmodified **/ public void unifyCheck(Object u) throws UnifyFailure {} /** * Replaces any variables in this Unifiable with the values found * for them in the Substitution argument. * * @param s Substitution containing the variable resolutions * @return a copy of this Unifiable with all variables from the * Substitution replaced by their values. */ public Object fill(Substitution s) throws UnifyFailure { return this; } /** * Returns a hash code using the given map from vars to ints. */ public abstract int hashCode(TObjectIntHashMap varMap); /** * Returns whether this LF equals the given object * up to variable names, using the given maps from vars to ints. */ public abstract boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2); /** * Returns an XML representation of this LF. */ public abstract Element toXml(); /** * Returns a pretty-printed string of this LF, with the given indent. */ public abstract String prettyPrint(String indent); } ================================================ FILE: src/opennlp/ccg/hylo/HyloHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import opennlp.ccg.grammar.*; import opennlp.ccg.lexicon.Lexicon; import org.jdom.*; import java.util.*; import gnu.trove.*; /** * A utility class to help with certain global operations over hybrid logic * terms. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.43 $, $Date: 2011/08/26 05:18:39 $ **/ public class HyloHelper { //----------------------------------------------------------------- // XML functions /** * Builds a Hylo term from the given element. * An "lf" element may be used to wrap one or more (implicitly conj-ed) terms. 
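 *
 * A small illustration (the element is built here via JDOM for brevity; in practice it
 * typically comes from a grammar or testbed file):
 * <pre>{@code
 * Element propElt = new Element("prop");
 * propElt.setAttribute("name", "dog");
 * LF prop = HyloHelper.getLF(propElt);  // a Proposition named "dog"
 * }</pre>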
*/ public static LF getLF(Element e) { LF retval = null; String type = e.getName(); if (type.equals("op")) { retval = new Op(e); } else if (type.equals("var")) { String name = getName(e); retval = new HyloVar(prefix(name), type(name)); } else if (type.equals("nomvar")) { String name = getName(e); boolean shared = "true".equals(e.getAttributeValue("shared")); retval = new NominalVar(prefix(name), type(name), shared); } else if (type.equals("nom")) { String name = getName(e); boolean shared = "true".equals(e.getAttributeValue("shared")); retval = new NominalAtom(prefix(name), type(name), shared); } else if (type.equals("prop")) { String name = getName(e); retval = new Proposition(name, existingType(name)); } else if (type.equals("satop")) { retval = new SatOp(e); } else if (type.equals("box") || type.equals("b")) { retval = new Box(e); } else if (type.equals("diamond") || type.equals("d")) { retval = new Diamond(e); } else if (type.equals("mode")) { String name = getName(e); retval = new ModeLabel(name); } else if (type.equals("modevar")) { String name = getName(e); retval = new ModeVar(name); } else if (type.equals("lf")) { retval = getLF_FromChildren(e); } else { System.out.println("Invalid hybrid logic LF type: " + type); } // assign chunks if (retval != null) { String chunks = e.getAttributeValue("chunks"); if (chunks != null) { retval.setChunks(convertChunks(chunks)); } } // done return retval; } // returns the value of the attribute 'name' or 'n' private static String getName(Element e) { String name = e.getAttributeValue("name"); if (name == null) name = e.getAttributeValue("n"); return name; } // returns the simple type with the given name, if it exists, or null if not private static SimpleType existingType(String name) { Types types = Grammar.theGrammar.types; if (types.containsSimpleType(name)) return types.getSimpleType(name); else return null; } /** Returns the prefix of the name, up to an optional colon. */ protected static String prefix(String name) { int index = name.indexOf(":"); if (index >= 0) return name.substring(0, index); else return name; } /** Returns the simple type given by the suffix of the name after the colon, or null if none. */ protected static SimpleType type(String name) { int index = name.indexOf(":"); String suffix = (index >=0 && index+1 < name.length()) ? name.substring(index+1) : null; if (suffix != null) return Grammar.theGrammar.types.getSimpleType(suffix); else return null; } /** * Returns a Hylo term from the children of the given element, * adding an implicit CONJ op if necessary. */ @SuppressWarnings("unchecked") public static LF getLF_FromChildren(Element e) { List children = e.getChildren(); if (children.size() > 1) { List preds = new ArrayList(children.size()); for (int i=0; i < children.size(); i++) { preds.add(getLF(children.get(i))); } Op conj = new Op(Op.CONJ, preds); return conj; } else return getLF(children.get(0)); } /** * Returns an XML representation of the given LF, * wrapped with an 'lf' element, * removing CONJ ops that may be left implicit. */ public static Element toXml(LF lf) { Element retval = new Element("lf"); retval.addContent(lf.toXml()); removeConjOps(retval); return retval; } //----------------------------------------------------------------- // process chunks /** * Processes and removes any chunk elements. * Each chunk element is numbered, and all contained elements are marked * as being contained by this chunk, via a "chunks" attribute. 
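 *
 * A sketch of one way to use this together with getLF_FromChildren (reading lfElt from XML
 * is assumed):
 * <pre>{@code
 * HyloHelper.processChunks(lfElt);               // number chunk wrappers, mark contents, drop wrappers
 * LF lf = HyloHelper.getLF_FromChildren(lfElt);  // contained terms now carry their "chunks" attributes
 * }</pre>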
*/ public static void processChunks(Element e) { processChunks(e, null, 0); removeChunkElts(e); } // recursively processes chunks, threading count through calls @SuppressWarnings("unchecked") private static int processChunks(Element e, String chunks, int count) { // check for chunk if (e.getName().equals("chunk")) { // update chunks string and counter if (chunks == null) { chunks = "" + count; } else { chunks += " " + count; } count++; } // otherwise add chunks attr, if val non-null else if (chunks != null) { e.setAttribute("chunks", chunks); } // do children List children = e.getChildren(); for (int i=0; i < children.size(); i++) { count = processChunks(children.get(i), chunks, count); } // return current count return count; } // converts chunk strings private static TIntArrayList convertChunks(String chunks) { String[] tokens = chunks.split("\\s+"); TIntArrayList retval = new TIntArrayList(tokens.length); for (int i = 0; i < tokens.length; i++) { retval.add(Integer.parseInt(tokens[i])); } return retval; } //----------------------------------------------------------------- // recursively remove certain elements private static abstract class ElementTest { abstract boolean test(Element elt); } // recursively removes elements meeting given test @SuppressWarnings("unchecked") private static void removeElts(Element elt, ElementTest eltTest) { // nb: need to dump children into a new list, in order to get a list iterator // that will allow multiple adds List children = elt.getChildren(); List newChildren = new ArrayList(children.size()); newChildren.addAll(children); for (ListIterator li = newChildren.listIterator(); li.hasNext(); ) { Element nextElt = li.next(); removeElts(nextElt, eltTest); if (eltTest.test(nextElt)) { li.remove(); for (Iterator it = nextElt.getChildren().iterator(); it.hasNext(); ) { Element childElt = it.next(); it.remove(); // removes childElt from nextElt's children, so it can become a child of elt li.add(childElt); } } } elt.removeContent(); elt.setContent(newChildren); } // recursively removes conj ops private static void removeConjOps(Element lfElt) { removeElts( lfElt, new ElementTest() { boolean test(Element elt) { return elt.getName().equals("op") && elt.getAttributeValue("name").equals(Op.CONJ); } } ); } // recursively removes chunk elements private static void removeChunkElts(Element lfElt) { removeElts( lfElt, new ElementTest() { boolean test(Element elt) { return elt.getName().equals("chunk"); } } ); } //----------------------------------------------------------------- // functions for elementary predications /** * Returns whether the given LF is an elementary predication, * ie a lexical predication, relation predication or attribute-value predication. */ public static boolean isElementaryPredication(LF lf) { return isLexPred(lf) || isRelPred(lf) || isAttrPred(lf); } /** * Returns whether the given elementary predication is a lexical predication, * ie one of the form @x(prop). */ public static boolean isLexPred(LF pred) { if (!(pred instanceof SatOp)) return false; SatOp satOp = (SatOp) pred; LF arg = satOp.getArg(); return (arg instanceof Proposition); } /** * Returns whether the given elementary predication is a relation predication, * ie one of the form @x(<Rel>y). 
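 * For instance (with illustrative names), @x(<Det>d1) is a relation predication, since the
 * argument of the diamond is the nominal d1, whereas @x(<det>nil) is an attribute-value
 * predication (see isAttrPred), since its argument is a proposition.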
*/ public static boolean isRelPred(LF pred) { if (!(pred instanceof SatOp)) return false; SatOp satOp = (SatOp) pred; LF arg = satOp.getArg(); if (!(arg instanceof Diamond)) return false; Diamond d = (Diamond) arg; return (d.getArg() instanceof Nominal); } /** * Returns whether the given elementary predication is an attribute-value predication, * ie one of the form @x(<Rel>prop). Note that the prop is also allowed to be * a HyloVar. */ public static boolean isAttrPred(LF pred) { if (!(pred instanceof SatOp)) return false; SatOp satOp = (SatOp) pred; LF arg = satOp.getArg(); return isAttr(arg); } /** * Returns whether the given arg is an attribute-value pair, * ie one of the form <Rel>prop. Note that the prop is also allowed to be * a HyloVar. */ public static boolean isAttr(LF arg) { if (!(arg instanceof Diamond)) return false; Diamond d = (Diamond) arg; LF dArg = d.getArg(); return ( dArg instanceof Proposition || (dArg instanceof HyloVar && !(dArg instanceof NominalVar)) ); } /** * Returns the name of the lexical predicate of the given elementary predication, * or null, if the given LF is not a lexical predicate. */ public static String getLexPred(LF lf) { if (!isLexPred(lf)) return null; LF arg = ((SatOp)lf).getArg(); return ((Proposition)arg).toString(); } /** * Returns the name of the relation of the given elementary predication, * or null, if the given LF is not a relation or attribute-value predicate. */ public static String getRel(LF lf) { if (!isRelPred(lf) && !isAttrPred(lf)) return null; LF arg = ((SatOp)lf).getArg(); return ((Diamond)arg).getMode().toString(); } /** * Returns the string value of the attribute-value predicate, or * null if the given LF is not an attribute-value predicate or has no value. */ public static String getVal(LF lf) { if (!isAttrPred(lf)) return null; LF arg = ((SatOp)lf).getArg(); LF dArg = ((Diamond)arg).getArg(); if (dArg instanceof Proposition) return ((Proposition)dArg).getName(); return null; } /** * Returns the principal nominal the given elementary predication, * or null, if the given LF is not an elementary predication. */ public static Nominal getPrincipalNominal(LF lf) { if (!isElementaryPredication(lf)) return null; return ((SatOp)lf).getNominal(); } /** * Returns the secondary nominal of the given elementary predication, * or null, if the given LF is not a relation predication. */ public static Nominal getSecondaryNominal(LF lf) { if (!isRelPred(lf)) return null; LF arg = ((SatOp)lf).getArg(); return (Nominal) ((Diamond)arg).getArg(); } //----------------------------------------------------------------- // flattening /** * Returns a flattened, sorted list of elementary preds from the given LF * as a conjunction op, or as a single LF, if there is only one. * LF chunks are preserved on satops, as are alts (exclusive disjunctions) * and opts (optional parts). * A runtime exception is thrown if the LF cannot be flattened. */ @SuppressWarnings("unchecked") public static LF flattenLF(LF lf) { List preds = flatten(lf); if (preds.size() == 1) { return (LF) preds.get(0); } else { return new Op(Op.CONJ, (List)preds); } } /** * Returns a list of predications from the given LF, which is assumed to be either * a conjunction of elementary predications or a single elementary predication. 
*/ public static List getPreds(LF lf) { if (lf instanceof Op && ((Op)lf).getName().equals(Op.CONJ)) { List args = ((Op)lf).getArguments(); List retval = new ArrayList(args.size()); for (LF arg : args) retval.add((SatOp)arg); return retval; } else { List retval = new ArrayList(1); retval.add((SatOp)lf); return retval; } } /** * Returns the first elementary predication from the given LF, which is assumed to be either * a conjunction of elementary predications or a single elementary predication; * otherwise returns null. */ public static SatOp getFirstPred(LF lf) { if (lf instanceof SatOp) return (SatOp) lf; if (lf instanceof Op && ((Op)lf).getName().equals(Op.CONJ)) { List args = ((Op)lf).getArguments(); return (SatOp) args.get(0); } return null; } /** * Returns a flattened, sorted list of elementary preds from the given LF * as a list. * LF chunks are preserved on satops, as are alts (exclusive disjunctions) * and opts (optional parts). * Chunks, alts and opts are propagated through shared nominals. * A runtime exception is thrown if the LF cannot be flattened. */ public static List flatten(LF lf) { List retval = new Flattener().flatten(lf); sort(retval); return retval; } /** * Returns the first elementary predication in the flattened LF. * A runtime exception is thrown if the LF cannot be flattened. */ public static LF firstEP(LF lf) { List preds = new Flattener().flatten(lf); return preds.get(0); } /** * Sets the origin of the elementary preds in the given LF (if any). */ public static void setOrigin(LF lf, LexSemOrigin origin) { if (lf == null) return; if (lf instanceof SatOp) ((SatOp)lf).setOrigin(origin); else if (lf instanceof Op && ((Op)lf).getName().equals(Op.CONJ)) { List args = ((Op)lf).getArguments(); for (LF arg : args) { if (arg instanceof SatOp) ((SatOp)arg).setOrigin(origin); } } } /** * Returns a map from nominals to index positions for the first EP for * that nominal in a sorted list of elementary predications. */ public static Map nomIndex(List preds) { HashMap retval = new HashMap(preds.size()/2); for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); Nominal nom = pred._nominal; if (!retval.containsKey(nom)) retval.put(nom, i); } return retval; } /** * Returns whether a nominal is a root in the list of EPs using a linear search. */ public static boolean isRoot(Nominal nom, List preds) { for (SatOp pred : preds) { Nominal child = getSecondaryNominal(pred); if (child != null && child.equals(nom)) return false; } return true; } //----------------------------------------------------------------- // lexical dependencies /** Returns the unfilled lexical dependencies for a lexical item's LF. */ public static List getUnfilledLexDeps(LF lf) { if (lf == null) return Collections.emptyList(); return LexDependency.unfilledLexDeps(getPreds(lf)); } /** * Returns the filled lexical dependencies from those in the unfilled list * by checking the sign's LF for ones that have become filled, removing the * corresponding no longer unfilled deps. */ public static List getFilledLexDeps(List unfilled, LF lf) { if (lf == null) return Collections.emptyList(); return LexDependency.filledLexDeps(unfilled, getPreds(lf)); } /** * Returns the semantic features (attribute-value preds) for the given nominal * in the given LF. 
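 *
 * Typical use, mirroring the definiteness check in LexDepFeatureExtractor (npSign is assumed
 * to be a sign with an NP category):
 * <pre>{@code
 * Nominal npNom = npSign.getCategory().getIndexNominal();
 * List<SatOp> semFeats = HyloHelper.getSemFeatsForHead(npNom, npSign.getCategory().getLF());
 * }</pre>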
*/ public static List getSemFeatsForHead(Nominal nominal, LF lf) { if (nominal == null || lf == null) return Collections.emptyList(); List retval = new ArrayList(3); for (SatOp pred : getPreds(lf)) { if (nominal.equals(pred._nominal) && isAttrPred(pred)) retval.add(pred); } return retval; } //----------------------------------------------------------------- // compacting /** Composes compact and convertNominals. */ public static LF compactAndConvertNominals(LF lf, Nominal root) { LF retval = compact(lf, root); convertNominals(retval); return retval; } /** Composes compact and convertNominals with a root sign, for conversion using word positions. */ public static LF compactAndConvertNominals(LF lf, Nominal root, Sign rootSign) { root = convertNominals(lf, rootSign, root); LF retval = compact(lf, root); return retval; } /** * Returns a compacted LF from the given flattened one. * A root nominal may also be given (otherwise null). * Nominals with multiple parents are kept separate. * If there are any duplicate predications, an attempt * is made to attach them in different locations. */ public static LF compact(LF lf, Nominal root) { return Compacter.compact(lf, root); } //----------------------------------------------------------------- // convert nominals /** Converts nominal vars to atoms, renaming them based on lexical propositions. */ public static void convertNominals(LF lf) { Converter.convertNominals(lf); } /** * Converts nominal vars to atoms, renaming them based on word position, if * a root sign is given, otherwise using lexical propositions; * returns the converted nominal root. */ public static Nominal convertNominals(LF lf, Sign root, Nominal nominalRoot) { return Converter.convertNominals(lf, root, nominalRoot); } /** * Converts nominal atoms back to vars, returning the converted nominal root. * The LF is assumed to be flattened to elementary predications. */ public static Nominal convertNominalsToVars(LF lf, Nominal nominalRoot) { return Converter.convertNominalsToVars(getPreds(lf), nominalRoot); } //----------------------------------------------------------------- // append /** * Returns a the conjunction of the two LFs, either * as a conjunction op, or as a single LF, if one is null. * If either LF is itself a conj op, its elements are appended * instead of the conj op itself. * If both LFs are null, null is returned. */ public static LF append(LF lf1, LF lf2) { // set up new list int size = 0; List args1 = null; if (lf1 instanceof Op && ((Op)lf1).getName().equals(Op.CONJ)) { args1 = ((Op)lf1).getArguments(); size += args1.size(); } else if (lf1 != null) { size++; } List args2 = null; if (lf2 instanceof Op && ((Op)lf2).getName().equals(Op.CONJ)) { args2 = ((Op)lf2).getArguments(); size += args2.size(); } else if (lf2 != null) { size++; } List combined = new ArrayList(size); // add to new list if (args1 != null) { combined.addAll(args1); } else if (lf1 != null) { combined.add(lf1); } if (args2 != null) { combined.addAll(args2); } else if (lf2 != null) { combined.add(lf2); } // return if (combined.isEmpty()) { return null; } else if (combined.size() == 1) { return combined.get(0); } else { return new Op(Op.CONJ, combined); } } //----------------------------------------------------------------- // sort /** * Sorts the list of elementary predications in a conj op, * or does nothing if the LF is not a conj op. 
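 *
 * The comparator below orders preds by principal nominal, then by predication type (lexical,
 * then attribute-value, then relation preds), then by relation sort index and name, and
 * finally by secondary nominal. An illustrative result for preds sharing nominal w1
 * (relation and feature names made up):
 * <pre>
 *   @w1(want)        lexical pred
 *   @w1(<num>sg)     attribute-value pred
 *   @w1(<Det>d1)     relation pred
 * </pre>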
*/ public static void sort(LF lf) { if (lf instanceof Op && ((Op)lf).getName().equals(Op.CONJ)) { sort(((Op)lf).getArguments()); } } /** * Sorts a list of elementary predications. */ public static void sort(List preds) { Collections.sort(preds, predComparator); } // compares elementary predications private static final Comparator predComparator = new Comparator() { public int compare(LF lf1, LF lf2){ // sort first on principal nominal int nomCompare = getPrincipalNominal(lf1).compareTo(getPrincipalNominal(lf2)); if (nomCompare != 0) return nomCompare; // sort next on type of elementary predication int typeCompare = epType(lf1).compareTo(epType(lf2)); if (typeCompare != 0) return typeCompare; // then on lex pred if (isLexPred(lf1)) { return getLexPred(lf1).compareToIgnoreCase(getLexPred(lf2)); } // then rels String rel1 = getRel(lf1); String rel2 = getRel(lf2); Lexicon theLexicon = Grammar.theGrammar.lexicon; Integer rel1Index = theLexicon.getRelationSortIndex(rel1); Integer rel2Index = theLexicon.getRelationSortIndex(rel2); int relIndexCompare = rel1Index.compareTo(rel2Index); if (relIndexCompare != 0) return relIndexCompare; int relCompare = rel1.compareToIgnoreCase(rel2); if (relCompare != 0) return relCompare; // then secondary nominal if (isRelPred(lf1)) { return getSecondaryNominal(lf1).compareTo(getSecondaryNominal(lf2)); } // otherwise 0 return 0; } }; // order of elementary predication type private static Integer epType(LF lf) { if (isLexPred(lf)) return LEX_PRED; else if (isAttrPred(lf)) return ATTR_PRED; else if (isRelPred(lf)) return REL_PRED; // shouldn't happen else return null; } private static Integer LEX_PRED = new Integer(1); private static Integer ATTR_PRED = new Integer(2); private static Integer REL_PRED = new Integer(3); //----------------------------------------------------------------- // check /** * Checks the list of elementary predications in a conj op * for well-formedness, or does nothing if the LF is not a conj op. * A UnifyFailure exception is thrown if the check fails. * The only current check is that there is no more than one lexical * predication per nominal. * The list of predications is assumed to be already sorted. */ public static void check(LF lf) throws UnifyFailure { if (lf instanceof Op && ((Op)lf).getName().equals(Op.CONJ)) { check(((Op)lf).getArguments()); } } private static void check(List preds) throws UnifyFailure { for (int i = 0; i < preds.size()-1; i++) { LF lf1 = preds.get(i); LF lf2 = preds.get(i+1); if (isLexPred(lf1) && isLexPred(lf2) && getPrincipalNominal(lf1).equals(getPrincipalNominal(lf2))) { throw new UnifyFailure(); } } } } ================================================ FILE: src/opennlp/ccg/hylo/HyloVar.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; import gnu.trove.*; /** * A class for objects which can stand for any HyloFormula object. * Types are unified with other hylo vars, and with other hylo formulas when present. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.17 $, $Date: 2009/07/17 04:23:30 $ **/ public class HyloVar extends HyloFormula implements Variable, Indexed { private static final long serialVersionUID = 3455577234911944031L; protected final String _name; protected int _index; protected int _hashCode; protected SimpleType type; public HyloVar(String name) { this(name, 0, null); } public HyloVar(String name, SimpleType st) { this(name, 0, st); } protected HyloVar(String name, int index, SimpleType st) { _name = name; _index = index; type = (st != null) ? st : Grammar.theGrammar.types.getSimpleType(Types.TOP_TYPE); _hashCode = _name.hashCode() + _index + type.getIndex(); } public String name() { return _name; } public LF copy() { return new HyloVar(_name, _index, type); } public int getIndex() { return _index; } public void setIndex(int index) { _hashCode += index - _index; _index = index; } public SimpleType getType() { return type; } public boolean occurs(Variable var) { return equals(var); } public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof HyloVar)) return false; HyloVar var = (HyloVar) o; return _index == var._index && _name.equals(var._name) && type.equals(var.type); } public int compareTo(HyloVar hv) { int retval = _name.compareTo(hv._name); if (retval == 0) { if (_index < hv._index) { retval = -1; } else if (_index > hv._index) { retval = 1; } } return retval; } public Object unify(Object u, Substitution sub) throws UnifyFailure { // with nominal vars, reverse direction of unification if (u instanceof NominalVar) return ((NominalVar)u).unify(this, sub); // check for equality with u if (equals(u)) return this; // make sure u is an LF if (!(u instanceof LF)) throw new UnifyFailure(); LF lf = (LF) u; // check type compatibility, if present SimpleType st = null; if (lf.getType() != null) st = (SimpleType) type.unify(lf.getType(), sub); // with hylo vars, substitute according to type specificity then comparison order, // so that the direction of unification doesn't matter if (u instanceof HyloVar) { HyloVar u_hv = (HyloVar)u; // equal types, use comparison order if (type.equals(u_hv.getType())) { if (compareTo(u_hv) >= 0) return sub.makeSubstitution(this, u_hv); else return sub.makeSubstitution(u_hv, this); } // unequal types, use most specific one if (type.equals(st)) return sub.makeSubstitution(u_hv, this); if (u_hv.getType().equals(st)) return sub.makeSubstitution(this, u_hv); // otherwise make new hylo var with intersection type, // name based on comparison order and index, and new index String name = (compareTo(u_hv) >= 0) ? 
(u_hv._name + u_hv._index) : (_name + this._index); HyloVar hv_st = new HyloVar(name, UnifyControl.getUniqueVarIndex(), st); // and subst both sub.makeSubstitution(u_hv, hv_st); return sub.makeSubstitution(this, hv_st); } // with props, check for more specific type if (u instanceof Proposition) { Proposition prop = (Proposition) u; // if no or same type, just subst if (st == null || prop.getType().equals(st)) return sub.makeSubstitution(this, prop); // otherwise subst prop with name of type Proposition prop_st = new Proposition(st.getName(), st); return sub.makeSubstitution(this, prop_st); } // otherwise, do occurs check ... if (((LF)u).occurs(this)) throw new UnifyFailure(); // and then go ahead and substitute return sub.makeSubstitution(this, u); } public Object fill(Substitution sub) throws UnifyFailure { Object val = sub.getValue(this); if (val != null) { return val; } else { return this; } } public String toString() { String retval = _name+"_"+_index; if (!type.getName().equals(Types.TOP_TYPE)) retval += ":" + type.getName(); return retval; } /** Returns the name with the type separated by a colon if the type is not the top type. */ public String nameWithType() { String retval = _name; if (!type.getName().equals(Types.TOP_TYPE)) retval += ":" + type.getName(); return retval; } /** * Returns a pretty-printed string of this LF, with the given indent. */ public String prettyPrint(String indent) { return toString(); } /** Returns a hash code based on the name, index and type. */ public int hashCode() { return _hashCode; } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { // see if this already in map if (varMap.containsKey(this)) return varMap.get(this); // otherwise add it int next = varMap.size() + 1; varMap.put(this, next); return next; } /** * Returns whether this var equals the given object up to variable names, * using the given maps from vars to ints. * (Note that the name and index may differ, but the types must be equal.) */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } HyloVar hv = (HyloVar) obj; if (varMap.get(this) != varMap2.get(hv)) return false; if (!this.type.equals(hv.type)) return false; return true; } /** * Returns an XML representation of this LF. */ public Element toXml() { Element retval = new Element("var"); retval.setAttribute("name", nameWithType()); return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/LexDepFeatureExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011-3 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import java.util.*; import opennlp.ccg.lexicon.Word; import opennlp.ccg.perceptron.*; import opennlp.ccg.synsem.*; import opennlp.ccg.util.TrieMap; /** * A class for extracting lexical dependency features. This class * implements the features in White and Rajkumar's EMNLP-12 paper * * Minimal Dependency Length in Realization Ranking. * * Features are extracted lazily for efficiency, using a prefix of "ld" * for "lexical dependency". * * Features potentially involve the word, POS tag, word class, definiteness, syntactic * complexity, short-long order, and total dependency length, where the latter * three can be controlled by the appropriate include* flags; of * these, only dependency length is included by default. * * Word class is based on the semantic class, a check for color terms, common suffixes, * and the presence of a hyphen or capitalization. * * The checks for definite NPs, color terms and common suffixes are done by methods with * defaults for the English CCGbank, which can be overridden in subclasses or * reconfigured in the case of color terms and suffixes. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2011/11/10 15:12:53 $ */ public class LexDepFeatureExtractor implements FeatureExtractor { /** Feature prefix constant: "ld". */ public static final String PREFIX = "ld"; /** Head precedes dep constant: "hpd". */ public static final String HEAD_PRECEDES_DEP = "hpd"; /** Dep precedes head constant: "dph". */ public static final String DEP_PRECEDES_HEAD = "dph"; /** Left of head sibs precedence constant: "lsp". */ public static final String LEFT_SIBS_PRECEDENCE = "lsp"; /** Right of head sibs precedence constant: "rsp". */ public static final String RIGHT_SIBS_PRECEDENCE = "rsp"; /** Returns the appropriate siblings precedence constant for the head-dep order constant. */ public static String sibPrecedenceForDep(String depConst) { return (depConst == DEP_PRECEDES_HEAD) ? LEFT_SIBS_PRECEDENCE : RIGHT_SIBS_PRECEDENCE; } /** Definiteness constant: "def1". */ public static final String DEF = "def1"; /** Indefiniteness constant: "def0". */ public static final String INDEF = "def0"; /** Class constant PRO. */ public static final String CLASS_PRO = "PRO"; /** Class constant COLOR. */ public static final String CLASS_COLOR = "COLOR"; /** Class constant HYPH. */ public static final String CLASS_HYPH = "HYPH"; /** Class constant CAP. */ public static final String CLASS_CAP = "CAP"; /** Class constant NIL. */ public static final String CLASS_NIL = "NIL"; /** Flag for whether to include syntactic complexity ordering features (defaults to false). */ public boolean includeComplexityFeats = false; /** Complexity ordering constant for verb presence: "1v". */ public static final String HAS_V = "1v"; /** Complexity ordering constant for verb presence: "0v". */ public static final String NO_V = "0v"; /** Complexity ordering constant for punct presence: "1p". */ public static final String HAS_P = "1p"; /** Complexity ordering constant for punct absence: "0p". */ public static final String NO_P = "0p"; /** Flag for whether to include short-long features (defaults to false). */ public boolean includeShortLong = false; /** Short-long order constant: "sl". */ public static final String SHORT_LONG_ORDER = "sl"; /** Long-short order constant: "ls". */ public static final String LONG_SHORT_ORDER = "ls"; /** Flag for whether to include global dependency length features (defaults to true). 
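 * When this flag is set, each newly filled dependency contributes its length, as computed by
 * depLen() below (the number of intervening non-punctuation words), to the single "$deplen"
 * feature; for instance, a head at word index 2 and a dependent at index 6 separated by a
 * comma and two other words contributes 2.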
*/ public boolean includeDepLen = true; /** Global dependency length feature constant: "$deplen". */ public static final String DEPLEN = "$deplen"; /** Conditional lazy extractor, for lazily extracting a feature subject to a test. */ public static abstract class ConditionalLazyExtractor { abstract boolean test(); List> lazyExtractor = new ArrayList>(5); } /** Conditional lazy evaluator, for lazily extracting a feature and its value, subject to a test. */ public static abstract class ConditionalLazyEvaluator extends ConditionalLazyExtractor { abstract float eval(); } /** Feature map wrapper, for unique retrieval from a sign's data objects. */ public static class FeatureMapWrapper { public FeatureMap featureMap; public FeatureMapWrapper(FeatureMap featureMap) { this.featureMap = featureMap; } } /** The alphabet. */ protected Alphabet alphabet = null; /** Current feature map. */ protected FeatureMap currentMap = null; /** Current sign (for extracting features). */ protected Sign currentSign = null; /** Current input signs (for extracting features). */ protected Sign[] currentInputs = null; /** Current dependency (for extracting features). */ protected LexDependency currentDep = null; /** Current head index. */ protected int currentHeadIndex = -1; /** Current dependent index. */ protected int currentDepIndex = -1; /** Current sibling dependency (for extracting features). */ protected LexDependency currentSib = null; /** Current sibling dependent index. */ protected int currentSibIndex = -1; /** Current head broad POS (for extracting features). */ protected String currentHeadBroadPOS = null; /** Current head-dependent order (for extracting features). */ protected String currentHeadDepOrder = null; /** Current siblings precedence relation (for extracting features). */ protected String currentSibsPrecedence = null; /** Current dep prececes sib flag (for extracting features). */ protected boolean currentDepPrecedesSib = false; /** Current dep sign (for extracting features). */ protected Sign currentDepSign = null; /** Current sib sign (for extracting features). */ protected Sign currentSibSign = null; /** Current dep phrase lengths (for extracting features). */ protected PhraseLengths currentDepLengths = null; /** Current sib phrase lengths (for extracting features). */ protected PhraseLengths currentSibLengths = null; /** Current difference in lengths between second and first siblings (for extracting features). */ protected PhraseLengths currentLengthsDiff = null; ///** Lexical feature extractors. */ //protected List>> lexExtractors = new ArrayList>>(); /** Dependency order feature extractors. */ protected List>> depOrderExtractors = new ArrayList>>(); /** Siblings order feature extractors. */ protected List>> sibsOrderExtractors = new ArrayList>>(); /** Siblings order conditional feature extractors. */ protected List sibsOrderCondExtractors = new ArrayList(); /** Siblings complexity order conditional feature extractors. */ protected List sibsComplexityOrderCondExtractors = new ArrayList(); /** Short-long conditional feature evaluators. */ protected List shortLongCondEvaluators = new ArrayList(); /** Global dependency length keys. */ protected List depLenKeys = new ArrayList(1); /** Global dependency length feature. */ protected Alphabet.Feature depLenFeat = null; /** Constructor. 
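 * The constructor just registers the lazy feature extractors and the dependency length key;
 * typical use is then along these lines (a sketch, with the alphabet and sign assumed to come
 * from the perceptron model setup):
 * <pre>{@code
 * LexDepFeatureExtractor extractor = new LexDepFeatureExtractor();
 * extractor.includeShortLong = true;   // optionally enable a non-default feature group
 * extractor.setAlphabet(alphabet);
 * FeatureVector fv = extractor.extractFeatures(sign, true);
 * }</pre>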
*/ public LexDepFeatureExtractor() { // init lazy feature extractors depOrderExtractors.add(head_dep_order_words()); depOrderExtractors.add(head_dep_order_pos()); depOrderExtractors.add(head_dep_order_word_pos()); depOrderExtractors.add(head_dep_order_pos_word()); sibsOrderExtractors.add(sibs_precedence_words()); sibsOrderExtractors.add(sibs_precedence_word_pos()); sibsOrderExtractors.add(sibs_precedence_pos_word()); sibsOrderCondExtractors.add(sibs_precedence_pos()); sibsOrderExtractors.add(sibs_precedence_word_class()); sibsOrderExtractors.add(sibs_precedence_class_word()); sibsOrderCondExtractors.add(sibs_precedence_class()); sibsOrderCondExtractors.add(sibs_precedence_rels()); sibsOrderCondExtractors.add(sibs_precedence_defs()); // init complexity feature extractors sibsComplexityOrderCondExtractors.add(sibs_precedence_verbs()); sibsComplexityOrderCondExtractors.add(sibs_precedence_puncts()); // init short-long feature evaluators shortLongCondEvaluators.add(short_long_words()); // init dep len keys depLenKeys.add(DEPLEN); } /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet) { this.alphabet = alphabet; } /** Returns the features for the given sign and completeness flag. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { addFeatures(sign, complete); return getFeatureMap(sign); } /** Recursively adds features to the feature map for the given sign, if not already present. */ protected void addFeatures(Sign sign, boolean complete) { // check for existing map, otherwise make one if (getFeatureMap(sign) != null) return; // lex case if (sign.isLexical()) { currentSign = sign; currentMap = new FeatureMap(0); //inc(lexExtractors); } // non-terminal else { Sign[] inputs = sign.getDerivationHistory().getInputs(); // first recurse for (Sign child : inputs) addFeatures(child, false); // use input maps in making current map currentSign = sign; currentInputs = inputs; if (inputs.length == 1) { currentMap = new FeatureMap(getFeatureMap(inputs[0])); } else if (inputs.length == 2) { currentMap = new FeatureMap(getFeatureMap(inputs[0]), getFeatureMap(inputs[1])); } // do each newly filled dep for (LexDependency dep : sign.getFilledDeps()) { currentDep = dep; currentHeadBroadPOS = getHeadBroadPOS(dep); setDepIndexes(dep); currentHeadDepOrder = getHeadDepOrder(); inc(depOrderExtractors); // do dep len if (includeDepLen) { Alphabet.Feature f = getDepLenFeat(); if (f != null) currentMap.add(f, (float)depLen()); } // do order for each sib on the same side of the same head for (LexDependency sib : sign.getSiblingFilledDeps()) { currentSib = sib; if (dep.lexHead != sib.lexHead) continue; setSibIndex(sib); if (currentHeadDepOrder != getHeadSibOrder()) continue; currentSibsPrecedence = sibPrecedenceForDep(currentHeadDepOrder); currentDepPrecedesSib = depPrecedesSib(); inc(sibsOrderExtractors); incCond(sibsOrderCondExtractors); // do complexity, length feats if (includeComplexityFeats || includeShortLong) { setLengthsDiff(); if (currentLengthsDiff != null) { if (includeComplexityFeats) incCond(sibsComplexityOrderCondExtractors); if (includeShortLong) addCond(shortLongCondEvaluators); } } } } } // store it storeFeatureMap(sign); } /** Stores the current feature map as a data object in the given sign. */ protected void storeFeatureMap(Sign sign) { sign.addData(new FeatureMapWrapper(currentMap)); } /** Returns the feature map for this extractor from the given sign (null if none). 
*/ protected FeatureMap getFeatureMap(Sign sign) { FeatureMapWrapper fmw = (FeatureMapWrapper)sign.getData(FeatureMapWrapper.class); return (fmw != null) ? fmw.featureMap : null; } /** * Increments the count of the given features, if relevant. */ protected void inc(List>> extractors) { for (List> lazyExtractor : extractors) { Alphabet.Feature f = alphabet.indexLazy(lazyExtractor); if (f != null) currentMap.inc(f); } } /** * Increments the count of the given conditional features, if relevant. */ protected void incCond(List condExtractors) { for (ConditionalLazyExtractor condExtractor : condExtractors) { if (condExtractor.test()) { Alphabet.Feature f = alphabet.indexLazy(condExtractor.lazyExtractor); if (f != null) currentMap.inc(f); } } } /** * Adds to the values of the given conditional features, if relevant. */ protected void addCond(List condEvaluators) { for (ConditionalLazyEvaluator condEvaluator : condEvaluators) { if (condEvaluator.test()) { Alphabet.Feature f = alphabet.indexLazy(condEvaluator.lazyExtractor); if (f != null) currentMap.add(f, condEvaluator.eval()); } } } //------------------------------------ // utility functions // returns up to the first two chars of the head POS private String getHeadBroadPOS(LexDependency dep) { String pos = dep.lexHead.getPOS(); String retval = pos; if (pos.length() > 2) retval = pos.substring(0, 2).intern(); return retval; } // sets the current head and dep indexes, and the dep sign private void setDepIndexes(LexDependency dep) { currentHeadIndex = currentSign.wordIndex(dep.lexHead); currentDepIndex = currentSign.wordIndex(dep.lexDep); currentDepSign = currentSign.getSignHeadedByDep(dep); } // returns the head-dependent order private String getHeadDepOrder() { return (currentHeadIndex < currentDepIndex) ? HEAD_PRECEDES_DEP : DEP_PRECEDES_HEAD; } // sets the current sib index and the sib sign private void setSibIndex(LexDependency sib) { currentSibIndex = currentSign.wordIndex(sib.lexDep); currentSibSign = currentSign.getSignHeadedByDep(sib); } // returns the head--sibling dependent order private String getHeadSibOrder() { return (currentHeadIndex < currentSibIndex) ? HEAD_PRECEDES_DEP : DEP_PRECEDES_HEAD; } // returns whether the dep precedes the sib private boolean depPrecedesSib() { return currentDepIndex < currentSibIndex; } //------------------------------------ // definiteness functions /** Class for storing whether a sign is or immediately contains a definite NP. */ public static class DefiniteNP { /** Definiteness value; null means not an NP (or NP parent). */ public Boolean def; public DefiniteNP(Boolean def) { this.def = def; } public String toString() { return "defNP: " + def; } } /** Returns the definite NP status for a sign, caching it in the sign. 
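 * A null {@code def} value means the sign neither is nor immediately contains an NP; otherwise
 * defConstant maps the result to the definiteness constant. Sketch:
 * <pre>{@code
 * DefiniteNP defNP = getDefiniteNP(sign);
 * if (defNP.def != null) {
 *     String defFeat = defConstant(defNP);   // "def1" or "def0"
 * }
 * }</pre>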
*/ public DefiniteNP getDefiniteNP(Sign sign) { // check cached DefiniteNP defNP = (DefiniteNP)sign.getData(DefiniteNP.class); if (defNP != null) return defNP; // determine def NP status Boolean def = null; // check for NP Sign npSign = getSignOrChildSignAsNP(sign); if (npSign != null) { // set status to definite by default; check for indef def = Boolean.TRUE; // get sem feats Nominal npNom = npSign.getCategory().getIndexNominal(); List semFeats = HyloHelper.getSemFeatsForHead(npNom, npSign.getCategory().getLF()); // check for nil for (SatOp feat : semFeats) { if (isIndefFeat(feat)) { def = Boolean.FALSE; break; } } // otherwise check deps if (def) { // get all deps List allDeps = new ArrayList(5); allDeps.addAll(npSign.getFilledDeps()); allDeps.addAll(npSign.getSiblingFilledDeps()); // check for a|an|some|any for (LexDependency dep : allDeps) { if (isIndefDep(dep)) { def = Boolean.FALSE; break; } } } } // store result and return defNP = new DefiniteNP(def); sign.addData(defNP); return defNP; } /** * Returns the given sign if it's an NP sign; * otherwise returns the first child sign that's an NP sign; * otherwise returns null. */ protected Sign getSignOrChildSignAsNP(Sign sign) { if (isNP(sign)) return sign; if (sign.isLexical()) return null; Sign[] inputs = sign.getDerivationHistory().getInputs(); for (int i = 0; i < inputs.length; i++) { if (isNP(inputs[i])) return inputs[i]; } return null; } /** * Returns whether the given sign is an NP. * The default implementation tests for a category type of "np". */ protected boolean isNP(Sign sign) { Category cat = sign.getCategory(); if (!(cat instanceof AtomCat)) return false; AtomCat ac = (AtomCat) cat; return (ac.getType().equals("np")); } /** * Returns whether the given semantic features signals indefiniteness. * The default implementation tests for <:det>nil. */ protected boolean isIndefFeat(SatOp feat) { return HyloHelper.getRel(feat).equals("det") && "nil".equals(HyloHelper.getVal(feat)); } /** * Returns whether the given lexical dependency signals indefiniteness. * The default implementation checks for <Det>a|an|any|some. */ protected boolean isIndefDep(LexDependency dep) { if (dep.rel.equalsIgnoreCase("Det")) { String form = dep.lexDep.getWordForm(); if (form=="a" || form=="an" || form=="any" || form=="some") return true; } return false; } /** Returns whether two signs differ in definiteness. */ public boolean defDifference(Sign sign1, Sign sign2) { DefiniteNP defNP1 = getDefiniteNP(sign1); if (defNP1.def == null) return false; DefiniteNP defNP2 = getDefiniteNP(sign2); if (defNP2.def == null) return false; return defNP1.def != defNP2.def; } /** Returns the appropriate definiteness/indefiniteness constant. */ public String defConstant(DefiniteNP defNP) { return (defNP.def) ? DEF : INDEF; } //------------------------------------ // phrase length functions /** * Class for storing length of phrase in words, puncts and (finite) verbs, for unique retrieval from a sign's data objects. */ public static class PhraseLengths { public int wordlen, punctlen, verblen; public PhraseLengths(int wordlen, int punctlen, int verblen) { this.wordlen = wordlen; this.punctlen = punctlen; this.verblen = verblen; } public String toString() { return " wordlen: " + wordlen + " punctlen: " + punctlen + " verblen: " + verblen; } } /** Returns the phrase lengths for a sign, caching them in the sign. 
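 * <p>A sketch of the counts, assuming {@code phraseSign} (placeholder) covers the words
 * "the dog , which slept" with "slept" tagged VBD:
 * <pre>{@code
 * PhraseLengths len = getPhraseLengths(phraseSign);
 * // len.wordlen  == 5  (every word, punctuation included)
 * // len.punctlen == 1  (the comma, per isPunct)
 * // len.verblen  == 1  ("slept", per isVerb's finite-verb POS check)
 * }</pre>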
*/ public PhraseLengths getPhraseLengths(Sign sign) { // check cached PhraseLengths lengths = (PhraseLengths)sign.getData(PhraseLengths.class); if (lengths != null) return lengths; int wordlen = 0, punctlen = 0, verblen = 0; // lex case if (sign.isLexical()) { for (Word w: sign.getWords()) { wordlen++; if (isPunct(w)) punctlen++; if (isVerb(w)) verblen++; } } // non-lex: add child lengths else { Sign[] inputs = sign.getDerivationHistory().getInputs(); for (int i = 0; i < inputs.length; i++) { PhraseLengths lengthsI = getPhraseLengths(inputs[i]); wordlen += lengthsI.wordlen; punctlen += lengthsI.punctlen; verblen += lengthsI.verblen; } } // store result and return lengths = new PhraseLengths(wordlen, punctlen, verblen); sign.addData(lengths); return lengths; } /** * Returns whether a word is a punctuation mark that typically signals sentence-internal complexity. * The default implementation tests for commas, dashes (--), semi-colons and colons. */ protected boolean isPunct(Word word) { // NB: in principle could use POS, but sometimes punctuation marks seem to end up with IN as the POS tag String form = word.getForm(); return (form == "," || form == "--" || form == ";" || form == ":"); } /** * Returns whether a word is a verb that indicates a substantial clause. * The default implementation tests for the finite verb POS tags VBD, VBP and VBZ. */ protected boolean isVerb(Word word) { String pos = word.getPOS(); return (pos == "VBD" || pos == "VBP" || pos == "VBZ"); } /** * Sets the differences in length between the signs headed by the current dep and sib, or null if none; * also sets the current dep and sib lengths. The lengths are set to the lengths of the second sign * minus those of the first sign. */ protected void setLengthsDiff() { // reset currentLengthsDiff = null; currentDepLengths = null; currentSibLengths = null; // ensure both there if (currentDepSign == null || currentSibSign == null) return; // get phrase lengths currentDepLengths = getPhraseLengths(currentDepSign); currentSibLengths = getPhraseLengths(currentSibSign); // get 1st and 2nd phrase lengths PhraseLengths pl1 = (currentDepPrecedesSib) ? currentDepLengths : currentSibLengths; PhraseLengths pl2 = (currentDepPrecedesSib) ? currentSibLengths : currentDepLengths; // set diff to 2nd - 1st currentLengthsDiff = new PhraseLengths(pl2.wordlen-pl1.wordlen, pl2.punctlen-pl1.punctlen, pl2.verblen-pl1.verblen); } //------------------------------------ // dep len functions /** Returns the dep len feature if not already set. */ protected Alphabet.Feature getDepLenFeat() { if (depLenFeat == null) depLenFeat = alphabet.index(depLenKeys); return depLenFeat; } /** * Returns the dependency length between the current head and the current dependent. * The default implementation returns the number of intervening words excluding * punctuation (as determined by isPunct), and doesn't count each word in a collapsed NE separately. */ protected int depLen() { List words = currentSign.getWords(); int min = Math.min(currentHeadIndex, currentDepIndex); int max = Math.max(currentHeadIndex, currentDepIndex); int count = 0; for (int i=min+1; i < max; i++) { Word w = words.get(i); if (!isPunct(w)) count++; } return count; } //------------------------------------ // word class functions /** * Returns a class for the word, or CLASS_NIL if none. 
* The default implementation returns one of the following, in this order: * the semantic class of the word; * CLASS_PRO, if a pronoun; * CLASS_COLOR, if a color word; * the suffix, if getSuffix returns a value; * CLASS_HYPH, if the word is hyphenated; * CLASS_CAP, if capitalized; * or CLASS_NIL, otherwise. * The word class is cached using cachedWordClasses. */ protected String getWordClass(Word word) { String retval = cachedWordClasses.get(word); if (retval != null) return retval; String wClass = word.getSemClass(); if (wClass != null) return updateCachedWordClasses(word, wClass); if (isPro(word)) return updateCachedWordClasses(word, CLASS_PRO); String form = word.getForm(); if (colors.contains(form)) return updateCachedWordClasses(word, CLASS_COLOR); String suffix = getSuffix(form); if (suffix != null) return updateCachedWordClasses(word, suffix); if (form.indexOf('-') >= 0) return updateCachedWordClasses(word, CLASS_HYPH); if (Character.isUpperCase(form.charAt(0))) return updateCachedWordClasses(word, CLASS_CAP); return updateCachedWordClasses(word, CLASS_NIL); } /** * Returns whether a word is a pronoun. * The default implementation returns whether the POS tag starts with "PR". */ protected boolean isPro(Word word) { return word.getPOS().startsWith("PR"); } /** * The set of color words to check for in determining the word class. */ protected Set colors = defaultColors(); /** * Sets the set of color words. */ public void setColorWords(Set colorWords) { colors = colorWords; } /** * Returns the default set of color words: 11 common English colors, with two spellings of gray/grey. */ protected Set defaultColors() { String[] colors = { "black", "blue", "brown", "gray", "grey", "green", "orange", "pink", "purple", "red", "white", "yellow" }; return new HashSet(Arrays.asList(colors)); } /** * A sequence of suffixes to check for in determining the word class, ordered by specificity. */ protected String[] suffixClasses = defaultSuffixClasses(); /** * Returns the default suffix classes: 61 common English suffixes from various lists on the web. */ protected String[] defaultSuffixClasses() { return new String[] { "ancy", "aphy", "arch", "crat", "gram", "less", "logy", "ness", "nomy", "ship", "some", "sque", "tude", "ade", "age", "ant", "aph", "ary", "ast", "ate", "ble", "dom", "ent", "est", "ful", "ian", "ile", "ion", "ing", "ish", "ism", "ist", "ise", "ite", "ium", "ive", "ize", "nce", "oid", "ory", "ose", "ote", "ous", "sig", "ure", "ac", "al", "an", "cy", "ed", "en", "er", "fy", "ic", "le", "ly", "or", "se", "sy", "ty", "y" }; } /** * Sets the suffix classes, which are assumed to be interned. */ public void setSuffixClasses(String[] suffixes) { suffixClasses = suffixes; } /** * Returns a matching suffix class, or null if none. */ protected String getSuffix(String form) { for (int i=0; i < suffixClasses.length; i++) { String suff = suffixClasses[i]; if (form.length() > suff.length() && form.endsWith(suff)) return suff; } return null; } /** * Cache of word classes, using a weak hash map. */ protected WeakHashMap cachedWordClasses = new WeakHashMap(); /** * Updates the cached word classes with the given word and word class, and returns the word class. 
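 * <p>A sketch of the classes that end up cached here, assuming the words below carry no
 * semantic class and the usual Penn Treebank POS tags (all examples hypothetical; the
 * form/POS pairs are shorthand for a {@code Word}):
 * <pre>{@code
 * // getWordClass("they"/PRP)       -> CLASS_PRO    (POS starts with "PR")
 * // getWordClass("purple"/JJ)      -> CLASS_COLOR  (in the color-word set)
 * // getWordClass("happiness"/NN)   -> "ness"       (first matching suffix class)
 * // getWordClass("well-run"/JJ)    -> CLASS_HYPH   (hyphenated, no suffix match)
 * // getWordClass("London"/NNP)     -> CLASS_CAP    (capitalized, nothing else fires)
 * // getWordClass("of"/IN)          -> CLASS_NIL    (no rule fires)
 * }</pre>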
*/ protected String updateCachedWordClasses(Word word, String wordClass) { cachedWordClasses.put(word, wordClass); return wordClass; } //------------------------------------ // shared feature extractor elements // prefix: "ld" + head broad POS private void add_prefix(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return PREFIX; }}); retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentHeadBroadPOS; }}); } // head-dep order private void add_head_dep_order(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentHeadDepOrder; }}); } // rel private void add_rel(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentDep.rel.intern(); }}); } // common head-dep order elements private void add_head_dep_order_common(List> retval) { add_prefix(retval); add_head_dep_order(retval); add_rel(retval); } // head word private void add_head_word(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentDep.lexHead.getWordForm(); }}); } // head pos private void add_head_pos(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentDep.lexHead.getPOS(); }}); } // dep word private void add_dep_word(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentDep.lexDep.getWordForm(); }}); } // dep pos private void add_dep_pos(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentDep.lexDep.getPOS(); }}); } // sibs precedence private void add_sibs_precedence(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSibsPrecedence; }}); } // common sibs precedence elements private void add_sibs_precedence_common(List> retval) { add_prefix(retval); add_sibs_precedence(retval); } // sibs word1 private void add_sibs_word1(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? currentDep.lexDep.getWordForm() : currentSib.lexDep.getWordForm(); }}); } // sibs word2 private void add_sibs_word2(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? currentSib.lexDep.getWordForm() : currentDep.lexDep.getWordForm(); }}); } // sibs pos1 private void add_sibs_pos1(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? currentDep.lexDep.getPOS() : currentSib.lexDep.getPOS(); }}); } // sibs pos2 private void add_sibs_pos2(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? currentSib.lexDep.getPOS() : currentDep.lexDep.getPOS(); }}); } // sibs class1 private void add_sibs_class1(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ Sign first = (currentDepPrecedesSib) ? currentDep.lexDep : currentSib.lexDep; return getWordClass(first.getWords().get(0)); }}); } // sibs class2 private void add_sibs_class2(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ Sign second = (currentDepPrecedesSib) ? currentSib.lexDep : currentDep.lexDep; return getWordClass(second.getWords().get(0)); }}); } // sibs rel1 private void add_sibs_rel1(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? 
currentDep.rel.intern() : currentSib.rel.intern(); }}); } // sibs rel2 private void add_sibs_rel2(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? currentSib.rel.intern() : currentDep.rel.intern(); }}); } //------------------------------- // feature extractors // head-dep order words private List> head_dep_order_words() { List> retval = new ArrayList>(5); add_head_dep_order_common(retval); add_head_word(retval); add_dep_word(retval); return retval; } // head-dep order pos private List> head_dep_order_pos() { List> retval = new ArrayList>(5); add_head_dep_order_common(retval); add_head_pos(retval); add_dep_pos(retval); return retval; } // head-dep order word/pos private List> head_dep_order_word_pos() { List> retval = new ArrayList>(5); add_head_dep_order_common(retval); add_head_word(retval); add_dep_pos(retval); return retval; } // head-dep order pos/word private List> head_dep_order_pos_word() { List> retval = new ArrayList>(5); add_head_dep_order_common(retval); add_head_pos(retval); add_dep_word(retval); return retval; } // sibs precedence words private List> sibs_precedence_words() { List> retval = new ArrayList>(5); add_sibs_precedence_common(retval); add_sibs_word1(retval); add_sibs_word2(retval); return retval; } // sibs precedence word pos private List> sibs_precedence_word_pos() { List> retval = new ArrayList>(5); add_sibs_precedence_common(retval); add_sibs_word1(retval); add_sibs_pos2(retval); return retval; } // sibs precedence pos word private List> sibs_precedence_pos_word() { List> retval = new ArrayList>(5); add_sibs_precedence_common(retval); add_sibs_pos1(retval); add_sibs_word2(retval); return retval; } // sibs precedence pos private ConditionalLazyExtractor sibs_precedence_pos() { ConditionalLazyExtractor retval = new ConditionalLazyExtractor() { boolean test() { return currentDep.lexDep.getPOS() != currentSib.lexDep.getPOS(); } }; add_sibs_precedence_common(retval.lazyExtractor); add_sibs_pos1(retval.lazyExtractor); add_sibs_pos2(retval.lazyExtractor); return retval; } // sibs precedence word / class private List> sibs_precedence_word_class() { List> retval = new ArrayList>(5); add_sibs_precedence_common(retval); add_sibs_word1(retval); add_sibs_class2(retval); return retval; } // sibs precedence class / word private List> sibs_precedence_class_word() { List> retval = new ArrayList>(5); add_sibs_precedence_common(retval); add_sibs_class1(retval); add_sibs_word2(retval); return retval; } // sibs precedence class private ConditionalLazyExtractor sibs_precedence_class() { ConditionalLazyExtractor retval = new ConditionalLazyExtractor() { boolean test() { return getWordClass(currentDep.lexDep.getWords().get(0)) != getWordClass(currentSib.lexDep.getWords().get(0)); } }; add_sibs_precedence_common(retval.lazyExtractor); add_sibs_class1(retval.lazyExtractor); add_sibs_class2(retval.lazyExtractor); return retval; } // sibs precedence rels private ConditionalLazyExtractor sibs_precedence_rels() { ConditionalLazyExtractor retval = new ConditionalLazyExtractor() { boolean test() { return !currentDep.rel.equals(currentSib.rel); } }; add_sibs_precedence_common(retval.lazyExtractor); add_sibs_rel1(retval.lazyExtractor); add_sibs_rel2(retval.lazyExtractor); return retval; } // sibs precedence defs private ConditionalLazyExtractor sibs_precedence_defs() { ConditionalLazyExtractor retval = new ConditionalLazyExtractor() { boolean test() { if (currentDepSign == null || currentSibSign == null) return false; return 
defDifference(currentDepSign, currentSibSign); } }; add_sibs_precedence_common(retval.lazyExtractor); retval.lazyExtractor.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? defConstant(getDefiniteNP(currentDepSign)) : defConstant(getDefiniteNP(currentSibSign)); }}); retval.lazyExtractor.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentDepPrecedesSib) ? defConstant(getDefiniteNP(currentSibSign)) : defConstant(getDefiniteNP(currentDepSign)); }}); return retval; } // sibs precedence verbs private ConditionalLazyExtractor sibs_precedence_verbs() { ConditionalLazyExtractor retval = new ConditionalLazyExtractor() { boolean test() { return currentLengthsDiff.verblen != 0 && (currentDepLengths.verblen == 0 || currentSibLengths.verblen == 0); } }; add_sibs_precedence_common(retval.lazyExtractor); retval.lazyExtractor.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentLengthsDiff.verblen > 0) ? NO_V : HAS_V; }}); retval.lazyExtractor.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentLengthsDiff.verblen > 0) ? HAS_V : NO_V; }}); return retval; } // sibs precedence puncts private ConditionalLazyExtractor sibs_precedence_puncts() { ConditionalLazyExtractor retval = new ConditionalLazyExtractor() { boolean test() { return currentLengthsDiff.punctlen != 0 && (currentDepLengths.punctlen == 0 || currentSibLengths.punctlen == 0); } }; add_sibs_precedence_common(retval.lazyExtractor); retval.lazyExtractor.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentLengthsDiff.punctlen > 0) ? NO_P : HAS_P; }}); retval.lazyExtractor.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentLengthsDiff.punctlen > 0) ? HAS_P : NO_P; }}); return retval; } // short-long words private ConditionalLazyEvaluator short_long_words() { ConditionalLazyEvaluator retval = new ConditionalLazyEvaluator() { boolean test() { return currentLengthsDiff.wordlen != 0; } float eval() { return (float) Math.abs(currentLengthsDiff.wordlen); } }; add_sibs_precedence_common(retval.lazyExtractor); retval.lazyExtractor.add(new TrieMap.KeyExtractor(){public String getKey(){ return (currentLengthsDiff.wordlen > 0) ? SHORT_LONG_ORDER : LONG_SHORT_ORDER; }}); return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/LexDependency.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import java.util.*; import opennlp.ccg.synsem.*; /** * A class for tracking semantic dependencies between lexical items. 
* A lex dependency is a triple consisting of a lexical head, a relation * and a lexical dependent. Either the head or dependent can be null, in * which case the dependency is considered unfilled. To be equal, a lex * dependency must have identical heads and dependents and equal relations. * Relations can involve chains of individual relations (concatenated with dots) * when there are intervening nominals for the same lexical item. * * @author Michael White * @version $Revision: 1.2 $, $Date: 2011/08/26 21:31:52 $ */ public class LexDependency { /** The lexical head. */ public Sign lexHead; /** The relation. */ public String rel; /** The lexical dependent. */ public Sign lexDep; /** Constructor. */ public LexDependency(Sign lexHead, String rel, Sign lexDep) { this.lexHead = lexHead; this.rel = rel; this.lexDep = lexDep; } /** Hash code. */ public int hashCode() { int retval = rel.hashCode(); if (lexHead != null) retval += 31 * lexHead.hashCode(); if (lexDep != null) retval += 7 * lexDep.hashCode(); return retval; } /** Equals. */ public boolean equals(Object obj) { if (obj == this) return true; if (!(obj instanceof LexDependency)) return false; LexDependency dep = (LexDependency) obj; return lexHead == dep.lexHead && lexDep == dep.lexDep && rel.equals(dep.rel); } /** toString. */ public String toString() { StringBuffer retval = new StringBuffer(); if (lexHead == null) retval.append("null"); else retval.append(lexHead.getOrthography()); retval.append('<').append(rel).append('>'); if (lexDep == null) retval.append("null"); else retval.append(lexDep.getOrthography()); return retval.toString(); } /** Filled test: neither head nor dependent null. */ public boolean filled() { return lexHead != null && lexDep != null; } /** * Returns a list of unfilled dependencies for a list of EPs * for a lexical item. 
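 * <p>A sketch of the dependency triples themselves, with {@code eatSign} and {@code pizzaSign}
 * standing in for lexical signs (placeholder names):
 * <pre>{@code
 * LexDependency open   = new LexDependency(eatSign, "Arg1", null);      // dependent still unfilled
 * LexDependency filled = new LexDependency(eatSign, "Arg1", pizzaSign);
 * open.filled();       // false
 * filled.filled();     // true
 * filled.toString();   // e.g. "eat<Arg1>pizza", using each sign's orthography
 * }</pre>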
*/ public static List unfilledLexDeps(List preds) { List retval = new ArrayList(4); Map nomIndex = HyloHelper.nomIndex(preds); Set nominals = nomIndex.keySet(); // special case for indexRels if (nominals.size() == 1 && !HyloHelper.isLexPred(preds.get(0))) { Sign lexHead = null; LexSemOrigin origin = preds.get(0).getOrigin(); if (origin instanceof Sign) lexHead = (Sign) origin; else return retval; for (SatOp pred : preds) { String rel = HyloHelper.getRel(pred); if (rel != null) { // add unfilled dep with lex head as dep retval.add(new LexDependency(null, rel, lexHead)); if (HyloHelper.isRelPred(pred)) { // also add unfilled dep with lex head as head retval.add(new LexDependency(lexHead, rel, null)); } } } return retval; } // otherwise, starting with each root nominal, enumerate paths to leaf nominals for (Nominal root : nominals) { // check for root nominal if (!HyloHelper.isRoot(root, preds)) continue; // set lex head Sign lexHead = null; int rootIndex = nomIndex.get(root); SatOp rootPred = preds.get(rootIndex); if (HyloHelper.isLexPred(rootPred)) { LexSemOrigin origin = rootPred.getOrigin(); if (origin instanceof Sign) lexHead = (Sign) origin; rootIndex++; } // start path for each rel for root nom for (int i=rootIndex; i < preds.size() && HyloHelper.getPrincipalNominal(preds.get(i)).equals(root); i++) { rootPred = preds.get(i); Nominal dep = HyloHelper.getSecondaryNominal(rootPred); if (dep == null) continue; String rel = HyloHelper.getRel(rootPred); addUnfilledLexDep(dep, lexHead, rel, preds, nomIndex, retval); } } return retval; } // recursively adds unfilled lex deps to retval for leaf nominals private static void addUnfilledLexDep(Nominal dep, Sign lexHead, String rel, List preds, Map nomIndex, List retval) { // if dep not in nom index, then just add unfilled dep for the current rel if (!nomIndex.containsKey(dep)) { retval.add(new LexDependency(lexHead, rel, null)); return; } // otherwise continue with the preds for the current dep int depIndex = nomIndex.get(dep); SatOp depPred = preds.get(depIndex); // if lex head null, add unfilled dep for the current rel, // then update lex head and reset rel if (lexHead == null) { Sign lexDep = null; LexSemOrigin origin = depPred.getOrigin(); if (origin instanceof Sign) { lexDep = (Sign) origin; retval.add(new LexDependency(lexHead, rel, lexDep)); } lexHead = lexDep; rel = null; } // then recurse through further rels, if any for (int i=depIndex; i < preds.size() && HyloHelper.getPrincipalNominal(preds.get(i)).equals(dep); i++) { depPred = preds.get(i); Nominal depdep = HyloHelper.getSecondaryNominal(depPred); if (depdep == null) continue; String relrel = (rel == null) ? HyloHelper.getRel(depPred) : rel + "." + HyloHelper.getRel(depPred); addUnfilledLexDep(depdep, lexHead, relrel, preds, nomIndex, retval); } } /** * Returns the filled lexical dependencies from those in the unfilled list * by checking the list of EPs for ones that have become filled, removing the * corresponding no longer unfilled deps. 
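 * <p>A sketch of the intended calling pattern, assuming {@code unfilled} was obtained from
 * {@code unfilledLexDeps} on a lexical sign and {@code preds} is the combined EP list after
 * a derivation step (both placeholders):
 * <pre>{@code
 * List<LexDependency> nowFilled = LexDependency.filledLexDeps(unfilled, preds);
 * // deps that became filled are returned here and are also removed from 'unfilled' in place
 * }</pre>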
*/ public static List filledLexDeps(List unfilled, List preds) { List retval = new ArrayList(unfilled.size()); Map nomIndex = HyloHelper.nomIndex(preds); // check each unfilled dep for (Iterator it = unfilled.iterator(); it.hasNext(); ) { LexDependency udep = it.next(); String[] rels = udep.rel.split("\\."); // dependent missing case if (udep.lexDep == null) { // follow rels to descendant pred SatOp relPred = findPred(udep.lexHead, rels[0], preds); SatOp descendantPred = findDescendantPred(relPred, 0, rels, preds, nomIndex); // check if dep filled Nominal depnom = HyloHelper.getSecondaryNominal(descendantPred); if (!nomIndex.containsKey(depnom)) continue; SatOp depPred = preds.get(nomIndex.get(depnom)); if (HyloHelper.isLexPred(depPred)) { // remove dep from unfilled it.remove(); // add filled dep, if lexical if (depPred.getOrigin() instanceof Sign) { Sign lexDep = (Sign) depPred.getOrigin(); retval.add(new LexDependency(udep.lexHead, udep.rel, lexDep)); } } } // head missing case else if (udep.lexHead == null) { // follow rels to ancestor pred SatOp relPred = findPred(udep.lexDep, rels[rels.length-1], preds); SatOp ancestorPred = findAncestorPred(relPred, rels.length-1, rels, preds); // check if head filled Nominal headnom = HyloHelper.getPrincipalNominal(ancestorPred); if (!nomIndex.containsKey(headnom)) continue; SatOp headPred = preds.get(nomIndex.get(headnom)); if (HyloHelper.isLexPred(headPred)) { // remove dep from unfilled it.remove(); // add filled dep, if lexical if (headPred.getOrigin() instanceof Sign) { Sign lexHead = (Sign) headPred.getOrigin(); retval.add(new LexDependency(lexHead, udep.rel, udep.lexDep)); } } } } return retval; } // returns the EP with the given origin and rel, or null if not found private static SatOp findPred(Sign origin, String rel, List preds) { for (SatOp pred : preds) { if (pred.getOrigin() != origin) continue; if (rel.equals(HyloHelper.getRel(pred))) return pred; } return null; } // returns the descendant EP for the given rels, or null if not found private static SatOp findDescendantPred(SatOp current, int index, String[] rels, List preds, Map nomIndex) { if (index == rels.length-1) return current; // find EP for next rel SatOp next = null; String rel = rels[++index]; Nominal depnom = HyloHelper.getSecondaryNominal(current); for (int i=nomIndex.get(depnom); i < preds.size() && HyloHelper.getPrincipalNominal(preds.get(i)).equals(depnom); i++) { SatOp pred = preds.get(i); if (rel.equals(HyloHelper.getRel(pred))) { next = pred; break; } } if (next == null) return null; // recurse return findDescendantPred(next, index, rels, preds, nomIndex); } // returns the ancestor EP for the given rels, or null if not found private static SatOp findAncestorPred(SatOp current, int index, String[] rels, List preds) { if (index == 0) return current; // find EP for previous rel SatOp prev = null; String rel = rels[--index]; Nominal headnom = HyloHelper.getPrincipalNominal(current); for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); if (headnom.equals(HyloHelper.getSecondaryNominal(pred)) && rel.equals(HyloHelper.getRel(pred))) { prev = pred; break; } } if (prev == null) return null; // recurse return findAncestorPred(prev, index, rels, preds); } /** * Filters the first list of dependencies to those sharing a head with a dependency in the second list. 
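 * <p>A sketch, with {@code newDeps} and {@code siblingDeps} as placeholder lists; note that
 * heads are compared by object identity (the same {@code Sign} instance), not by equals:
 * <pre>{@code
 * List<LexDependency> shared = LexDependency.filterSameHead(newDeps, siblingDeps);
 * // a dep from newDeps is kept iff some dep in siblingDeps has the very same lexHead sign
 * }</pre>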
*/ public static List filterSameHead(List deps1, List deps2) { if (deps1.isEmpty() || deps2.isEmpty()) return Collections.emptyList(); List retval = new ArrayList(deps1.size()); for (LexDependency dep1 : deps1) { for (LexDependency dep2 : deps2) { if (dep1.lexHead == dep2.lexHead) { retval.add(dep1); break; } } } return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/ModalOp.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; import java.util.*; import gnu.trove.*; /** * A parent class for modal operators, such as <P>p, [F]q, and * <>(p ^ q). * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.6 $, $Date: 2009/12/21 03:27:19 $ **/ public abstract class ModalOp extends HyloFormula { private static final long serialVersionUID = 1L; protected Mode _mode; protected LF _arg; @SuppressWarnings("unchecked") protected ModalOp(Element e) { String atomLabel = e.getAttributeValue("mode"); if (atomLabel == null) atomLabel = e.getAttributeValue("m"); if (atomLabel != null) { _mode = new ModeLabel(atomLabel); _arg = HyloHelper.getLF_FromChildren(e); } else { List children = e.getChildren(); _mode = (Mode)HyloHelper.getLF((Element)children.get(0)); _arg = HyloHelper.getLF((Element)children.get(1)); } } protected ModalOp(Mode mode, LF arg) { _mode = mode; _arg = arg; } public Mode getMode() { return _mode; } public void setMode(Mode mode) { _mode = mode; } public LF getArg() { return _arg; } public void setArg(LF arg) { _arg = arg; } public void deepMap(ModFcn mf) { _arg.deepMap(mf); mf.modify(this); } public boolean occurs(Variable var) { return _mode.occurs(var) || _arg.occurs(var); } protected boolean equals(ModalOp mo) { if (_mode.equals(mo._mode) && _arg.equals(mo._arg)) { return true; } else { return false; } } protected void unifyCheck(ModalOp mo) throws UnifyFailure { _mode.unifyCheck(mo._mode); _arg.unifyCheck(mo._arg); } /** Returns a hash code based on the mode and arg. */ public int hashCode() { return _mode.hashCode() + _arg.hashCode(); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { return _mode.hashCode(varMap) + _arg.hashCode(varMap); } /** * Returns whether this modal op equals the given object * up to variable names, using the given maps from vars to ints. 
*/ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } ModalOp mo = (ModalOp) obj; return _mode.equals(mo._mode, varMap, varMap2) && _arg.equals(mo._arg, varMap, varMap2); } /** Returns the string form of this modal op, without the arg. */ abstract public String modalOpString(); public String toString() { return new StringBuffer().append(modalOpString()).append(_arg.toString()).toString(); } /** * Returns a pretty-printed string of this LF, with the given indent. */ public String prettyPrint(String indent) { // calc new indent StringBuffer ibuf = new StringBuffer(); ibuf.append(indent).append(' '); String modalOpString = modalOpString(); for (int i = 0; i < modalOpString.length(); i++) { ibuf.append(' '); } String newIndent = ibuf.toString(); // calc string StringBuffer sb = new StringBuffer(); sb.append('\n').append(indent).append(modalOpString); sb.append(_arg.prettyPrint(newIndent)); // done return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/hylo/Mode.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; /** * A interface for hybrid logic nominals, to allow polymorphism for * both Modality labels and variables over Modality labels. * * @author Jason Baldridge * @author Scott Martin * @version $Revision: 1.2 $, $Date: 2005/10/19 21:27:15 $ **/ public interface Mode extends LF { /** * Gets the name of this mode. */ String getName(); } ================================================ FILE: src/opennlp/ccg/hylo/ModeLabel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; /** * A modality label. * Types are not currently supported. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/07/17 04:23:30 $ **/ public final class ModeLabel extends HyloAtom implements Mode { private static final long serialVersionUID = -4101305505903588678L; public ModeLabel(String name) { super(name); } public LF copy() { return new ModeLabel(_name); } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (equals(u)) return this; return super.unify(u, sub); } /** * Returns an XML representation of this LF (not currently supported). * Throws a runtime exception. */ public Element toXml() { throw new RuntimeException("toXml() not currently supported for ModeLabel."); } } ================================================ FILE: src/opennlp/ccg/hylo/ModeVar.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; /** * A class for variables over ModeLabel objects. * Mode vars are not really supported at present, and * type unification is not implemented. * * @author Jason Baldridge * @author Scott Martin * @version $Revision: 1.5 $, $Date: 2009/07/17 04:23:30 $ **/ public class ModeVar extends HyloVar implements Mode { private static final long serialVersionUID = -6872985893931836901L; public ModeVar(String name) { super(name); } protected ModeVar(String name, int index, SimpleType st) { super(name, index, st); } /** * Gets the name of this mode variable. * @return This method just delegates to the {@link #name()} method. 
*/ @Override public String getName() { return name(); } public LF copy() { return new ModeVar(_name, _index, type); } public boolean equals(Object o) { if (!(o instanceof ModeVar)) return false; return super.equals(o); } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof ModeLabel) { return sub.makeSubstitution(this, u); } else if (u instanceof ModeVar) { ModeVar u_nv = (ModeVar)u; if (equals(u_nv)) return this; // substitute according to comparison order, // so that the direction of unification doesn't matter if (compareTo(u_nv) >= 0) { return sub.makeSubstitution(this, u_nv); } else { return sub.makeSubstitution(u_nv, this); } } else { throw new UnifyFailure(); } } public Object fill(Substitution sub) throws UnifyFailure { Object val = sub.getValue(this); if (val != null) { return val; } else { return this; } } /** * Returns an XML representation of this LF (not currently supported). * Throws a runtime exception. */ public Element toXml() { throw new RuntimeException("toXml() not currently supported for ModeVar."); } } ================================================ FILE: src/opennlp/ccg/hylo/Nominal.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-3 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; /** * A interface for hybrid logic nominals, to allow polymorphism for * both Nominal atoms and Nominal variables. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.6 $, $Date: 2005/11/01 19:39:27 $ **/ public interface Nominal extends LF { public String getName(); public boolean isShared(); public void setShared(boolean shared); public int compareTo(Nominal nom); } ================================================ FILE: src/opennlp/ccg/hylo/NominalAtom.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; /** * A hybrid logic nominal, an atomic formula which holds true at exactly one * point in a model. * The type is checked for compatibility during unification with nominal vars, * but it is not updated, since nominal atoms are constants. * If no type is given, the TOP type is used for backwards compatibility. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.10 $, $Date: 2009/07/17 04:23:30 $ **/ public class NominalAtom extends HyloAtom implements Nominal { private static final long serialVersionUID = -6002484920078196411L; protected boolean shared = false; public NominalAtom(String name) { this(name, null); } public NominalAtom(String name, SimpleType st) { this(name, st, false); } public NominalAtom(String name, SimpleType st, boolean shared) { super(name, st); type = (st != null) ? st : Grammar.theGrammar.types.getSimpleType(Types.TOP_TYPE); this.shared = shared; } public String getName() { return _name; } public boolean isShared() { return shared; } public void setShared(boolean shared) { this.shared = shared; } public LF copy() { return new NominalAtom(_name, type, shared); } /** Returns a hash code based on the atom name and type. */ public int hashCode() { return _name.hashCode() + type.hashCode(); } /** * Returns whether this atom equals the given object based on the atom name and type. */ public boolean equals(Object obj) { if (!super.equals(obj)) return false; NominalAtom nom = (NominalAtom) obj; return type.equals(nom.type); } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (equals(u)) return this; return super.unify(u, sub); } public int compareTo(Nominal nom) { if (nom instanceof NominalAtom) { return super.compareTo((NominalAtom)nom); } int retval = _name.compareTo(nom.getName()); if (retval == 0) { retval = -1; } // atom precedes var if names equal return retval; } public String toString() { String retval = _name; if (!type.getName().equals(Types.TOP_TYPE)) retval += ":" + type.getName(); return retval; } /** * Returns an XML representation of this LF. */ public Element toXml() { Element retval = new Element("nom"); retval.setAttribute("name", toString()); return retval; } /** Tests serialization. 
*/ public static void debugSerialization() throws IOException, ClassNotFoundException { // test serialization NominalAtom n = new NominalAtom("w1"); String filename = "tmp.ser"; ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(filename)); System.out.println("Writing n: " + n); out.writeObject(n); out.close(); ObjectInputStream in = new ObjectInputStream(new FileInputStream(filename)); System.out.print("Reading n2: "); NominalAtom n2 = (NominalAtom) in.readObject(); System.out.println(n2); in.close(); // test identity and equality System.out.println("n == n2?: " + (n == n2)); System.out.println("n.equals(n2)?: " + (n.equals(n2))); } } ================================================ FILE: src/opennlp/ccg/hylo/NominalVar.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; /** * A class for variables over NominalAtom objects. * Nominal vars take precedence over generic hylo vars. * Types are unified with other hylo vars and nominal atoms. 
* * @author Jason Baldridge * @author Michael White * @version $Revision: 1.13 $, $Date: 2009/07/17 04:23:30 $ **/ public class NominalVar extends HyloVar implements Nominal { private static final long serialVersionUID = -2086362887254623273L; protected boolean shared = false; public NominalVar(String name) { super(name); } public NominalVar(String name, SimpleType st) { super(name, st); } public NominalVar(String name, SimpleType st, boolean shared) { super(name, st); this.shared = shared; } protected NominalVar(String name, int index, SimpleType st) { super(name, index, st); } protected NominalVar(String name, int index, SimpleType st, boolean shared) { super(name, index, st); this.shared = shared; } public String getName() { return _name; } public boolean isShared() { return shared; } public void setShared(boolean shared) { this.shared = shared; } public void setType(SimpleType st) { _hashCode += st.getIndex() - type.getIndex(); type = st; } public LF copy() { return new NominalVar(_name, _index, type, shared); } public boolean equals(Object o) { if (!(o instanceof NominalVar)) return false; return super.equals(o); } public int compareTo(Nominal nom) { if (nom instanceof NominalVar) { return super.compareTo((NominalVar)nom); } int retval = _name.compareTo(nom.getName()); if (retval == 0) { retval = 1; } // atom precedes var if names equal return retval; } public Object unify(Object u, Substitution sub) throws UnifyFailure { // check for equality with u if (equals(u)) return this; // make sure u is an LF if (!(u instanceof LF)) throw new UnifyFailure(); // check type compatibility LF lf = (LF) u; if (lf.getType() == null) throw new UnifyFailure(); SimpleType st = (SimpleType) type.unify(lf.getType(), sub); // with nominal atoms, go ahead and substitute if (u instanceof NominalAtom) return sub.makeSubstitution(this, u); // with nominal vars, substitute according to type specificity then comparison order, // so that the direction of unification doesn't matter if (u instanceof NominalVar) { NominalVar u_nv = (NominalVar) u; // equal types, use comparison order if (type.equals(u_nv.getType())) { if (super.compareTo(u_nv) >= 0) return sub.makeSubstitution(this, u_nv); else return sub.makeSubstitution(u_nv, this); } // unequal types, use most specific one if (type.equals(st)) return sub.makeSubstitution(u_nv, this); if (u_nv.getType().equals(st)) return sub.makeSubstitution(this, u_nv); // otherwise make new nom var with intersection type, // name based on comparison order and index, and new index String name = (super.compareTo(u_nv) >= 0) ? (u_nv._name + u_nv._index) : (_name + _index); NominalVar nv_st = new NominalVar(name, UnifyControl.getUniqueVarIndex(), st); // and subst both sub.makeSubstitution(u_nv, nv_st); return sub.makeSubstitution(this, nv_st); } // with hylo vars, substitute the hylo var for this if (u instanceof HyloVar) { HyloVar u_hv = (HyloVar) u; // check for same type if (type.equals(st)) return sub.makeSubstitution(u_hv, this); // otherwise make new nom var with intersection type, // same name, and new index NominalVar nv_st = new NominalVar(this._name, UnifyControl.getUniqueVarIndex(), st); // and subst both sub.makeSubstitution(u_hv, nv_st); return sub.makeSubstitution(this, nv_st); } // otherwise give up throw new UnifyFailure(); } public Object fill(Substitution sub) throws UnifyFailure { Object val = sub.getValue(this); if (val != null) { return val; } else { return this; } } /** * Returns an XML representation of this LF. 
*/ public Element toXml() { Element retval = new Element("nomvar"); retval.setAttribute("name", nameWithType()); return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/Op.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import opennlp.ccg.grammar.Grammar; import org.jdom.*; import java.util.*; import gnu.trove.*; /** * A generic operator, such as conjunction, disjunction, exclusive-or, * negation or optionality (^, v, v_, ~, ?). * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.17 $, $Date: 2009/12/21 03:27:19 $ **/ public class Op extends HyloFormula { private static final long serialVersionUID = -7489598953770901195L; /** Conjunction constant. */ public static final String CONJ = "conj"; /** Disjunction constant. */ public static final String DISJ = "disj"; /** Exclusive-or constant. */ public static final String XOR = "xor"; /** Negation constant. */ public static final String NEG = "neg"; /** Optionality constant. */ public static final String OPT = "opt"; /** The name of the operator (ie its kind). */ protected final String _name; /** The args. */ protected List _args; /** Element constructor. */ @SuppressWarnings("unchecked") public Op(Element e) { String name = e.getAttributeValue("name"); if (name == null) name = e.getAttributeValue("n"); _name = name; List argElements = e.getChildren(); int argSize = argElements.size(); List args = new ArrayList(argSize); for (int i=0; i 1 && (name.equals(NEG) || name.equals(OPT))) { _args = new ArrayList(1); _args.add(new Op(CONJ, args)); } else _args = args; } /** Constructor. */ public Op(String name, List args) { _name = name; _args = args; } /** Two arg convenience constructor. */ public Op(String name, LF first, LF second) { _name = name; _args = new ArrayList(); _args.add(first); _args.add(second); } public String getName() { return _name; } public List getArguments() { return _args; } public void addArgument(LF formula) { _args.add(formula); } /** * Appends the args if the given lf is a CONJ op, * otherwise just adds it. 
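 * <p>A sketch of building and flattening a conjunction, assuming {@code p}, {@code q},
 * {@code r}, {@code s} and {@code t} are existing LFs (placeholder names):
 * <pre>{@code
 * Op conj = new Op(Op.CONJ, p, q);          // printed roughly as "(p ^ q)"
 * conj.appendArgs(new Op(Op.CONJ, r, s));   // CONJ arg is flattened: args become p, q, r, s
 * conj.appendArgs(t);                       // a non-CONJ LF is simply appended
 * }</pre>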
*/ public void appendArgs(LF lf) { if (lf instanceof Op && ((Op)lf).getName().equals(Op.CONJ)) _args.addAll(((Op)lf).getArguments()); else _args.add(lf); } public LF copy() { List $args = new ArrayList(_args.size()); for (LF arg : _args) { $args.add(arg.copy()); } return new Op(_name, $args); } public void deepMap(ModFcn mf) { for (Iterator argsIt = _args.iterator(); argsIt.hasNext(); ) { argsIt.next().deepMap(mf); } mf.modify(this); } public boolean occurs(Variable var) { for (Iterator argsIt = _args.iterator(); argsIt.hasNext(); ) { if (argsIt.next().occurs(var)) { return true; } } return false; } /** Returns true iff the given object equals this op. */ public boolean equals(Object o) { if (!(o instanceof Op)) return false; Op op = (Op) o; if (_name != op._name) return false; List opArgs = op._args; if (_args.size() != opArgs.size()) return false; if (!opArgs.containsAll(_args)) return false; return true; } /** Unification is not attempted for Ops. */ public void unifyCheck(Object u) throws UnifyFailure { throw new UnifyFailure(); } /** Unification is not attempted for Ops. */ public Object unify(Object u, Substitution s) throws UnifyFailure { throw new UnifyFailure(); } public Object fill(Substitution sub) throws UnifyFailure { List $args = new ArrayList(_args.size()); for (LF arg : _args) { $args.add((LF)arg.fill(sub)); } return new Op(_name, $args); } public String toString() { StringBuffer sb = new StringBuffer(); String opString = printOp(_name); if (_args.size() == 1) { sb.append(opString); sb.append(_args.get(0).toString()); } else { sb.append('('); Iterator argsIt = filteredArgs().iterator(); for (; argsIt.hasNext(); ) { sb.append(argsIt.next().toString()); if (argsIt.hasNext()) sb.append(' ').append(opString).append(' '); } sb.append(')'); } return sb.toString(); } /** * Returns a pretty-printed string of this LF, with the given indent. */ public String prettyPrint(String indent) { StringBuffer sb = new StringBuffer(); String opString = printOp(_name); if (_args.size() == 1) { sb.append(opString); sb.append(((LF)_args.get(0)).prettyPrint(indent)); } else { sb.append('('); Iterator argsIt = filteredArgs().iterator(); for (; argsIt.hasNext(); ) { sb.append(argsIt.next().prettyPrint(indent)); if (argsIt.hasNext()) sb.append(' ').append(opString).append(' '); } sb.append(')'); } return sb.toString(); } public static String printOp(String o) { if (o.equals(CONJ)) return "^"; else if (o.equals(DISJ)) return "v"; else if (o.equals(XOR)) return "v_"; else if (o.equals(NEG)) return "~"; else if (o.equals(OPT)) return "?"; else return o; } // filters out semantic features if apropos private List filteredArgs() { String featsToShow = Grammar.theGrammar.prefs.featsToShow; if (featsToShow.length() == 0) return _args; List retval = new ArrayList(_args.size()); for (Iterator it = _args.iterator(); it.hasNext(); ) { LF arg = it.next(); String attr = null; if (arg instanceof SatOp && HyloHelper.isAttrPred(arg)) attr = HyloHelper.getRel(arg); else if (arg instanceof Diamond && HyloHelper.isAttr(arg)) attr = ((Diamond)arg).getMode().toString(); if (attr == null || featsToShow.indexOf(attr) != -1) retval.add(arg); } return retval; } /** Returns a hash code. */ public int hashCode() { int retval = _name.hashCode(); for (Iterator it = _args.iterator(); it.hasNext(); ) { retval += it.next().hashCode(); } return retval; } /** * Returns a hash code using the given map from vars to ints. 
*/ public int hashCode(TObjectIntHashMap varMap) { int retval = _name.hashCode(); for (Iterator it = _args.iterator(); it.hasNext(); ) { LF arg = it.next(); retval += arg.hashCode(varMap); } return retval; } /** * Returns whether this op equals the given object * up to variable names, using the given maps from vars to ints * (where args must be in the same order). */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } Op op = (Op) obj; if (!_name.equals(op._name)) return false; if (_args.size() != op._args.size()) return false; for (int i = 0; i < _args.size(); i++) { LF arg = (LF) _args.get(i); LF arg2 = (LF) op._args.get(i); if (!arg.equals(arg2, varMap, varMap2)) return false; } return true; } /** * Returns an XML representation of this LF. */ public Element toXml() { Element retval = new Element("op"); retval.setAttribute("name", _name); for (int i = 0; i < _args.size(); i++) { LF arg = (LF) _args.get(i); Element argElt = arg.toXml(); retval.addContent(argElt); } return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/Proposition.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; /** * A propositional value, such as the predicate "sleep", * or the value of a semantic feature, such as "past" for tense. * Types are unified if present. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.7 $, $Date: 2009/07/17 04:23:30 $ **/ public class Proposition extends HyloAtom { private static final long serialVersionUID = -5392519210634765414L; public Proposition(String name) { super(name); } public Proposition(String name, SimpleType st) { super(name, st); } public LF copy() { return new Proposition(_name, type); } public Object unify(Object u, Substitution sub) throws UnifyFailure { // check equality if (equals(u)) return this; // check for prop with compatible type if (u instanceof Proposition) { Proposition prop = (Proposition) u; if (type == null || prop.type == null) throw new UnifyFailure(); SimpleType st = (SimpleType) type.unify(prop.type, sub); // return prop with most specific type if (st.equals(type)) return this; if (st.equals(prop.type)) return prop; // otherwise return prop with name of intersection type return new Proposition(st.getName(), st); } // otherwise defer to default routine return super.unify(u, sub); } /** * Returns an XML representation of this LF. 
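 * <p>A sketch of the element shape only (the attribute value comes from {@code toString()}):
 * <pre>{@code
 * Element e = new Proposition("sleep").toXml();
 * e.getName();                  // "prop"
 * e.getAttributeValue("name");  // the printed proposition name, e.g. "sleep"
 * }</pre>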
*/ public Element toXml() { Element retval = new Element("prop"); retval.setAttribute("name", toString()); return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/SatOp.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import org.jdom.*; import java.util.*; import gnu.trove.*; /** * A hybrid logic satifaction operator, which tests whether a formula is true * a particular point named by a nominal. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.13 $, $Date: 2009/07/17 04:23:30 $ **/ public class SatOp extends HyloFormula { private static final long serialVersionUID = -4953978442971984002L; /** * The LF alts to which this LF belongs. * Null represents no alts. */ protected List alts = null; /** * Gets the LF alts to which this LF belongs. */ public List getAlts() { return alts; } /** * The LF opts (optional parts) to which this LF belongs. * LF opts are used during realization to represent * optional parts of the input. * The opts are numbered starting with 0, * and null represents no opts. */ protected TIntArrayList opts = null; /** * Gets the LF opts to which this LF belongs. */ public TIntArrayList getOpts() { return opts; } /** * The sign or unary rule which introduced this predication. */ protected LexSemOrigin _origin = null; /** * Gets the sign or unary rule which introduced this predication (or null if none). */ public LexSemOrigin getOrigin() { return _origin; } /** * Sets the sign or unary rule which introduced this predication. 
*/ public void setOrigin(LexSemOrigin origin) { _origin = origin; } // the real contents of the satop protected Nominal _nominal; protected LF _arg; public SatOp(Element e) { boolean shared = "true".equals(e.getAttributeValue("shared")); String nom = e.getAttributeValue("nom"); if (nom != null) { _nominal = new NominalAtom(HyloHelper.prefix(nom), HyloHelper.type(nom), shared); } else { nom = e.getAttributeValue("nomvar"); if (nom != null) { _nominal = new NominalVar(HyloHelper.prefix(nom), HyloHelper.type(nom), shared); } else { throw new RuntimeException("Satop must have a nom or nomvar."); } } _arg = HyloHelper.getLF_FromChildren(e); } public SatOp(Nominal nom, LF arg) { _nominal = nom; _arg = arg; } public Nominal getNominal() { return _nominal; } public void setNominal(Nominal nominal) { _nominal = nominal; } public LF getArg() { return _arg; } public void setArg(LF arg) { _arg = arg; } public LF copy() { SatOp retval = new SatOp((Nominal)_nominal.copy(), _arg.copy()); retval._origin = _origin; return retval; } public void deepMap(ModFcn mf) { _nominal.deepMap(mf); _arg.deepMap(mf); mf.modify(this); } public boolean occurs(Variable var) { return (_nominal.occurs(var) || _arg.occurs(var)); } /** Returns true iff the nominal and arg are equal. */ public boolean equals(Object o) { if (o instanceof SatOp && _nominal.equals(((SatOp)o)._nominal) && _arg.equals(((SatOp)o)._arg)) { return true; } else { return false; } } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof HyloFormula) { if (u instanceof SatOp) { Nominal $nom = (Nominal) Unifier.unify(_nominal, ((SatOp)u)._nominal, sub); LF $arg = (LF)Unifier.unify(_arg,((SatOp)u)._arg, sub); SatOp retval = new SatOp($nom, $arg); retval._origin = _origin; return retval; } else return super.unify(u,sub); } else { throw new UnifyFailure(); } } public Object fill(Substitution sub) throws UnifyFailure { SatOp retval = new SatOp((Nominal)_nominal.fill(sub), (LF)_arg.fill(sub)); retval._origin = _origin; return retval; } public String toString() { boolean includeParens = !(_arg instanceof Op); StringBuffer sbuf = new StringBuffer(); sbuf.append('@').append(_nominal.toString()); if (includeParens) { sbuf.append('('); } sbuf.append(_arg.toString()); if (includeParens) { sbuf.append(')'); } return sbuf.toString(); } /** * Returns a pretty-printed string of this LF, with the given indent. */ public String prettyPrint(String indent) { // calc new indent StringBuffer ibuf = new StringBuffer(); ibuf.append(indent).append(" "); String nomStr = _nominal.toString(); for (int i = 0; i < nomStr.length(); i++) { ibuf.append(' '); } String newIndent = ibuf.toString(); // calc string boolean includeParens = !(_arg instanceof Op); StringBuffer sbuf = new StringBuffer(); sbuf.append('@').append(nomStr); if (includeParens) { sbuf.append('('); } sbuf.append(_arg.prettyPrint(newIndent)); if (includeParens) { sbuf.append(')'); } // done return sbuf.toString(); } /** Returns a hash code using the nominal and arg. */ public int hashCode() { return _nominal.hashCode() + _arg.hashCode(); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { return _nominal.hashCode(varMap) + _arg.hashCode(varMap); } /** * Returns whether this sat op equals the given object * up to variable names, using the given maps from vars to ints. 
*/ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } SatOp so = (SatOp) obj; return _nominal.equals(so._nominal, varMap, varMap2) && _arg.equals(so._arg, varMap, varMap2); } /** * Returns an XML representation of this LF. */ public Element toXml() { Element retval = new Element("satop"); if (_nominal instanceof NominalAtom) { retval.setAttribute("nom", _nominal.toString()); } else { retval.setAttribute("nomvar", ((NominalVar)_nominal).nameWithType()); } Element argElt = _arg.toXml(); retval.addContent(argElt); return retval; } } ================================================ FILE: src/opennlp/ccg/hylo/graph/DefaultLFEdgeFactory.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; /** * A factory for LF edges that creates edges from specified source and target vertices and an edge label. * This class provides a default implementation of the {@link LFEdgeFactory} interface for instantiating * {@link LFGraph}s. * * @author Scott Martin */ public class DefaultLFEdgeFactory implements LFEdgeFactory { /** * Creates an edge from a specified source and target vertex. * @return A new edge with a null label. * @see #createLabeledEdge(LFVertex, LFVertex, LFEdgeLabel) */ @Override public LFEdge createEdge(LFVertex sourceVertex, LFVertex targetVertex) { return createLabeledEdge(sourceVertex, targetVertex, null); } /** * Creates a new labeled, directed edge from a specified vertex pair and edge label. * @param sourceVertex The source vertex of the new edge. * @param targetVertex The target vertex of the new edge. * @param label The label of the new edge. * @return An instance of {@link LFEdge} with the specfied parameters. * * @see LFEdge#LFEdge(LFVertex, LFVertex, LFEdgeLabel) */ @Override public LFEdge createLabeledEdge(LFVertex sourceVertex, LFVertex targetVertex, LFEdgeLabel label) { return new LFEdge(sourceVertex, targetVertex, label); } } ================================================ FILE: src/opennlp/ccg/hylo/graph/LFEdge.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; /** * An edge in an {@linkplain LFGraph LF graph}. LF graph edges are directed, containing a * {@linkplain #getSource() source} and {@linkplain #getTarget() target} vertex as well as * an {@linkplain #getLabel() edge label} representing the type of relation between the two * vertices. * * @author Scott Martin */ public class LFEdge { final LFVertex source, target; final LFEdgeLabel label; /** * Convenience constructor for creating edges with a null edge label. This constructor just * calls LFEdge(source, target, null). * @see #LFEdge(LFVertex, LFVertex, LFEdgeLabel) */ public LFEdge(LFVertex source, LFVertex target) { this(source, target, null); } /** * Creates a new LF edge with the specified source and target vertices, and edge label. * @param source The source vertex of the new edge. * @param target The target vertex of the new edge. * @param label The label of the new edge, possibly null. * * @throws IllegalArgumentException If either source or target is null. */ public LFEdge(LFVertex source, LFVertex target, LFEdgeLabel label) { checkVertex(source, "source"); checkVertex(target, "target"); this.source = source; this.target = target; this.label = label; } void checkVertex(LFVertex v, String name) { if(v == null) { throw new IllegalArgumentException(name + " is null"); } } /** * Gets the edge label, which may be null. */ public LFEdgeLabel getLabel() { return label; } /** * Gets the source vertex. */ public LFVertex getSource() { return source; } /** * Gets the target vertex. */ public LFVertex getTarget() { return target; } /** * Tests whether this edge is equal to another by comparing the source and target vertices by using their * {@link LFVertex#equals(Object)} methods. * If the label is null, it is considered equivalent to the other edge's label only if * the other edge's label is also null. Otherwise, the labels are compared using their * {@link LFEdgeLabel#equals(Object)} method. * * @param o The edge to compare this edge to. */ @Override public boolean equals(Object o) { if(o instanceof LFEdge) { LFEdge e = (LFEdge)o; return source.equals(e.source) && target.equals(e.target) && (label != null) ? label.equals(e.label) : e.label == null; } return false; } /** * Computes a hash code for this edge based on the hash codes of its vertices and label, assuming the * label is non-null. */ @Override public int hashCode() { int h = 37 * source.hashCode() + target.hashCode(); if(label != null) { h += label.hashCode(); } return h; } /** * Gets a string representation of this edge. For example, if the edge's source is w1@woman, * its target is w0@the, and its label is Det, this method returns * w1@woman --Det--> w0@the. */ @Override public String toString() { StringBuilder sb = new StringBuilder(source.toString()); sb.append(" --"); sb.append((label == null) ? 
"(no label)" : label.toString()); sb.append("--> "); sb.append(target.toString()); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/hylo/graph/LFEdgeFactory.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; import org.jgrapht.EdgeFactory; /** * A factory for LF edges that creates edges from specified source and target vertices and an edge label. * This interface extends the {@link EdgeFactory} interface for the specialized case of * directed, labeled LF edges with LF vertices as their nodes. A default implementation * is provided in {@link DefaultLFEdgeFactory}. * * @author Scott Martin */ public interface LFEdgeFactory extends EdgeFactory { /** * Creates a new labeled, directed edge from a specified vertex pair and edge label. * @param sourceVertex The source vertex of the new edge. * @param targetVertex The target vertex of the new edge. * @param label The label of the new edge. * @return An instance of {@link LFEdge} with the specified parameters. * * @see LFEdge#LFEdge(LFVertex, LFVertex, LFEdgeLabel) */ public LFEdge createLabeledEdge(LFVertex sourceVertex, LFVertex targetVertex, LFEdgeLabel label); } ================================================ FILE: src/opennlp/ccg/hylo/graph/LFEdgeLabel.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; import java.util.HashMap; import java.util.Map; import opennlp.ccg.hylo.Mode; /** * A label for an {@linkplain LFEdge LF edge}. LF edge labels are drawn from * {@link Mode}s, so this class encapsulates one. *

    * Since certain modes reoccur frequently, this class maintains a cache of * modes mapped to edge labels, accessible via {@link #forMode(Mode)}. * * @author Scott Martin */ public class LFEdgeLabel { final Mode mode; private static Map labelCache; /** * Creates an LF edge label with the specified mode. * @param mode The mode representing the type of relation between the vertices. * @throws IllegalArgumentException if mode is null. */ public LFEdgeLabel(Mode mode) { if(mode == null) { throw new IllegalArgumentException("mode is null"); } this.mode = mode; } /** * Gets a cached LF edge label for the specified mode, if one is available. If none * has been created yet, a new LF edge label is created and cached for later use. * Access to the cached LF edge labels is synchronized to avoid threading isues. * * @param mode The mode to create a label for. * @return Either a cached LF edge label corresponding to the specified mode, if one * is available, or a newly created one. */ public static LFEdgeLabel forMode(Mode mode) { synchronized(LFEdgeLabel.class) { LFEdgeLabel l = null; if(labelCache == null) { labelCache = new HashMap(); } else { l = labelCache.get(mode); } if(l == null) { l = new LFEdgeLabel(mode); labelCache.put(mode, l); } return l; } } /** * Gets the name of this label, as specified by its underlying * {@linkplain Mode mode}. * @return The value of getMode().getName(). */ public String getName() { return mode.getName(); } /** * Gets the mode underlying this edge label. * @return The mode specified at creation. * @see #LFEdgeLabel(Mode) */ public Mode getMode() { return mode; } /** * Computes a hash code for this LF edge label based on the hash code of its * underlying mode. */ @Override public int hashCode() { return 31 * mode.hashCode(); } /** * Tests whether this LF edge label is equivalent to another by comparing their * modes, using their {@link Mode#equals(Object)} methods. */ @Override public boolean equals(Object obj) { return (obj instanceof LFEdgeLabel) && mode.equals(((LFEdgeLabel)obj).mode); } /** * Gets a string representation of this LF edge label. * @return The value of getName(). */ @Override public String toString() { return getName(); } } ================================================ FILE: src/opennlp/ccg/hylo/graph/LFGraph.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import opennlp.ccg.hylo.Flattener; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.synsem.LF; import org.jgrapht.graph.DirectedMultigraph; /** * A graph representation of an {@link LF}. By default, LF graphs are empty. But populated LF graphs can * be easily created using the {@link LFGraphFactory} class. *

    * This class extends the {@link DirectedMultigraph} class from the org.jgrapht.graph package, adding some specialized methods for dealing with OpenCCG LFs. It provides some flexibility for edge creation by allowing an {@link LFEdgeFactory} to be specified at creation. The original LF structure is also reflected in the {@link #getLFAncestry(LFVertex)}, {@link #highestLFAncestors()}, and {@link #highestLFAncestorOf(LFVertex)} methods. *
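    * A minimal usage sketch (not from the original docs; hypothetical nominal variable, assuming a populated graph g built via {@link LFGraphFactory}):
    *   LFVertex child = g.findVertexByNominal(someNominal); // someNominal: a nominal from the source LF
    *   g.highestLFAncestorOf(child);  // the vertex highest above it in the LF ancestry, or null if it is itself highest
    *   g.highestLFAncestors();        // the set of vertices with no higher LF ancestor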

    * As a convenience, the method {@link #findVertexByNominal(Nominal)} provides access to the vertex * corresponding to a given nominal. This is implemented by a hash map so that the lookup takes place in * constant time. * * @see JGraphT website * * @author Scott Martin */ public class LFGraph extends DirectedMultigraph { private static final long serialVersionUID = 1L; //TODO make this really serializable? /** * The map reflecting LF ancestry by giving the highest ancestor for a specified * vertex, or null if it is the highest ancestor. */ protected Map highestAncestorMap = new HashMap(); /** * Convenience cache of nominals mapped to vertices. * @see #findVertexByNominal(Nominal) */ protected Map nominalVertexMap = new HashMap(); /** * Creates a new, empty LF graph that uses an implementation of {@link LFEdgeFactory} * as its edge factory. * @see #LFGraph(LFEdgeFactory) */ public LFGraph() { this(LFGraphFactory.DEFAULT_EDGE_FACTORY); } /** * Creates a new, empty LF graph using the specified edge factory. * @param lfEdgeFactory The edge factory to use for creating edges. * * @see LFGraphFactory */ public LFGraph(LFEdgeFactory lfEdgeFactory) { super(lfEdgeFactory); } /** * Gets the {@link LFEdgeFactory} used to create this LF graph. * @return The value of {@link #getEdgeFactory()}, cast to an * {@link LFEdgeFactory}. */ public LFEdgeFactory getLFEdgeFactory() { return (LFEdgeFactory)getEdgeFactory(); } /** * Adds a new labeled edge based on the specified source and target vertices and edge label by calling * getLFEdgeFactory().createLabeledEdge(source, target, label). The new * edge is added to this graph's {@linkplain #edgeSet() edge set}. * @param source The source vertex of the new edge. * @param target The target vertex of the new edge. * @param label The label of the new edge. * @return The newly created edge if it was successfully added to this graph, null otherwise. */ public LFEdge addLabeledEdge(LFVertex source, LFVertex target, LFEdgeLabel label) { LFEdge e = getLFEdgeFactory().createLabeledEdge(source, target, label); return addEdge(source, target, e) ? e : null; } /** * Finds a vertex by its nominal. This method does a lookup on a dictionary mapping each vertex's * {@linkplain LFVertex#getNominal() nominal} to members of the{@link #vertexSet()}. * @param nominal The nominal to test for. * @return A vertex whose nominal is equivalent to the one specified, or null if none is present. */ public LFVertex findVertexByNominal(Nominal nominal) { return nominalVertexMap.get(nominal); } /** * Overrides the superclass method to add a mapping from the vertex's {@linkplain LFVertex#getNominal() nominal} * to the vertex itself, for later retrieval via {@link #findVertexByNominal(Nominal)}. */ @Override public boolean addVertex(LFVertex v) { boolean b = super.addVertex(v); // give this a chance first if(b) { nominalVertexMap.put(v.nominal, v); } return b; } /** * Overrides the superclass method to remove any existing mapping from some nominal to the specified vertex. * @see #addVertex(LFVertex) * @see #findVertexByNominal(Nominal) */ @Override public boolean removeVertex(LFVertex v) { boolean b = super.removeVertex(v); // invoke this first if(b) { nominalVertexMap.values().remove(v); } return b; } /** * Gets the highest LF ancestor of the specified vertex, as determined by the LF structure. * @param vertex The vertex to get the highest ancestor for. * @return The highest ancestor of the specified vertex, or null if it is the highest in its * ancestry. 
* @see Flattener#getHighestParentMap() */ public LFVertex highestLFAncestorOf(LFVertex vertex) { return highestAncestorMap.get(vertex); } /** * Gets the LF ancestry corresponding to the specified vertex. * * @return The parents of the specified vertex in the LF ancestry. * @see #highestLFAncestorOf(LFVertex) * @see Flattener#getHighestParentMap() */ public Set getLFAncestry(LFVertex vertex) { LFVertex a = highestLFAncestorOf(vertex); Set as = new LinkedHashSet(); for(LFVertex v : vertexSet()) { if(!v.equals(vertex) && highestLFAncestorOf(v).equals(a)) { as.add(v); } } return as; } /** * Gets the vertex or vertices that are at the top of the LF ancestry hierarchy. * @return The set of vertices v for which {@link #highestLFAncestorOf(LFVertex)} returns * null. * @see Flattener#getHighestParentMap() */ public Set highestLFAncestors() { Set ps = new LinkedHashSet(); for(LFVertex v : vertexSet()) { if(highestLFAncestorOf(v) == null) { ps.add(v); } } return ps; } } ================================================ FILE: src/opennlp/ccg/hylo/graph/LFGraphFactory.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; import static opennlp.ccg.hylo.HyloHelper.isAttrPred; import static opennlp.ccg.hylo.HyloHelper.isElementaryPredication; import static opennlp.ccg.hylo.HyloHelper.isLexPred; import static opennlp.ccg.hylo.HyloHelper.isRelPred; import java.util.List; import java.util.Map; import opennlp.ccg.hylo.Diamond; import opennlp.ccg.hylo.Flattener; import opennlp.ccg.hylo.HyloHelper; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.SatOp; import opennlp.ccg.realize.Realizer; import opennlp.ccg.synsem.LF; import org.jdom.Element; import org.jdom.input.DOMBuilder; /** * Factory class for creating LF graphs based on {@link LF}s and XML elements * that represent them. Graph factories cannot be instantiated, however, their * methods for building LF graphs are statically accessible. * * @author Scott Martin * @see LFGraph * @see LFGraphFactory#newGraphFrom(LF) */ public class LFGraphFactory { /** * An instance of {@link DefaultLFEdgeFactory}. */ public static final LFEdgeFactory DEFAULT_EDGE_FACTORY = new DefaultLFEdgeFactory(); static final DOMBuilder DOM_BUILDER = new DOMBuilder(); static final Flattener FLATTENER = new Flattener(); private LFGraphFactory() { // can't instantiate } /** * Builds a new LF graph based on the representation provided in the specified w3c XML element. * @param lfElement The XML element with root element tagged lf. 
* @return The value of {@link #newGraphFrom(Element)} after using a {@link DOMBuilder} to create a * jdom document. * * @see #newGraphFrom(Element) */ public static LFGraph newGraphFrom(org.w3c.dom.Element lfElement) { return newGraphFrom(DOM_BUILDER.build(lfElement)); } /** * Creates a new LF graph from the corresponding representation contained in the specified * jdom XML element. * @param lfElement The jdom element containing the representation of the LF. * @return An LF graph build from an {@link LF} object obtained by calling * {@link Realizer#getLfFromElt(Element)}. * * @see LFGraphFactory#newGraphFrom(LF) */ public static LFGraph newGraphFrom(Element lfElement) { return newGraphFrom(Realizer.getLfFromElt(lfElement)); } /** * Creates a new LF graph based on the specified LF object. The LF object is first flattened, and then * its LF ancestry structure is obtained by calling {@link Flattener#getHighestParentMap()}. *

    * This method makes two passes over the list of {@link SatOp}s obtained by flattening the specified LF. The first pass adds a vertex to the graph for every lexical predication (as determined by {@link HyloHelper#isLexPred(LF)}), built from the {@linkplain SatOp#getNominal() SatOp's nominal} and {@linkplain SatOp#getArg() proposition argument}. *

    * The second pass proceeds by cases, depending on the nature of the SatOp in question:
    * - Lexical predications cause the new LF graph to be updated with the corresponding LF ancestry, as determined by {@link Flattener#getHighestParentMap()}.
    * - Relation predications cause a new {@link LFEdge} to be added to the LF graph based on the {@linkplain SatOp#getArg() SatOp's argument} and {@linkplain Diamond#getMode() the argument's mode}.
    * - Attribute-value predications cause the vertex corresponding to the {@linkplain SatOp#getNominal() SatOp's nominal} to have attributes {@linkplain LFVertex#setAttribute(opennlp.ccg.hylo.Mode, Proposition) added} based on the {@linkplain SatOp#getArg() SatOp's argument}.
    * where the nature of the SatOp in question is determined using {@link HyloHelper#isLexPred(LF)}, * {@link HyloHelper#isRelPred(LF)}, and {@link HyloHelper#isAttrPred(LF)}. * * @param lf The LF object to build an LF graph for. * @return A new LF graph whose vertices represent the nominals in the LF's flattened representation and * whose edges represent its relation predications. * @throws IllegalArgumentException If lf is null. */ public static LFGraph newGraphFrom(LF lf) { if(lf == null) { throw new IllegalArgumentException("lf is null"); } LFGraph g = new LFGraph(DEFAULT_EDGE_FACTORY); Flattener f = new Flattener(); List satOps = f.flatten(lf); Map ancestorMap = f.getHighestParentMap(); for(SatOp so : satOps) { // first pass adds vertices if(isLexPred(so)) { g.addVertex(new LFVertex(so.getNominal(), (Proposition)so.getArg())); } } for(SatOp so : satOps) { // second pass adds edges and attributes, sets highest parent (if any) if(isElementaryPredication(so)) { Nominal soNom = so.getNominal(); LFVertex source = g.findVertexByNominal(soNom); // check if node is not yet added (not a lex. pred.) if(source == null) { source = new LFVertex(soNom); g.addVertex(source); } if(isLexPred(so)) { Nominal parent = ancestorMap.get(source.nominal); if(parent != null) { g.highestAncestorMap.put(source, g.findVertexByNominal(parent)); } } else if(isRelPred(so)) { Diamond d = (Diamond)so.getArg(); Nominal dArg = (Nominal)d.getArg(); LFVertex target = g.findVertexByNominal(dArg); if(target == null) { target = new LFVertex(dArg); g.addVertex(target); } g.addLabeledEdge(source, target, LFEdgeLabel.forMode(d.getMode())); } else if(isAttrPred(so)) { Diamond d = (Diamond)so.getArg(); source.addAttribute(d.getMode(), (Proposition)d.getArg()); } } } return g; } } ================================================ FILE: src/opennlp/ccg/hylo/graph/LFVertex.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; import java.util.Collections; import java.util.HashMap; import java.util.Map; import java.util.Set; import opennlp.ccg.hylo.Mode; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.SatOp; /** * A vertex in an {@link LFGraph}. Vertices are based on {@link SatOp}s, encapsulating their * {@linkplain SatOp#getNominal() nominal} and {@linkplain SatOp#getArg() proposition argument} (or * null if there is no associated proposition). * Vertices also maintain a list of attribute/value pairs representing their associated attribute/value * predications. *
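    * A minimal usage sketch (not from the original docs; hypothetical nominal, proposition, and mode values):
    *   LFVertex v = new LFVertex(nom, prop);  // e.g. a vertex that prints as w9@walk
    *   v.setAttribute(numMode, sgValue);      // record an attribute/value pair
    *   v.getAttributeValue(numMode);          // -> sgValue
    *   v.toString();                          // roughly "w9@walk {num=sg}"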

    * The {@linkplain #getIndex() index} and {@linkplain #getType() type} of an LF vertex are determined by * the specified nominal's {@linkplain Nominal#getName() name}. For example, if a vertex's * {@linkplain #getName() name} is w12, then {@link #getIndex()} returns 12 and * {@link #getType()} returns {@link LFVertexType#WORD}. * * @author Scott Martin * @see LFGraph * @see LFVertexType * @see SatOp */ public class LFVertex { final Nominal nominal; final Proposition proposition; /** * This vertex's attribute map. */ protected Map attributes; private Integer index; private LFVertexType type; /** * Creates a new LF vertex based on the specified SatOp. * @param satOp The SatOp to use for creating a new LF vertex, using its nominal and argument. * @see #LFVertex(Nominal, Proposition) */ public LFVertex(SatOp satOp) { this(satOp.getNominal(), (Proposition)satOp.getArg()); } /** * Creates a new LF vertex based on the specified nominal, with a null proposition. * @see #LFVertex(Nominal, Proposition) */ public LFVertex(Nominal nominal) { this(nominal, null); } /** * Creates a new LF vertex based on the specified nominal and proposition. * @see #LFVertex(Nominal, Proposition, Map) */ public LFVertex(Nominal nominal, Proposition proposition) { this(nominal, proposition, null); } /** * Creates a new LF vertex based on the specified nominal and proposition, with the specified attribute * map (which can be null). * @throws IllegalArgumentException if nominal is null. */ public LFVertex(Nominal nominal, Proposition proposition, Map attributes) { if(nominal == null) { throw new IllegalArgumentException("nominal is null"); } this.nominal = nominal; this.proposition = proposition; this.attributes = attributes; } /** * Gets this LF vertex's name, determined by the name of its {@linkplain #getNominal() nominal}. * @return The value of getNominal().getName(). */ public String getName() { return getNominal().getName(); } /** * Gets this LF vertex's associated predicate, the name of its {@linkplain #getProposition() proposition}. * @return The value of getProposition().getName(), if this vertex's proposition is non-null, * and null otherwise. */ public String getPredicate() { Proposition p = getProposition(); return (p == null) ? null : p.getName(); } /** * Gets this LF vertex's associated nominal. */ public Nominal getNominal() { return nominal; } /** * Gets this LF vertex's associated proposition. * @return Possibly null, if no proposition was provided at creation. * @see #LFVertex(Nominal) */ public Proposition getProposition() { return proposition; } /** * Gets the type of this LF vertex, as determined by the prefix of its {@linkplain #getNominal() nominal}'s * name. For example, if this vertex's {@linkplain Nominal#getName() name} is x3, then * this method returns * {@link LFVertexType#fromPrefix(String) LFVertexType.fromPrefix}('x') == {@link LFVertexType#NONWORD}. * @return The value of {@link LFVertexType#fromPrefix(String)} for this vertex's * {@linkplain #getNominal() nominal}'s {@linkplain Nominal#getName() name}. * @see LFVertexType */ public LFVertexType getType() { return (type == null) ? (type = LFVertexType.fromPrefix(nominal.getName())) : type; } /** * Gets the word index associated with this LF vertex, as determined by {@link #parseVertexIndex(Nominal)}. */ public Integer getIndex() { return (index == null) ? (index = parseVertexIndex(nominal)) : index; } /** * Parses the word index associated with the specified nominal, obtained by parsing its name. 
For * example, if the specified nominal's name is x9, this method returns the integer * 9. * @param nominal The nominal to find the word index for. * @return The integer index corresponding to the specified nominal, determined by parsing its * {@linkplain Nominal#getName() name}. */ public static Integer parseVertexIndex(Nominal nominal) { String nm = nominal.getName(); int colidx = nm.indexOf(':'); String s = (colidx == -1) ? nm : nm.substring(0, colidx); return Integer.parseInt(s.substring(1)); } /** * Gets the attribute map associated with this LF vertex. Note that the returned map is * not modifiable; to modify a vertex's attributes, the methods * {@link #setAttribute(Mode, Proposition)} and {@link #removeAttribute(Mode)} * should be used. * @return An unmodifiable copy of the attribute map encapsulated by this vertex, or * {@link Collections#EMPTY_MAP} if no attributes are present. * @see Collections#unmodifiableMap(Map) */ @SuppressWarnings("unchecked") public Map getAttributeMap() { return (attributes == null) ? Collections.EMPTY_MAP : Collections.unmodifiableMap(attributes); } /** * Gets the names of this LF vertex's attributes. Note that the returned set is not modifiable; * the methods {@link #setAttribute(Mode, Proposition)} and {@link #removeAttribute(Mode)} should * be used to modify this vertex's attribute/value pairs. * @return The value of getAttributeMap().keySet(), or {@link Collections#EMPTY_SET} if * no attributes are present. * @see Collections#unmodifiableSet(Set) */ @SuppressWarnings("unchecked") public Set attributeNames() { return (attributes == null) ? Collections.EMPTY_SET : Collections.unmodifiableSet(attributes.keySet()); } /** * Tests whether this vertex contains an attribute with the associated attribute name. * @param attributeName The name of the attribute to test for. * @return true if this vertex has an attribute named attributeName. * @see #attributeNames() */ public boolean containsAttribute(Mode attributeName) { return (attributes != null && attributes.containsKey(attributeName)); } /** * Gets the value of the attribute with the specified mode name. * @param attributeName The attribute name to retrieve a value for. * @return The associated attribute value, or null if none is present. */ public Proposition getAttributeValue(Mode attributeName) { return (attributes == null) ? null : attributes.get(attributeName); } /** * Adds an attribute/value to this vertex's attributes. * @param attributeName The name of the new attribute. * @param value The value of the new attribute. * @return True if this vertex's attribute/value map changed as a result of the call because * either (1) no attribute named attributeName was present or (2) * the value associated with attributeName changed (was different from value). * @see #setAttribute(Mode, Proposition) */ public boolean addAttribute(Mode attributeName, Proposition value) { // works even when setAttribute() returns null return !value.equals(setAttribute(attributeName, value)); } /** * Sets the attribute associated with the specified mode name to the specified proposition value. * @param attributeName The key to set the value for. * @param value The value that will be associated with attributeName. * @return The value previously associated with attributeName, or null if no value * was previously associated. 
* @see Map#put(Object, Object) */ public Proposition setAttribute(Mode attributeName, Proposition value) { if(attributes == null) { attributes = new HashMap(); } return attributes.put(attributeName, value); } /** * Removes and returns the value associated with the specified attribute name. * @param attributeName The name to remove the value for. * @return The value previously associated with attributeName, or null if no value * was associated with it. */ public Proposition removeAttribute(Mode attributeName) { return (attributes == null) ? null : attributes.remove(attributeName); } /** * Gets a hash code for this vertex based on its nominal and proposition, if the proposition is * non-null. */ @Override public int hashCode() { int i = 37 * nominal.hashCode(); if(proposition != null) { i += proposition.hashCode(); } // Don't include attributes in hash code calculation. This could cause problems if a vertex is // added to a collection that relies on hashing, and then the attributes are later modified. return i; } /** * Tests whether this LF vertex is equal to another by comparing their nominals and possibly also their * propositions and attributes, if they are non-null/non-empty. * @see #getNominal() * @see #getProposition() * @see #getAttributeMap() */ @Override public boolean equals(Object obj) { if(obj instanceof LFVertex) { LFVertex v = (LFVertex)obj; return nominal.equals(v.nominal) && ((proposition == null) ? v.proposition == null : proposition.equals(v.proposition)) && ((attributes == null) ? v.attributes == null : attributes.equals(v.attributes)); } return false; } /** * Gets a string representation of this LF vertex. For example, if this vertex's name is w9, * its proposition's name is walk, and its attribute map contains num=sg and * det=nil, this method returns w9@walk {num=sg, det=nil}. */ @Override public String toString() { StringBuilder sb = new StringBuilder(nominal.getName()); if(proposition != null) { sb.append('@'); sb.append(proposition.getName()); } if(attributes != null && !attributes.isEmpty()) { sb.append(' '); sb.append(attributes.toString()); } return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/hylo/graph/LFVertexType.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.hylo.graph; /** * A set of enum constants indicating the type of an LF vertex. The type of a vertex is * also signaled by the character prefix of its name, as in w9 and x3. * * @author Scott Martin */ public enum LFVertexType { /** * The type of vertices representing words, with prefix w. 
*/ WORD('w'), /** * The type of vertices representing nonwords, prefixed x. */ NONWORD('x'); /** * The prefix character for this vertex type. */ final Character prefix; private LFVertexType(Character prefix) { this.prefix = prefix; } /** * Gets the naming prefix used by this vertex type. */ public Character getPrefix() { return prefix; } /** * Gets the LF vertex type corresponding to the specified prefix string. * * @param string The prefix string. * @return The value of {@link #fromPrefix(Character)} for the first character in the * specified string. * * @see #fromPrefix(Character) */ public static LFVertexType fromPrefix(String string) { return fromPrefix(Character.valueOf(string.charAt(0))); } /** * Gets the LF vertex type corresponding to the specified character prefix. * @param prefix The prefix character. * @return The vertex type with the specified prefix, as determined by * {@link #getPrefix()}, or {@link #NONWORD} if there is no vertex type for the given * prefix character. */ public static LFVertexType fromPrefix(Character prefix) { for(LFVertexType type : values()) { if(type.prefix.equals(prefix)) { return type; } } return NONWORD; } } ================================================ FILE: src/opennlp/ccg/hylo/graph/package.html ================================================

    Provides representations of {@link opennlp.ccg.synsem.LF}s as traversable graphs based on the JGraphT package, as well as customized filters for sets of edges in those graphs.

    {@link opennlp.ccg.hylo.graph.LFGraph}s can be created on their own, or from an existing {@link opennlp.ccg.synsem.LF} (or XML structure representing one) using the {@link opennlp.ccg.hylo.graph.LFGraphFactory}. LF graphs are also customizable in that different edge factories can be specified by implementing {@link opennlp.ccg.hylo.graph.LFEdgeFactory} (a default implementation is provided in {@link opennlp.ccg.hylo.graph.DefaultLFEdgeFactory}).
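    As a quick orientation, here is a minimal usage sketch (not part of the original package documentation; the class and method names in the sketch are hypothetical), assuming an {@link opennlp.ccg.synsem.LF} has already been obtained elsewhere, for example from an lf element via {@link opennlp.ccg.realize.Realizer#getLfFromElt}:

    import opennlp.ccg.hylo.graph.*;
    import opennlp.ccg.synsem.LF;

    public class LFGraphDemo {
        /** Builds a populated graph from an existing LF and prints its structure. */
        public static void printGraph(LF lf) {
            // vertices come from lexical predications, edges from relation predications
            LFGraph graph = LFGraphFactory.newGraphFrom(lf);
            // each edge prints as "source --Label--> target"
            for (LFEdge edge : graph.edgeSet()) {
                System.out.println(edge);
            }
            // vertices with no higher LF ancestor are the roots of the original LF
            System.out.println("roots: " + graph.highestLFAncestors());
        }
    }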

    ================================================ FILE: src/opennlp/ccg/lexicon/DataItem.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge and Gann Bierner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import org.jdom.*; /** * Data structure for storing information about a lexical entry. Specifically * used by LMR grammars. * * @author Jason Baldridge * @version $Revision: 1.2 $, $Date: 2007/12/17 20:02:23 $ */ public class DataItem { private String stem = ""; private String pred = ""; public DataItem() {} public DataItem (String s, String p) { stem = s; pred = p; } public DataItem(Element datael) { stem = datael.getAttributeValue("stem"); pred = datael.getAttributeValue("pred"); if (null == pred) { pred = stem; } } public void setStem(String s) { stem = s; } public void setPred(String s) { pred = s; } public String getStem() { return stem; } public String getPred() { return pred; } } ================================================ FILE: src/opennlp/ccg/lexicon/DefaultTokenizer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.util.*; import java.text.*; import java.util.*; import javax.xml.datatype.*; import gnu.trove.*; /** * DefaultTokenizer provides a default implementation of the * Tokenizer interface. 
* * @author Michael White * @version $Revision: 1.32 $, $Date: 2010/12/09 04:58:12 $ **/ public class DefaultTokenizer implements Tokenizer { // date format with pattern yyyy.MM.dd, strict parsing private DateFormat dateFormat = null; // date format with pattern *.MM.dd, strict parsing private DateFormat dateFormatNoYear = null; // time format with pattern HH:mm, strict parsing private DateFormat timeFormat = null; // factory for parsing durations, in format "PnYnMnDTnHnMnS", as defined in XML Schema 1.0 section 3.2.6.1 private DatatypeFactory datatypeFactory = null; /** * Map from special token semantic classes to special token constants. * The map is initialized in the constructor, where * the standard constants (eg Tokenizer.DATE_CLASS and Tokenizer.DATE_VAL) are added. */ protected Map specialTokenMap = null; /** * A set containing semantic classes to replace words with for language models. * Equality is checked with identity, for use with interned strings. */ @SuppressWarnings("unchecked") protected Set replacementSemClasses = new THashSet(new TObjectIdentityHashingStrategy()); /** * Constructor. */ public DefaultTokenizer() { // init date, time formats dateFormat = new SimpleDateFormat("yyyy.MM.dd", Locale.ENGLISH); dateFormat.setLenient(false); dateFormatNoYear = new SimpleDateFormat("*.MM.dd", Locale.ENGLISH); dateFormatNoYear.setLenient(false); timeFormat = new SimpleDateFormat("HH:mm", Locale.ENGLISH); timeFormat.setLenient(false); // init data type factory try { datatypeFactory = DatatypeFactory.newInstance(); } catch (DatatypeConfigurationException exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } // init special token map specialTokenMap = new HashMap(); specialTokenMap.put(Tokenizer.DATE_CLASS, Tokenizer.DATE_VAL); specialTokenMap.put(Tokenizer.TIME_CLASS, Tokenizer.TIME_VAL); specialTokenMap.put(Tokenizer.NUM_CLASS, Tokenizer.NUM_VAL); specialTokenMap.put(Tokenizer.AMT_CLASS, Tokenizer.AMT_VAL); specialTokenMap.put(Tokenizer.DUR_CLASS, Tokenizer.DUR_VAL); specialTokenMap.put(Tokenizer.NE_CLASS, Tokenizer.NE_VAL); } /** * Adds a semantic class to replace words with for language models. */ public void addReplacementSemClass(String semClass) { replacementSemClasses.add(semClass.intern()); } /** * Returns whether the given semantic class is one to replace words with for language models. * The sem class is assumed to have been interned. */ public boolean isReplacementSemClass(String semClass) { return replacementSemClasses.contains(semClass); } /** * Parses an input string into a list of words, * including any explicitly given factors, * and the semantic class of special tokens. * Tokens are parsed into words using parseToken with the strictFactors * flag set to false. */ public List tokenize(String s) { return tokenize(s, false); } /** * Parses an input string into a list of words, * including any explicitly given factors, * and the semantic class of special tokens. * Tokens are parsed into words using parseToken, according to the given * flag for whether to parse factors strictly. * The string is assumed to have white-space delimited tokens. */ public List tokenize(String s, boolean strictFactors) { List retval = new ArrayList(); StringTokenizer st = new StringTokenizer(s); while (st.hasMoreTokens()) { retval.add(parseToken(st.nextToken(), strictFactors)); } return retval; } /** * Parses a token into a word, including any explicitly given factors * and the semantic class of special tokens. 
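    * A minimal sketch (hypothetical surface values; the literal spellings of the Tokenizer constants are not shown), using the separators described for parseToken(String, boolean): factors are colon-separated, with a hyphen between each attribute and its value.
    *   String tok = "ran" + ":" + Tokenizer.STEM_ATTR + "-run" + ":" + Tokenizer.POS_ATTR + "-VBD";
    *   Word w = parseToken(tok);
    *   // w.getForm() -> "ran", w.getStem() -> "run", w.getPOS() -> "VBD"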
* Parsing is performed using parseToken with the strictFactors * flag set to false. */ public Word parseToken(String token) { return parseToken(token, false); } /** * Parses a token into a word, including any explicitly given factors * and the semantic class of special tokens, according to the given * flag for whether to parse factors strictly. * Recognized pitch accents may be appended to the word form with an underscore. * If the strictFactors flag is true, then colons are always assumed to * separate attribute-value pairs, and hyphens are always assumed to * separate attributes from values, and thus colons or hyphens not used as separators * must be escaped. * If the strictFactors flag is false, then there must be at least one colon * and at least one hyphen in the token to trigger parsing of factors, * in which case colons or hyphens not used as separators * must again be escaped; otherwise, colons or hyphens on their own may * appear without escaping in the word form. * After splitting the token into factors, it is unescaped. */ public Word parseToken(String token, boolean strictFactors) { // init String form = token; String pitchAccent = null; List> attrValPairs = null; String stem = null; String POS = null; String supertag = null; String semClass = null; // handle colon-separated attr-val pairs int colonPos = token.indexOf(':'); int hyphenPos = token.indexOf('-'); if (strictFactors || (colonPos > 0 && hyphenPos > 0)) { // deal with special cases before first colon, if any String suffix; if (colonPos > 0 && hyphenPos > colonPos) { form = token.substring(0, colonPos); suffix = token.substring(colonPos+1); } else if (colonPos < 0 && hyphenPos < 0) { form = token; suffix = null; } else { form = null; suffix = token; } while (suffix != null) { hyphenPos = suffix.indexOf('-'); String attr = suffix.substring(0, hyphenPos); String val = suffix.substring(hyphenPos+1); colonPos = suffix.indexOf(':'); if (colonPos > 0) { val = suffix.substring(hyphenPos+1, colonPos); suffix = suffix.substring(colonPos+1); } else suffix = null; attr = unescape(attr); val = unescape(val); if (attr.equals(Tokenizer.WORD_ATTR)) { form = val; continue; } if (attr.equals(Tokenizer.STEM_ATTR)) { stem = val; continue; } if (attr.equals(Tokenizer.POS_ATTR)) { POS = val; continue; } if (attr.equals(Tokenizer.SUPERTAG_ATTR)) { supertag = val; continue; } if (attr.equals(Tokenizer.SEM_CLASS_ATTR)) { semClass = val; continue; } if (attr.equals(Tokenizer.PITCH_ACCENT_ATTR)) { pitchAccent = val; continue; } if (attrValPairs == null) attrValPairs = new ArrayList>(5); attrValPairs.add(new Pair(attr, val)); } } // check for pitch accent preceded by an underscore int pos = (form != null) ? form.lastIndexOf("_") : -1; if (pos > 0) { String suffix = form.substring(pos+1); if (Grammar.isPitchAccent(suffix)) { pitchAccent = suffix; form = form.substring(0, pos); } } // unescape form (unless it happens to be "null") if (!"null".equals(form)) form = unescape(form); // check for special token String specialTokenClass = isSpecialToken(form); if (specialTokenClass != null) semClass = specialTokenClass; // done return Word.createWord(form,pitchAccent,attrValPairs,stem,POS,supertag,semClass); } /** * Returns a string (eg Tokenizer.DATE_CLASS) indicating the semantic class * of special token, if the given token is recognized as a special * token; otherwise returns null. 
*/ public String isSpecialToken(String token) { if (token == null) return null; if (isDate(token)) return Tokenizer.DATE_CLASS; if (isTime(token)) return Tokenizer.TIME_CLASS; if (isNum(token)) return Tokenizer.NUM_CLASS; if (isAmt(token)) return Tokenizer.AMT_CLASS; if (isDur(token)) return Tokenizer.DUR_CLASS; if (isNamedEntity(token)) return Tokenizer.NE_CLASS; return null; } /** * Returns the special token constant for the given special token class, * or null if none. */ public String getSpecialTokenConstant(String semClass) { if (semClass == null) return null; return specialTokenMap.get(semClass); } /** * Returns true iff the given string is a special token constant * (eg Tokenizer.DATE_VAL). */ public boolean isSpecialTokenConstant(String s) { return specialTokenMap.containsValue(s); } /** * Returns true iff the token is recognized as a date. * The default implementation recognizes dates in the * format yyyy.MM.dd, e.g. "2004.05.07", or *.MM.dd, e.g. "*.05.07", * which is taken to mean the 5th of May (in the contextually * appropriate year). Note that by including the "*." prefix, * the format avoids being ambiguous between a date or number; * that is, with this format, something like "10.01" is * unambiguously a number, whereas "*.10.01" means the 1st of * October. */ public boolean isDate(String token) { ParsePosition pos = new ParsePosition(0); Date date = dateFormat.parse(token, pos); if (date != null && pos.getIndex() == token.length()) return true; pos = new ParsePosition(0); date = dateFormatNoYear.parse(token, pos); return (date != null && pos.getIndex() == token.length()); } /** * Returns true iff the token is recognized as a time. * The default implementation recognizes times in the * 24-hour format HH:mm, e.g. "00:12" or "15:03". */ public boolean isTime(String token) { ParsePosition pos = new ParsePosition(0); Date time = timeFormat.parse(token, pos); return (time != null && pos.getIndex() == token.length()); } /** * Returns true iff the token is recognized as a number. * The default implementation returns true if the token * can be parsed as an integer or double, though not one * in scientific notation. */ public boolean isNum(String token) { try { Integer.parseInt(token); return true; } catch (NumberFormatException exc) { try { Double.parseDouble(token); if (token.indexOf('E') != -1) return false; if (token.indexOf('e') != -1) return false; return true; } catch (NumberFormatException exc2) { return false; } } } /** * Returns true iff the token is recognized as an amount. * The default implementation only handles currency amounts. * The token is recognized as an amount if it begins with * a number and ends with an ISO-4217 currency code. * (See http://www.xe.com/iso4217.htm.) */ public boolean isAmt(String token) { if (token.length() < 4) return false; String code = token.substring(token.length()-3); try { Currency.getInstance(code); } catch (IllegalArgumentException exc) { return false; } String num = token.substring(0,token.length()-3).trim(); return isNum(num); } /** * Returns true iff the token is recognized as a duration. * The format is "PnYnMnDTnHnMnS", as defined in XML Schema 1.0 section 3.2.6.1. * For example, "PT5H30" is 5 hours and 30 minutes. */ public boolean isDur(String token) { try { datatypeFactory.newDuration(token); return true; } catch (Exception exc) { return false; } } /** * Returns true iff the token is recognized as a named entity (not listed in lexicon). * The default implementation always returns false. 
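    * Subclasses may override this method to consult an external named-entity list or tagger; a minimal sketch (hypothetical field name):
    *   public boolean isNamedEntity(String token) { return myNamedEntitySet.contains(token); }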
*/ public boolean isNamedEntity(String token) { return false; } /** * Returns a string for the given list of words. * A space separates the string for each word, as determined by getOrthography(Word,false). */ public String getOrthography(List words) { return getOrthography(words, false); } /** * Returns a string for the given list of words, optionally with semantic class replacement. * A space separates the string for each word, as determined by getOrthography(Word,semClassReplacement). */ public String getOrthography(List words, boolean semClassReplacement) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < words.size(); i++) { Word w = (Word) words.get(i); sb.append(getOrthography(w, semClassReplacement)); if (i < words.size()-1) sb.append(" "); } return sb.toString(); } /** * Returns a string for the given word, optionally with semantic class replacement. * The default implementation returns the word's form - or semantic class, if apropos - * followed by its pitch accent (if non-null) separated by an underscore, * followed by any further attribute values, also separated by underscores. * With the semantic class replacement option, the word form is replaced with * the semantic class, uppercased, if the class is listed as one to replace words with for * language models. */ public String getOrthography(Word w, boolean semClassReplacement) { StringBuffer sb = new StringBuffer(); String semClass = w.getSemClass(); if (semClassReplacement && semClass != null && replacementSemClasses.contains(semClass)) sb.append(semClass.toUpperCase()); else sb.append(w.getForm()); if (w.getPitchAccent() != null) sb.append("_").append(w.getPitchAccent()); for (Iterator> it = w.getAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); sb.append("_").append(p.b); } return sb.toString(); } /** * Returns a string for the given list of words, * in the format expected by the SRILM tool for factored language models. * A space separates the string for each word, determined by format(Word). */ public String format(List words) { return format(words, false); } /** * Returns a string for the given list of words, * in the format expected by the SRILM tool for factored language models, * optionally with semantic class replacement. * A space separates the string for each word, determined by format(Word,boolean). */ public String format(List words, boolean semClassReplacement) { StringBuffer sb = new StringBuffer(); sb.append(" "); for (int i = 0; i < words.size(); i++) { Word w = words.get(i); if (w.getForm() == "" || w.getForm() == "") continue; // skip or sb.append(format(w, semClassReplacement)); sb.append(" "); } sb.append(""); return sb.toString(); } /** * Returns a string for the given word, * in the format expected by the SRILM tool for factored language models. * All factors are escaped. */ public String format(Word w) { return format(w, false); } /** * Returns a string for the given word, * in the format expected by the SRILM tool for factored language models, * optionally with semantic class replacement. * All factors are escaped. * With the semantic class replacement option, the word form and stem are replaced with * the semantic class, uppercased, if the class is listed as one to replace words with for * language models. 
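 * Illustratively, a word with form "bought", stem "buy" and POS "VBD"
 * (and no other factors) is formatted as
 * "bought:" + Tokenizer.STEM_ATTR + "-buy:" + Tokenizer.POS_ATTR + "-VBD",
 * i.e. colon-separated attr-val factors following the escaped form.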
*/ public String format(Word w, boolean semClassReplacement) { StringBuffer sb = new StringBuffer(); String form = w.getForm(); String pitchAccent = w.getPitchAccent(); String stem = w.getStem(); String POS = w.getPOS(); String supertag = w.getSupertag(); String semClass = w.getSemClass(); if (semClassReplacement && semClass != null && replacementSemClasses.contains(semClass)) { form = escape(semClass.toUpperCase()); stem = form; } sb.append(escape(form)); if (pitchAccent != null) sb.append(":").append(Tokenizer.PITCH_ACCENT_ATTR).append("-").append(escape(pitchAccent)); for (Iterator> it = w.getAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); String attr = p.a; String val = p.b; if (val != null) sb.append(":").append(escape(attr)).append("-").append(escape(val)); } if (stem != null) sb.append(":").append(Tokenizer.STEM_ATTR).append("-").append(escape(stem)); if (POS != null) sb.append(":").append(Tokenizer.POS_ATTR).append("-").append(escape(POS)); if (supertag != null) sb.append(":").append(Tokenizer.SUPERTAG_ATTR).append("-").append(escape(supertag)); if (semClass != null) sb.append(":").append(Tokenizer.SEM_CLASS_ATTR).append("-").append(escape(semClass)); return sb.toString(); } /** * Returns an encoding of the given string where * the characters for ampersand, less-than, greater-than, * apostrophe, quote, colon and hyphen are escaped * using HTML conventions. * Null is returned for the null string. * An initial substring 'null' is doubled. */ public static String escape(String s) { if (s == null) return null; StringBuffer output = null; // instantiate only if needed if (s.startsWith("null")) { output = new StringBuffer(); output.append("null"); } for(int i=0; i < s.length(); i++) { char c = s.charAt(i); if (output == null && (c=='<' || c=='>' || c=='&' || c=='\'' || c=='"' || c==':' || c=='-')) { output = new StringBuffer(); output.append(s.substring(0,i)); } if (output != null) { switch(c) { case '<': output.append("<"); break; case '>': output.append(">"); break; case '&': output.append("&"); break; case '\'': output.append("'"); break; case '"': output.append("""); break; case ':': output.append("&#").append((int)':').append(";"); break; case '-': output.append("&#").append((int)'-').append(";"); break; default: output.append(c); } } } return (output != null) ? output.toString() : s; } /** * Returns a decoding of the given string where * the characters for ampersand, less-than, greater-than, * apostrophe, quote, colon and hyphen (and any other * character whose code is given numerically) are unescaped * using HTML conventions. * An exception is that ampersands may be left unescaped * for convenience, when there is no following semicolon * in the string. * Null is returned for the null string and for the string "null". * An initial substring 'nullnull' is halved. 
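 * For example, under these conventions escape("non-stop") yields
 * "non&#45;stop" (the hyphen is character 45) and unescape("non&#45;stop")
 * restores "non-stop"; similarly, escape("nullify") yields "nullnullify",
 * which unescape maps back to "nullify".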
*/ public static String unescape(String s) { if (s == null || s.equals("null")) return null; StringBuffer output = null; // instantiate only if needed if (s.startsWith("nullnull")) { s = s.substring(4); output = new StringBuffer(); } for (int i=0; i < s.length(); i++) { char c = s.charAt(i); if (c == '&') { int endPos = s.indexOf(";", i); if (endPos < 0) { // allow unescaped ampersands if (output != null) output.append(c); continue; } if (output == null) { output = new StringBuffer(); output.append(s.substring(0,i)); } String escaped = s.substring(i+1, endPos); if (escaped.equals("lt")) { output.append('<'); i = endPos; continue; } if (escaped.equals("gt")) { output.append('>'); i = endPos; continue; } if (escaped.equals("amp")) { output.append('&'); i = endPos; continue; } if (escaped.equals("apos")) { output.append('\''); i = endPos; continue; } if (escaped.equals("quot")) { output.append('"'); i = endPos; continue; } if (s.charAt(i+1) == '#') { escaped = s.substring(i+2, endPos); output.append((char)Integer.parseInt(escaped)); i = endPos; continue; } throw new RuntimeException( "Unable to unescape " + s.substring(i,endPos+1) + "at position " + i + " in: " + s ); } else if (output != null) output.append(c); } return (output != null) ? output.toString() : s; } /** * Returns one or more orthographic words for the given word's form. * This method is called from within Sign.getWordsInXml as * part of producing the textual output of realization. * The default implementation checks the semantic class * for a special token class, and if true, returns the result * of expandDate, expandTime, expandNum, expandAmt, or * expandNamedEntity, as appropriate, after first checking that * the corresponding isDate, ..., isNamedEntity method returns true. * Otherwise, it splits the word form using underscore as a delimiter. */ public List expandWord(Word word) { String token = word.getForm(); String sc = word.getSemClass(); if (sc == Tokenizer.DATE_CLASS && isDate(token)) return expandDate(token); if (sc == Tokenizer.TIME_CLASS && isTime(token)) return expandTime(token); if (sc == Tokenizer.NUM_CLASS && isNum(token)) return expandNum(token); if (sc == Tokenizer.AMT_CLASS && isAmt(token)) return expandAmt(token); if (sc == Tokenizer.DUR_CLASS && isDur(token)) return expandDur(token); if (sc == Tokenizer.NE_CLASS && isNamedEntity(token)) return expandNamedEntity(token); String[] words = token.split("_"); return Arrays.asList(words); } /** * Returns one or more orthographic words for the given date token. * The default implementation expands the date with * EnglishExpander.expandDate, using the long option if the year is * present, and the medium option if not. */ public List expandDate(String date) { ArrayList retval = new ArrayList(); try { ParsePosition pos = new ParsePosition(0); Date dateObj = dateFormat.parse(date, pos); if (dateObj != null && pos.getIndex() == date.length()) { EnglishExpander.expandDate(dateObj, DateFormat.LONG, retval); } else { dateObj = dateFormatNoYear.parse(date); EnglishExpander.expandDate(dateObj, DateFormat.MEDIUM, retval); } } // shouldn't happen if isDate called first catch (ParseException exc) { // just add date string as a fall-back option retval.add(date); } return retval; } /** * Returns one or more orthographic words for the given time token. * The default implementation expands the time using * EnglishExpander.expandTime. 
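 * For example, given the default time format, expandTime("15:03")
 * produces the words "three", "oh", "three", "PM".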
*/ public List expandTime(String time) { ArrayList retval = new ArrayList(); try { EnglishExpander.expandTime(timeFormat.parse(time), retval); } // shouldn't happen if isTime called first catch (ParseException exc) { // just add time string as a fall-back option retval.add(time); } return retval; } /** * Returns one or more orthographic words for the given number token. * The default implementation expands the number using * EnglishExpander.expandNumber. */ public List expandNum(String num) { ArrayList retval = new ArrayList(); EnglishExpander.expandNumber(num, retval); return retval; } /** * Returns one or more orthographic words for the given amount token. * The default implementation expands the number using * EnglishExpander.expandAmount. */ public List expandAmt(String amt) { String code = amt.substring(amt.length()-3); String num = amt.substring(0,amt.length()-3).trim(); ArrayList retval = new ArrayList(); EnglishExpander.expandAmount(num, code, retval); return retval; } /** * Returns one or more orthographic words for the given duration token. * The default implementation expands the number using * EnglishExpander.expandDuration. */ public List expandDur(String dur) { Duration duration = null; try { duration = datatypeFactory.newDuration(dur); } catch (Exception exc) { // parsing not expected to fail throw (RuntimeException) new RuntimeException().initCause(exc); } ArrayList retval = new ArrayList(); EnglishExpander.expandDuration(duration, retval); return retval; } /** * Returns one or more orthographic words for the given named entity token. * The default implementation just splits the token using underscore as a delimiter. */ public List expandNamedEntity(String namedEntity) { String[] words = namedEntity.split("_"); return Arrays.asList(words); } /** Test: tokenize args[0], expand each token; and optionally do parseToken(args[1],true). */ public static void main(String[] args) { Tokenizer tk = new DefaultTokenizer(); String s = args[0]; List words = tk.tokenize(s); String expw = ""; System.out.println("words: "); for (int i = 0; i < words.size(); i++) { Word word = words.get(i); System.out.print(word + " "); List orthWords = tk.expandWord(word); for (int j = 0; j < orthWords.size(); j++) { expw += orthWords.get(j) + " "; } } System.out.println(); System.out.println("expanded: " + expw); System.out.println("formatted: " + tk.format(words)); if (args.length > 1) { System.out.println(); Word strictlyParsed = tk.parseToken(args[1], true); System.out.println("strictly parsed word: " + strictlyParsed); System.out.println("formatted: " + tk.format(strictlyParsed)); } } } ================================================ FILE: src/opennlp/ccg/lexicon/EnglishExpander.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import java.util.*; import java.text.*; import javax.xml.datatype.*; /** * EnglishExpander provides methods for expanding numbers, amounts, durations, * dates and times as English words. * The class com.sun.speech.freetts.en.us.NumberExpander served * as a reference point in part, but the implementation has been * rewritten from scratch, streamlined and extended. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2005/10/20 17:30:30 $ **/ public class EnglishExpander { //-------------------------------------------- // dates // // formats date in month private static final DateFormat dateInMonthFormat = new SimpleDateFormat("dd", Locale.ENGLISH); // formats month in full private static final DateFormat monthFormat = new SimpleDateFormat("MMMM", Locale.ENGLISH); // formats year in full private static final DateFormat yearFormat = new SimpleDateFormat("yyyy", Locale.ENGLISH); /** * Expands a date to English words in short, * medium or long forms, adding the words to * the given list. The style is given by * DateFormat.SHORT (e.g., "the first"), * DateFormat.MEDIUM (e.g., "the first of May"), and * DateFormat.LONG (e.g., "the first of May, two thousand and four"). */ public static void expandDate(Date date, int style, List list) { list.add("the"); expandOrdinal(dateInMonthFormat.format(date), list); if (style == DateFormat.MEDIUM || style == DateFormat.LONG) { list.add("of"); list.add(monthFormat.format(date)); } if (style == DateFormat.LONG) { list.add(","); String year = yearFormat.format(date); // x00y if (year.charAt(1) == '0' && year.charAt(2) == '0') { expandNumber(year, list); } // xxyy else { expandNumber(year.substring(0,2), list); expandNumber(year.substring(2), list); } } } //-------------------------------------------- // times // // formats hours in range 1-12 private static final DateFormat hoursFormat = new SimpleDateFormat("hh", Locale.ENGLISH); // formats minutes private static final DateFormat minutesFormat = new SimpleDateFormat("mm", Locale.ENGLISH); // formats am/pm private static final DateFormat amPmFormat = new SimpleDateFormat("a", Locale.ENGLISH); /** * Expands a time to English words, * adding the words to the given list. * For example, a date object with time set to 23:02 * is expanded to "eleven oh two PM". */ public static void expandTime(Date time, List list) { // add hours String hours = hoursFormat.format(time); expandNDigitNumber(hours, list); // add minutes, with special case for 'oh' String minutes = minutesFormat.format(time); if (minutes.charAt(0) == '0' && minutes.charAt(1) != '0') { list.add("oh"); expandNDigitNumber(minutes.substring(1), list); } else { expandNDigitNumber(minutes, list); } // add AM or PM list.add(amPmFormat.format(time)); } //-------------------------------------------- // amounts // /** * Expands a digit string and currency code to * number words and the currency name, which * are added to the given list. * For example, "12.50" and "GBP" are expanded to * "twelve pounds and fifty pence". * The codes GBP, USD and EUR are converted to * names, while other codes are left as is. 
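 * Illustratively, "100" with the unnamed code "JPY" is expanded to
 * "one hundred JPY".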
*/ public static void expandAmount(String digitString, String currencyCode, List list) { // establish names for currency (singular and plural), // and fractional parts String currSing = null; String currPlur = null; String fracSing = null; String fracPlur = null; if (currencyCode.equals("GBP")) { currSing = "pound"; currPlur = "pounds"; fracSing = "penny"; fracPlur = "pence"; } else if (currencyCode.equals("USD")) { currSing = "dollar"; currPlur = "dollars"; fracSing = "cent"; fracPlur = "cents"; } else if (currencyCode.equals("EUR")) { currSing = "euro"; currPlur = "euros"; fracSing = "cent"; fracPlur = "cents"; } // if none, just expand digit string and append code if (currSing == null) { expandNumber(digitString, list); list.add(currencyCode); return; } // otherwise, get whole and fractional parts of digit string String whole = digitString; String frac = null; int dotIndex = digitString.indexOf("."); if (dotIndex != -1) { whole = digitString.substring(0,dotIndex); frac = digitString.substring(dotIndex+1); } // expand whole expandNumber(whole, list); // add currency name if (whole.equals("1")) list.add(currSing); else list.add(currPlur); // add fractional part, if any if (frac != null) { // add "and" list.add("and"); // expand frac expandNDigitNumber(frac, list); // add fractional name if (frac.equals("01")) list.add(fracSing); else list.add(fracPlur); } } //-------------------------------------------- // durations // /** * Expands a duration into a string of words. * NB: Fractions of seconds are ignored at present. */ public static void expandDuration(Duration duration, List list) { int[] durationFields = { duration.getYears(), duration.getMonths(), duration.getDays(), duration.getHours(), duration.getMinutes(), duration.getSeconds() }; int counter = 0; for (int i = 0; i < durationFields.length; i++) { if (durationFields[i] > 0) counter++; } for (int i = 0; i < durationFields.length; i++) { if (durationFields[i] > 0) { counter--; String str = Integer.toString(durationFields[i]); expandNumber(str, list); String unit = durUnits[i]; if (durationFields[i] != 1) unit += "s"; list.add(unit); if (counter > 1) list.add(","); if (counter == 1) list.add("and"); } } } // duration units private static String[] durUnits = { "year", "month", "day", "hour", "minute", "second" }; //-------------------------------------------- // numbers // /** * Expands a digit string to a sequence of digit words, * which are added to the given list. * For example, "1234" is expanded to "one two three four". */ public static void expandDigits(String digitString, List list) { for (int i = 0; i < digitString.length(); i++) { list.add(zeroToNine[digitString.charAt(i)-'0']); } } /** * Expands a digit string to number words, * which are added to the given list. * For example, "1234" is expanded to * "one thousand two hundred and thirty four". * The digit string may contain a single dot in it, * as well as an initial plus or minus character. * For example, "-100.011" is expanded to * "minus one hundred point zero one one". * Scientific notation is not currently handled. */ public static void expandNumber(String digitString, List list) { // do nothing with empty strings if (digitString.length() == 0) return; // handle plus or minus char c0 = digitString.charAt(0); if (c0 == '+' || c0 == '-') { list.add((c0 == '+') ? 
"plus" : "minus"); digitString = digitString.substring(1); } // check for dot int dotIndex = digitString.indexOf("."); if (dotIndex != -1) { // add numbers "point" digits expandNDigitNumber(digitString.substring(0,dotIndex), list); list.add("point"); expandDigits(digitString.substring(dotIndex+1), list); } else { // add numbers expandNDigitNumber(digitString, list); } } /** * Expands a digit string to words for an ordinal number, * which are added to the given list. * For example, "1234" is expanded to * "one thousand two hundred and thirty fourth". */ public static void expandOrdinal(String digitString, List list) { // expand number expandNDigitNumber(digitString, list); // replace last one int lastPos = list.size() - 1; String ordinal = getOrdinal(list.get(lastPos)); list.set(lastPos, ordinal); } // n-digit number public static void expandNDigitNumber(String digitString, List list) { int numDigits = digitString.length(); if (numDigits == 2) expand2DigitNumber(digitString, list); else if (numDigits == 3) expand3DigitNumber(digitString, list); else if (numDigits >= 4 && numDigits <= 12) expand4to12DigitNumber(digitString, list); else expandDigits(digitString, list); } // 2 digit numbers private static void expand2DigitNumber(String digitString, List list) { // 0x case if (digitString.charAt(0) == '0') { // do nothing for 00 case if (digitString.charAt(1) != '0') expandDigits(digitString.substring(1), list); } // 1x case else if (digitString.charAt(0) == '1') { list.add(tenToNineteen[digitString.charAt(1)-'0']); } // xy case, x >= 2 else { list.add(zeroToNinety[digitString.charAt(0)-'0']); // do nothing for x0 case if (digitString.charAt(1) != '0') expandDigits(digitString.substring(1), list); } } // 3 digit numbers private static void expand3DigitNumber(String digitString, List list) { // add hundreds if non-zero if (digitString.charAt(0) != '0') { // add hundreds digit expandDigits(digitString.substring(0,1), list); // add unit ("hundred") list.add("hundred"); } // add "and", if final two digits non-zero if (digitString.charAt(1) != '0' || digitString.charAt(2) != '0') list.add("and"); // expand final two digits expand2DigitNumber(digitString.substring(1), list); } // 4-12 digit numbers private static void expand4to12DigitNumber(String digitString, List list) { int numDigitsModThree = digitString.length() % 3; int numInitialDigits = (numDigitsModThree != 0) ? 
numDigitsModThree : 3; int unitsIndex = ((digitString.length() - 1) / 3) - 1; // add initial digits expandNDigitNumber(digitString.substring(0, numInitialDigits), list); // add unit list.add(thousandToBillion[unitsIndex]); // add rest expandNDigitNumber(digitString.substring(numInitialDigits), list); } //------------------------ // arrays of number words // private static final String[] zeroToNine = { "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine" }; private static final String[] tenToNineteen = { "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen" }; private static final String[] zeroToNinety = { "zero", "ten", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety" }; private static final String[] thousandToBillion = { "thousand", "million", "billion" }; private static final String[] zerothToNinth = { "zeroth", "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth" }; private static final String[] tenthToNineteenth = { "tenth", "eleventh", "twelfth", "thirteenth", "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth", "nineteenth" }; private static final String[] zerothToNinetieth = { "zeroth", "tenth", "twentieth", "thirtieth", "fortieth", "fiftieth", "sixtieth", "seventieth", "eightieth", "ninetieth" }; private static final String[] thousandthToBillionth = { "thousandth", "millionth", "billionth" }; //-------------------------------------------- // corresponding ordinals // // map from numbers to corresponding ordinals private static Map ordinalMap = null; // returns corresponding ordinal private static String getOrdinal(String number) { // ensure ordinalMap instantiated if (ordinalMap == null) { ordinalMap = new HashMap(); for (int i = 0; i < zeroToNine.length; i++) { ordinalMap.put(zeroToNine[i], zerothToNinth[i]); } for (int i = 0; i < tenToNineteen.length; i++) { ordinalMap.put(tenToNineteen[i], tenthToNineteenth[i]); } for (int i = 0; i < zeroToNinety.length; i++) { ordinalMap.put(zeroToNinety[i], zerothToNinetieth[i]); } ordinalMap.put("hundred", "hundredth"); for (int i = 0; i < thousandToBillion.length; i++) { ordinalMap.put(thousandToBillion[i], thousandthToBillionth[i]); } } return ordinalMap.get(number); } } ================================================ FILE: src/opennlp/ccg/lexicon/EntriesItem.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-9 Jason Baldridge, Gann Bierner and // University of Edinburgh / Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.synsem.*; import org.jdom.*; /** * Data structure for storing information about a category family entry. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.10 $, $Date: 2009/10/17 20:46:20 $ */ public class EntriesItem { private Family family; private Boolean active = Boolean.TRUE; private String name; private String stem; private String indexRel; private Category cat; public EntriesItem(Element el, Family family) { this.family = family; name = el.getAttributeValue("name"); stem = el.getAttributeValue("stem"); if (stem == null) stem = Lexicon.DEFAULT_VAL; String isActive = el.getAttributeValue("active"); if (isActive != null && isActive.equals("false")) active = Boolean.FALSE; String indexRelVal = el.getAttributeValue("indexRel"); if (indexRelVal != null) indexRel = indexRelVal; else indexRel = family.getIndexRel(); cat = CatReader.getCat((Element)el.getChildren().get(0)); } public Boolean getActive() { return active; } /** Returns the name of this entry. */ public String getName() { return name; } /** Returns the qualified name in the form familyName.name. */ public String getQualifiedName() { return getFamilyName() + "." + name; } /** Returns the name of this entry's family. */ public String getFamilyName() { return family.getName(); } /** Returns the supertag of this entry's category. */ public String getSupertag() { return cat.getSupertag(); } public Family getFamily() { return family; } public String getStem() { return stem; } public String getIndexRel() { return indexRel; } /** Returns this entry's family's coart rel. */ public String getCoartRel() { return family.getCoartRel(); } public Category getCat() { return cat; } public String toString () { return getQualifiedName() + ":" + stem + " :- " + cat; } } ================================================ FILE: src/opennlp/ccg/lexicon/FactorChainWord.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.util.*; import java.util.*; /** * A FactorChainWord represents a word via a chain of references to * factor keys, or just to a string in the case of the word form. * Canonical instances are created by a factory method, and stored * in a trie map. The factor chain representation should be more * space efficient when dealing with large numbers of words. 
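 * Illustratively, two words created with the same POS and supertag but
 * different forms share the initial chain nodes for their common factors
 * and diverge from the first differing factor onward, since the factory
 * below adds factors from the root in a rough specificity order (POS,
 * supertag, semantic class, stem, form, pitch accent, then any extra
 * attribute-value pairs).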
* * * @author Michael White * @version $Revision: 1.3 $, $Date: 2009/07/17 04:23:30 $ */ public class FactorChainWord extends Word { private static final long serialVersionUID = 952665894357382685L; /** The referenced factor key or string (for the word form). */ protected Object key; /** The previous node in the chain. */ protected FactorChainWord prev; /** Constructor. */ protected FactorChainWord(Object key, FactorChainWord prev) { this.key = key; this.prev = prev; } /** Returns the surface form. */ public String getForm() { return getValFromInterned(Tokenizer.WORD_ATTR); } /** Returns the pitch accent. */ public String getPitchAccent() { return getValFromInterned(Tokenizer.PITCH_ACCENT_ATTR); } /** Returns the list of extra attribute-value pairs. */ protected List> getAttrValPairsList() { List> retval = null; FactorChainWord current = this; while (current != null) { if (current.key instanceof FactorKey) { FactorKey fkey = (FactorKey) current.key; if (!isKnownAttr(fkey.factor)) { if (retval == null) retval = new ArrayList>(5); retval.add(0, new Pair(fkey.factor, fkey.val)); } } current = current.prev; } return retval; } /** Returns the stem. */ public String getStem() { return getValFromInterned(Tokenizer.STEM_ATTR); } /** Returns the part of speech. */ public String getPOS() { return getValFromInterned(Tokenizer.POS_ATTR); } /** Returns the supertag. */ public String getSupertag() { return getValFromInterned(Tokenizer.SUPERTAG_ATTR); } /** Returns the semantic class. */ public String getSemClass() { return getValFromInterned(Tokenizer.SEM_CLASS_ATTR); } /** Returns the value of the attribute with the given name, or null if none. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used to retrieve the form, ..., semantic class. */ public String getVal(String attr) { String internedAttr = attr.intern(); // use == on interned attr return getValFromInterned(internedAttr); } /** Returns the value of the given interned attr, or null if none. */ protected String getValFromInterned(String attr) { FactorChainWord current = this; while (current != null) { if (attr == Tokenizer.WORD_ATTR) { if (current.key instanceof String) return (String) current.key; } else if (current.key instanceof FactorKey) { FactorKey fkey = (FactorKey) current.key; if (fkey.factor == attr) return fkey.val; } current = current.prev; } return null; } /** Factory. */ public static class Factory implements WordFactory { /** Trie map for canonical instances. */ protected TrieMap factorChainRoot = new TrieMap(null); /** Creates a surface word with the given interned form. */ public synchronized Word create(String form) { return create(factorChainRoot, Tokenizer.WORD_ATTR, form); } /** Creates a (surface or full) word with the given normalized attribute name and value. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used for the form, ..., semantic class. */ public synchronized Word create(String attr, String val) { return create(factorChainRoot, attr, val); } /** Creates a (surface or full) word from the given normalized factors. Returns null if no non-null vals. 
*/ public synchronized Word create( String form, String pitchAccent, List> attrValPairs, String stem, String POS, String supertag, String semClass ) { // adds non-null vals from the root, in a rough specificity order TrieMap currentNode = factorChainRoot; if (POS != null) currentNode = findChild(currentNode, Tokenizer.POS_ATTR, POS); if (supertag != null) currentNode = findChild(currentNode, Tokenizer.SUPERTAG_ATTR, supertag); if (semClass != null) currentNode = findChild(currentNode, Tokenizer.SEM_CLASS_ATTR, semClass); if (stem != null) currentNode = findChild(currentNode, Tokenizer.STEM_ATTR, stem); if (form != null) currentNode = findChild(currentNode, Tokenizer.WORD_ATTR, form); if (pitchAccent != null) currentNode = findChild(currentNode, Tokenizer.PITCH_ACCENT_ATTR, pitchAccent); if (attrValPairs != null) { for (int i = 0; i < attrValPairs.size(); i++) { Pair p = attrValPairs.get(i); String attr = p.a; String val = p.b; currentNode = findChild(currentNode, attr, val); } } return currentNode.data; } /** Creates a word from the given node, adding the given interned attr and non-null val. */ protected Word create(TrieMap currentNode, String attr, String val) { TrieMap child = findChild(currentNode, attr, val); return child.data; } /** Gets or makes a child node from the given node. */ protected TrieMap findChild(TrieMap currentNode, String attr, String val) { Object key = FactorKey.getKey(attr, val); TrieMap child = currentNode.findChild(key); if (child.data == null) { FactorChainWord parent = currentNode.data; child.data = new FactorChainWord(key, parent); } return child; } } } ================================================ FILE: src/opennlp/ccg/lexicon/FactorKey.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.util.*; // import java.util.*; /** * Factor key, with identity equals for the factor and value. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2005/03/22 20:59:32 $ */ public class FactorKey { /** The factor. */ public final String factor; /** The value. */ public final String val; /** Constructor. Assumes interned components. */ private FactorKey(String factor, String val) { this.factor = factor; this.val = val; } /** Makes/retrieves an interned factor key for the given interned attr and val; for the word form, the string itself is returned. Null vals are replaced with <NULL>. 
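 * For example, getKey(Tokenizer.WORD_ATTR, "ran") simply returns the
 * string "ran", whereas getKey(Tokenizer.POS_ATTR, "VBD") returns a
 * canonical FactorKey whose toString() is the value of Tokenizer.POS_ATTR
 * followed by "-VBD".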
*/ public static Object getKey(String attr, String val) { if (val == null) val = ""; if (attr == Tokenizer.WORD_ATTR) return val; else return Interner.globalIntern(new FactorKey(attr, val)); } /** Returns a hash code constructed from the component identity hash codes. */ public int hashCode() { return System.identityHashCode(factor) - System.identityHashCode(val); } /** Returns true if the given factor key has identical components. */ public boolean equals(Object obj) { if (this == obj) return true; if (!(obj instanceof FactorKey)) return false; FactorKey key = (FactorKey) obj; return factor == key.factor && val == key.val; } /** Returns "factor-val". */ public String toString() { return factor + "-" + val; } } ================================================ FILE: src/opennlp/ccg/lexicon/Family.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-9 Jason Baldridge, Gann Bierner and // University of Edinburgh / Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import org.jdom.*; import java.util.*; /** * Lexicon category family. 
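 * A minimal sketch of the kind of XML element the constructor below
 * reads (the entry's category child and the member attributes are elided,
 * and the names shown are hypothetical):
 * <pre>
 * <family name="iv" pos="V" closed="false">
 *   <entry name="primary"> ... </entry>
 *   <member ... />
 * </family>
 * </pre>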
* * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.11 $, $Date: 2010/11/30 18:51:05 $ */ public class Family { private String name = ""; private Boolean closed = Boolean.FALSE; private String pos = ""; private String indexRel = ""; private String coartRel = ""; private DataItem[] data; private EntriesItem[] entries; @SuppressWarnings("unchecked") public Family(Element famel) { setName(famel.getAttributeValue("name")); pos = famel.getAttributeValue("pos"); String isClosed = famel.getAttributeValue("closed"); if (isClosed != null && isClosed.equals("true")) { setClosed(Boolean.TRUE); } String indexRelVal = famel.getAttributeValue("indexRel"); if (indexRelVal != null) { indexRel = indexRelVal; } String coartRelVal = famel.getAttributeValue("coartRel"); if (coartRelVal != null) { coartRel = coartRelVal; } List entriesList = famel.getChildren("entry"); entries = new EntriesItem[entriesList.size()]; for (int j=0; j < entriesList.size(); j++) { entries[j] = new EntriesItem(entriesList.get(j), this); } List members = famel.getChildren("member"); data = new DataItem[members.size()]; for (int j=0; j < members.size(); j++) { data[j] = new DataItem(members.get(j)); } } public Family(String s) { setName(s); } public boolean isClosed() { return closed.booleanValue(); } public void setName(String s) { name = s; } public void setClosed(Boolean b) { closed = b; } public void setPOS(String s) { pos = s; } public void setIndexRel(String s) { indexRel = s; } public void setCoartRel(String s) { coartRel = s; } public void setData(DataItem[] dm) { data = dm; } public void setEntries(EntriesItem[] em) { entries = em; } public String getName() { return name; } /** Delegates to first entry. */ public String getSupertag() { return entries[0].getSupertag(); } public Boolean getClosed() { return closed; } public String getPOS() { return pos; } public String getIndexRel() { return indexRel; } public String getCoartRel() { return coartRel; } public DataItem[] getData() { return data; } public EntriesItem[] getEntries() { return entries; } } ================================================ FILE: src/opennlp/ccg/lexicon/FullWord.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.util.*; import java.util.*; /** * A FullWord object is a word with all possible fields. * The factory methods return interned objects. 
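 * Illustratively, since instances are interned, two factory calls with
 * identical factors return the same object, so words obtained from the
 * factory can be compared by reference.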
* * @author Michael White * @version $Revision: 1.6 $, $Date: 2009/07/17 04:23:30 $ */ public class FullWord extends WordWithPitchAccent { private static final long serialVersionUID = -3115687437782457735L; /** List of attribute-value pairs, which must be strings. */ protected List> attrValPairs; /** The stem. */ protected String stem; /** The part of speech. */ protected String POS; /** The supertag. */ protected String supertag; /** The semantic class (optional). */ protected String semClass; /** Returns the list of extra attribute-value pairs. */ protected List> getAttrValPairsList() { return attrValPairs; } /** Returns the stem. */ public String getStem() { return stem; } /** Returns the part of speech. */ public String getPOS() { return POS; } /** Returns the supertag. */ public String getSupertag() { return supertag; } /** Returns the semantic class (may be null). */ public String getSemClass() { return semClass; } /** Constructor for full word. */ protected FullWord( String form, String pitchAccent, List> attrValPairs, String stem, String POS, String supertag, String semClass ) { super(form, pitchAccent); this.attrValPairs = attrValPairs; this.stem = stem; this.POS = POS; this.supertag = supertag; this.semClass = semClass; } /** Factory. */ public static class Factory implements WordFactory { // reusable word, for looking up already interned ones private FullWord w = new FullWord(null, null, null, null, null, null, null); // sets the form and factors of the reusable word w private void setW( String form, String pitchAccent, List> attrValPairs, String stem, String POS, String supertag, String semClass ) { w.form = form; w.pitchAccent = pitchAccent; w.attrValPairs = attrValPairs; w.stem = stem; w.POS = POS; w.supertag = supertag; w.semClass = semClass; } // looks up the word equivalent to w, or if none, returns a new one based on it private Word getOrCreateFromW() { Word retval = (Word) Interner.getGlobalInterned(w); if (retval != null) return retval; if (w.isSurfaceWord() && w.attrValPairs == null) { if (w.pitchAccent == null) retval = new SimpleWord(w.form); else retval = new WordWithPitchAccent(w.form, w.pitchAccent); } else retval = new FullWord(w.form, w.pitchAccent, w.attrValPairs, w.stem, w.POS, w.supertag, w.semClass); return (Word) Interner.globalIntern(retval); } /** Creates a surface word with the given interned form. */ public synchronized Word create(String form) { return create(form, null, null, null, null, null, null); } /** Creates a (surface or full) word with the given normalized attribute name and value. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used for the form, ..., semantic class. */ public synchronized Word create(String attr, String val) { String form = null; String pitchAccent = null; List> attrValPairs = null; String stem = null; String POS = null; String supertag = null; String semClass = null; if (attr == Tokenizer.WORD_ATTR) form = val; else if (attr == Tokenizer.PITCH_ACCENT_ATTR) pitchAccent = val; else if (attr == Tokenizer.STEM_ATTR) stem = val; else if (attr == Tokenizer.POS_ATTR) POS = val; else if (attr == Tokenizer.SUPERTAG_ATTR) supertag = val; else if (attr == Tokenizer.SEM_CLASS_ATTR) semClass = val; else { attrValPairs = new ArrayList>(1); attrValPairs.add(new Pair(attr, val)); } return create(form, pitchAccent, attrValPairs, stem, POS, supertag, semClass); } /** Creates a (surface or full) word from the given canonical factors. 
*/ public synchronized Word create( String form, String pitchAccent, List> attrValPairs, String stem, String POS, String supertag, String semClass ) { setW(form, pitchAccent, attrValPairs, stem, POS, supertag, semClass); return getOrCreateFromW(); } } } ================================================ FILE: src/opennlp/ccg/lexicon/LexException.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-3 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; /** * Any exception thrown if something wrong happens in the lexicon. * * @author Gann Bierner * @author Michael White * @version $Revision: 1.3 $, $Date: 2005/10/20 17:30:30 $ */ public class LexException extends Exception { private static final long serialVersionUID = 1L; /** The message. */ protected String msg; /** * Constructor with message. */ public LexException(String s) { msg = s; } /** Returns exception message. */ public String toString() { return "Lexicon Exception: " + msg; } } ================================================ FILE: src/opennlp/ccg/lexicon/Lexicon.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-9 Jason Baldridge, Gann Bierner and // Michael White (University of Edinburgh, The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import opennlp.ccg.unify.*; import opennlp.ccg.util.*; import opennlp.ccg.hylo.*; import org.jdom.*; import java.io.*; import java.net.*; import java.util.*; import gnu.trove.*; /** * Contains words and their associated categories and semantics. * Lookup can be filtered by plugging in a supertagger. 
* * * @author Gann Bierner * @author Jason Baldridge * @author Michael White * @version $Revision: 1.78 $, $Date: 2011/10/31 02:01:06 $ */ public class Lexicon { /** Flag used to indicate a purely syntactic edge, with no associated semantics. */ public static final String NO_SEM_FLAG = "*NoSem*"; /** Constant used to signal the substitution of the stem or pred. */ public static final String DEFAULT_VAL = "[*DEFAULT*]"; // supertagger private SupertaggerAdapter _supertagger = null; // various maps private GroupMap _words; private GroupMap _stems; private GroupMap _macros; private HashMap _macroItems; private GroupMap _posToEntries; private GroupMap _stagToEntries; private GroupMap _predToWords; private GroupMap _relsToPreds; private GroupMap _coartRelsToPreds; // coarticulation attrs private Set _coartAttrs; private Set _indexedCoartAttrs; // attrs per atomic category type, across all entries private GroupMap _catsToAttrs; private Set _lfAttrs; // distributive attributes private String[] _distributiveAttrs = null; // licensing features private LicensingFeature[] _licensingFeatures = null; // relation sorting private HashMap _relationIndexMap = new HashMap(); // interner for caching lex lookups during realization private Interner lookupCache = new Interner(true); /** The grammar that this lexicon is part of. */ public final Grammar grammar; /** The tokenizer. (Defaults to DefaultTokenizer.) */ public final Tokenizer tokenizer; /** Flag for whether the lexicon is open, ie complete lexical category mappings are not expected. (Defaults to false.) */ public boolean openlex = false; /** Flag for whether to show warnings for failed sem class unification. (Defaults to false.) */ public boolean debugSemClasses = false; /************************************************************* * Constructor *************************************************************/ public Lexicon(Grammar grammar) { this.grammar = grammar; this.tokenizer = new DefaultTokenizer(); } /** Constructor with tokenizer. */ public Lexicon(Grammar grammar, Tokenizer tokenizer) { this.grammar = grammar; this.tokenizer = tokenizer; } //------------------------------------------------------------- /** Sets the supertagger (null if none). */ public void setSupertagger(SupertaggerAdapter supertagger) { _supertagger = supertagger; } /** Loads the lexicon and morph files. 
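 * A minimal usage sketch (the file names and the grammar variable here
 * are hypothetical; in practice the URLs normally come from the grammar
 * definition):
 * <pre>
 * Lexicon lexicon = new Lexicon(grammar);
 * lexicon.init(new File("lexicon.xml").toURI().toURL(),
 *              new File("morph.xml").toURI().toURL());
 * </pre>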
*/ public void init(URL lexiconUrl, URL morphUrl) throws IOException { List lexicon = null; List morph = null; List macroModel = null; // load category families (lexicon), morph forms and macros lexicon = getLexicon(lexiconUrl); Pair,List> morphInfo = getMorph(morphUrl); morph = morphInfo.a; macroModel = morphInfo.b; // index words; also index stems to words, as default preds // store indexed coarticulation attrs too _words = new GroupMap(); _predToWords = new GroupMap(); _coartAttrs = new HashSet(); _indexedCoartAttrs = new HashSet(); for (MorphItem morphItem : morph) { Word surfaceWord = morphItem.getSurfaceWord(); _words.put(surfaceWord, morphItem); _predToWords.put(morphItem.getWord().getStem(), surfaceWord); if (morphItem.isCoart()) { Word indexingWord = morphItem.getCoartIndexingWord(); _words.put(indexingWord, morphItem); Pair first = indexingWord.getSurfaceAttrValPairs().next(); _indexedCoartAttrs.add(first.a); for (Iterator> it = surfaceWord.getSurfaceAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); _coartAttrs.add(p.a); } } } // index entries based on stem+pos _stems = new GroupMap(); _posToEntries = new GroupMap(); // index entries by supertag+pos, for supertagging _stagToEntries = new GroupMap(); // also index rels and coart rels to preds _relsToPreds = new GroupMap(); _coartRelsToPreds = new GroupMap(); // and gather list of attributes used per atomic category type _catsToAttrs = new GroupMap(); _lfAttrs = new HashSet(); // and remember family and ent, names, for checking excluded list on morph items HashSet familyAndEntryNames = new HashSet(); // index each family for (Family family : lexicon) { familyAndEntryNames.add(family.getName()); EntriesItem[] entries = family.getEntries(); DataItem[] data = family.getData(); // for generic use when we get an unknown stem // from the morphological analyzer if (!family.isClosed()) { _posToEntries.put(family.getPOS(), entries); } // scan through entries for (int j=0; j < entries.length; j++) { // index EntriesItem eItem = entries[j]; _stagToEntries.put(eItem.getSupertag()+family.getPOS(), eItem); if (eItem.getStem().length() > 0) { _stems.put(eItem.getStem()+family.getPOS(), eItem); } try { // gather features eItem.getCat().forall(gatherAttrs); // record names familyAndEntryNames.add(eItem.getName()); familyAndEntryNames.add(eItem.getQualifiedName()); } catch (RuntimeException exc) { System.err.println("exception for: " + family.getName() + ": " + exc); } } // scan through data for (int j=0; j < data.length; j++) { DataItem dItem = data[j]; _stems.put(dItem.getStem()+family.getPOS(), new Pair(dItem,entries)); // index non-default preds to words if (!dItem.getStem().equals(dItem.getPred())) { Collection words = (Collection) _predToWords.get(dItem.getStem()); if (words == null) { if (!openlex) { System.out.print("Warning: couldn't find words for pred '"); System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'"); } } else { for (Iterator it = words.iterator(); it.hasNext(); ) { _predToWords.put(dItem.getPred(), it.next()); } } } } // index rels to preds // nb: this covers relational (eg @xe) and featural (eg @epast) // elementary predications List indexRels = new ArrayList(3); String familyIndexRel = family.getIndexRel(); if (familyIndexRel.length() > 0) { indexRels.add(familyIndexRel); } for (int j=0; j < entries.length; j++) { EntriesItem eItem = entries[j]; String indexRel = eItem.getIndexRel(); if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) { indexRels.add(indexRel); } } for (Iterator it = 
indexRels.iterator(); it.hasNext(); ) { String indexRel = it.next(); // nb: not indexing on entries items, b/c some stems are still defaults for (int j=0; j < data.length; j++) { DataItem dItem = data[j]; _relsToPreds.put(indexRel, dItem.getPred()); } } // index coart rels (features, really) to preds String coartRel = family.getCoartRel(); if (coartRel.length() > 0) { for (int j=0; j < data.length; j++) { _coartRelsToPreds.put(coartRel, data[j].getPred()); } } } // index the macros _macros = new GroupMap(); // nb: could just index MacroItem objects for feature structures too; // this might be a bit cleaner, but life is short _macroItems = new HashMap(); for (MacroItem mi : macroModel) { String macName = mi.getName(); FeatureStructure[] specs = mi.getFeatureStructures(); for (int j=0; j < specs.length; j++) { _macros.put(macName, specs[j]); } // this is for handling LF part of macros _macroItems.put(macName, mi); } // with morph items, check POS, macro names, excluded list for xref for (MorphItem morphItem : morph) { Word w = morphItem.getWord(); if (!openlex && !_stems.containsKey(w.getStem() + w.getPOS()) && !_posToEntries.containsKey(w.getPOS())) { System.err.println( "Warning: no entries for stem '" + w.getStem() + "' and POS '" + w.getPOS() + "' found for word '" + w + "'" ); } String[] macroNames = morphItem.getMacros(); for (int j=0; j < macroNames.length; j++) { if (!_macroItems.containsKey(macroNames[j])) { System.err.println("Warning: macro " + macroNames[j] + " not found for word '" + morphItem.getWord() + "'"); } } String[] excludedNames = morphItem.getExcluded(); for (int j=0; j < excludedNames.length; j++) { if (!familyAndEntryNames.contains(excludedNames[j])) { System.err.println("Warning: excluded family or entry '" + excludedNames[j] + "' not found for word '" + morphItem.getWord() + "'"); } } } } /** Expands inheritsFrom links to feature equations for those features not explicitly listed. */ public void expandInheritsFrom(Category cat) { expandInheritsFrom(cat, null); } /** Expands inheritsFrom links to feature equations for those features not explicitly listed. 
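 * For example, if an atomic category's feature structure declares an
 * inheritsFrom index pointing at another feature structure with a "num"
 * feature, the "num" value is copied over; any further attribute that is
 * used with that category type but present in neither structure is filled
 * in with a fresh variable shared by both structures, so that they are
 * forced to agree under unification.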
*/ public void expandInheritsFrom(Category cat, Category cat2) { // index feature structures featStrucMap.clear(); cat.forall(indexFeatStrucs); if (cat2 != null) { cat2.forall(indexFeatStrucs); } // add feature eqs cat.forall(doInheritsFrom); if (cat2 != null) { cat2.forall(doInheritsFrom); } } // gathers attrs from a category private CategoryFcn gatherAttrs = new CategoryFcnAdapter() { public void forall(Category c) { if (!(c instanceof AtomCat)) return; String type = ((AtomCat)c).getType(); FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; for (Iterator it = fs.getAttributes().iterator(); it.hasNext(); ) { String att = it.next(); _catsToAttrs.put(type, att); if (fs.getValue(att) instanceof LF) { _lfAttrs.add(att); } } } }; // a map from indices to atomic categories, reset for each category private TIntObjectHashMap featStrucMap = new TIntObjectHashMap(); // fills in featStrucMap for a category private CategoryFcn indexFeatStrucs = new CategoryFcnAdapter() { public void forall(Category c) { FeatureStructure fs = c.getFeatureStructure(); if (fs != null && fs.getIndex() != 0) featStrucMap.put(fs.getIndex(), fs); } }; // adds feature equations to percolate attributes from inheritsFrom feature // structure, except for any attributes already present private CategoryFcn doInheritsFrom = new CategoryFcnAdapter() { public void forall(Category c) { // get feature structures if (!(c instanceof AtomCat)) return; String type = ((AtomCat)c).getType(); FeatureStructure fs = c.getFeatureStructure(); GFeatStruc gfs = (GFeatStruc) fs; if (gfs == null || gfs.getInheritsFrom() == 0) return; int inhf = gfs.getInheritsFrom(); FeatureStructure inhfFS = (FeatureStructure) featStrucMap.get(inhf); if (inhfFS != null) { // copy values of features from inhfFS not already present for (Iterator it = inhfFS.getAttributes().iterator(); it.hasNext(); ) { String att = it.next(); if (gfs.hasAttribute(att)) continue; gfs.setFeature(att, UnifyControl.copy(inhfFS.getValue(att))); } // for each possible attr used with this type and not already present, // add feature equation Collection attrs = (Collection) _catsToAttrs.get(type); if (attrs == null) return; for (Iterator it = attrs.iterator(); it.hasNext(); ) { String att = it.next(); if (gfs.hasAttribute(att)) continue; String varName = att.toUpperCase() + inhf; if (_lfAttrs.contains(att)) { gfs.setFeature(att, new HyloVar(varName)); inhfFS.setFeature(att, new HyloVar(varName)); } else { gfs.setFeature(att, new GFeatVar(varName)); inhfFS.setFeature(att, new GFeatVar(varName)); } } } else { System.err.println( "Warning: no feature structure with inheritsFrom index of " + inhf + " found in category " + c ); } } }; /** * Returns the lexical signs indexed by the given rel, or null if none. 
*/ public Collection getSignsFromRel(String rel) { // check cache (if not doing supertagging) if (_supertagger == null) { RelLookup lookup = new RelLookup(rel); RelLookup retLookup = (RelLookup) lookupCache.getInterned(lookup); if (retLookup != null) return retLookup.signs; } // lookup signs via preds Collection preds = (Collection) _relsToPreds.get(rel); if (preds == null) return null; Collection retval = getSignsFromRelAndPreds(rel, preds); // cache non-null result (if not doing supertagging) if (_supertagger == null && retval != null) { RelLookup lookup = new RelLookup(rel); lookup.signs = retval; lookupCache.intern(lookup); } return retval; } // get signs for rel via preds, or null if none private Collection getSignsFromRelAndPreds(String rel, Collection preds) { List retval = new ArrayList(); for (Iterator it = preds.iterator(); it.hasNext(); ) { String pred = it.next(); Collection signs = getSignsFromPredAndTargetRel(pred, rel); if (signs != null) retval.addAll(signs); } // return null if none survive filter if (retval.size() > 0) return retval; else return null; } /** * Returns the lexical signs indexed by the given pred. * If the pred is not listed in the lexicon, the tokenizer is * consulted to see if it is a special token (date, time, etc.); * otherwise, null is returned. * Coarticulations are applied for the given rels, if non-null. */ public Collection getSignsFromPred(String pred, List coartRels) { // check cache (if not doing supertagging) if (_supertagger == null) { PredLookup lookup = new PredLookup(pred, coartRels); PredLookup retLookup = (PredLookup) lookupCache.getInterned(lookup); if (retLookup != null) return retLookup.signs; } // lookup pred Collection result = getSignsFromPredAndTargetRel(pred, null); if (result == null) return null; // apply coarts for rels if (coartRels != null) applyCoarts(coartRels, result); // cache result (if not doing supertagging) if (_supertagger == null) { PredLookup lookup = new PredLookup(pred, coartRels); lookup.signs = result; lookupCache.intern(lookup); } // and return return result; } // get signs using an additional arg for a target rel private Collection getSignsFromPredAndTargetRel(String pred, String targetRel) { Collection words = (Collection) _predToWords.get(pred); String specialTokenConst = null; // for robustness, when using supertagger, add words for pred sans sense index int dotIndex = -1; if (_supertagger != null && !Character.isDigit(pred.charAt(0)) && // skip numbers (dotIndex = pred.lastIndexOf('.')) > 0 && pred.length() > dotIndex+1 && pred.charAt(dotIndex+1) != '_') // skip titles, eg Mr._Smith { String barePred = pred.substring(0, dotIndex); Collection barePredWords = (Collection) _predToWords.get(barePred); if (words == null) words = barePredWords; else if (barePredWords != null) { Set unionWords = new HashSet(words); unionWords.addAll(barePredWords); words = unionWords; } } if (words == null) { specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(pred)); if (specialTokenConst == null) return null; // lookup words with pred = special token const Collection specialTokenWords = (Collection) _predToWords.get(specialTokenConst); // replace special token const with pred if (specialTokenWords == null) return null; words = new ArrayList(specialTokenWords.size()); for (Iterator it = specialTokenWords.iterator(); it.hasNext(); ) { Word stw = it.next(); Word w = Word.createSurfaceWord(stw, pred); words.add(w); } } List retval = new ArrayList(); for (Iterator it = words.iterator(); it.hasNext(); ) { 
Word w = it.next(); try { SignHash signs = getSignsFromWord(w, specialTokenConst, pred, targetRel); retval.addAll(signs.asSignSet()); } // shouldn't happen catch (LexException exc) { System.err.println("Unexpected lex exception for word " + w + ": " + exc); } } return retval; } // look up and apply coarts for given rels to each sign in result private void applyCoarts(List coartRels, Collection result) { List inputSigns = new ArrayList(result); result.clear(); List outputSigns = new ArrayList(inputSigns.size()); // for each rel, lookup coarts and apply to input signs, storing results in output signs for (Iterator it = coartRels.iterator(); it.hasNext(); ) { String rel = it.next(); Collection preds = (Collection) _coartRelsToPreds.get(rel); if (preds == null) continue; // not expected Collection coartResult = getSignsFromRelAndPreds(rel, preds); if (coartResult == null) continue; for (Iterator it2 = coartResult.iterator(); it2.hasNext(); ) { Sign coartSign = it2.next(); // apply to each input for (int j = 0; j < inputSigns.size(); j++) { Sign sign = inputSigns.get(j); grammar.rules.applyCoart(sign, coartSign, outputSigns); } } // switch output to input for next iteration inputSigns.clear(); inputSigns.addAll(outputSigns); outputSigns.clear(); } // add results back result.addAll(inputSigns); } /** * For a string of 1 or more surface words, return all of the lexical * entries for each word as a list of sign hashes. * Tokenization is performed using the configured tokenizer. * * @param w the words in string format * @return a list of sign hashes * @exception LexException thrown if word not found */ public List getEntriesFromWords(String s) throws LexException { List entries = new ArrayList(); List words = tokenizer.tokenize(s); for (Iterator it = words.iterator(); it.hasNext(); ) { Word w = it.next(); SignHash signs = getSignsFromWord(w); if (signs.size() == 0) { throw new LexException("Word not in lexicon: \"" + w +"\""); } entries.add(signs); } return entries; } /** * For a given word, return all of its surface word's lexical entries. * If the word is not listed in the lexicon, the tokenizer is * consulted to see if it is a special token (date, time, etc.); * otherwise an exception is thrown. * If the word has coarticulations, all applicable coarticulation * entries are applied to the base word, in an arbitrary order. * * @param w the word * @return a sign hash * @exception LexException thrown if word not found */ public SignHash getSignsFromWord(Word w) throws LexException { // reduce word to its core, removing coart attrs if any Word surfaceWord = Word.createSurfaceWord(w); Word coreWord = (surfaceWord.attrsIntersect(_coartAttrs)) ? 
Word.createCoreSurfaceWord(surfaceWord, _coartAttrs) : surfaceWord; // lookup core word SignHash result = getSignsFromWord(coreWord, null, null, null); if (result.size() == 0) { throw new LexException(coreWord + " not found in lexicon"); } // return signs if no coart attrs if (coreWord == surfaceWord) return result; // otherwise apply coarts for word applyCoarts(surfaceWord, result); return result; } // look up and apply coarts for w to each sign in result @SuppressWarnings("unchecked") private void applyCoarts(Word w, SignHash result) throws LexException { List inputSigns = new ArrayList(result.asSignSet()); result.clear(); List outputSigns = new ArrayList(inputSigns.size()); // for each surface attr, lookup coarts and apply to input signs, storing results in output signs for (Iterator> it = w.getSurfaceAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); String attr = (String) p.a; if (!_indexedCoartAttrs.contains(attr)) continue; String val = (String) p.b; Word coartWord = Word.createWord(attr, val); SignHash coartResult = getSignsFromWord(coartWord, null, null, null); for (Iterator it2 = coartResult.iterator(); it2.hasNext(); ) { Sign coartSign = it2.next(); // apply to each input for (int j = 0; j < inputSigns.size(); j++) { Sign sign = inputSigns.get(j); grammar.rules.applyCoart(sign, coartSign, outputSigns); } } // switch output to input for next iteration inputSigns.clear(); inputSigns.addAll(outputSigns); outputSigns.clear(); } // add results back result.addAll(inputSigns); } // get signs with additional args for a known special token const, target pred and target rel private SignHash getSignsFromWord(Word w, String specialTokenConst, String targetPred, String targetRel) throws LexException { Collection morphItems = (specialTokenConst == null) ? 
(Collection) _words.get(w) : null; if (morphItems == null) { // check for special tokens if (specialTokenConst == null) { specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(w.getForm())); targetPred = w.getForm(); } if (specialTokenConst != null) { Word key = Word.createSurfaceWord(w, specialTokenConst); morphItems = (Collection) _words.get(key); } // otherwise throw lex exception if (morphItems == null) throw new LexException(w + " not in lexicon"); } SignHash result = new SignHash(); for (Iterator MI = morphItems.iterator(); MI.hasNext();) { getWithMorphItem(w, MI.next(), targetPred, targetRel, result); } return result; } // given MorphItem private void getWithMorphItem(Word w, MorphItem mi, String targetPred, String targetRel, SignHash result) throws LexException { // get supertags for filtering, if a supertagger is installed Map supertags = null; Set supertagsFound = null; if (_supertagger != null) { supertags = _supertagger.getSupertags(); if (supertags != null) supertagsFound = new HashSet(supertags.size()); } // get macro adder MacroAdder macAdder = getMacAdder(mi); // if we have this stem in our lexicon String stem = mi.getWord().getStem(); String pos = mi.getWord().getPOS(); Set explicitEntries = null; // for storing entries from explicitly listed family members if (_stems.containsKey(stem+pos)) { explicitEntries = new HashSet(); Collection stemItems = (Collection)_stems.get(stem+pos); for (Iterator I=stemItems.iterator(); I.hasNext();) { Object item = I.next(); // see if it's an EntriesItem if (item instanceof EntriesItem) { EntriesItem entry = (EntriesItem) item; // do lookup getWithEntriesItem(w, mi, stem, stem, targetPred, targetRel, entry, macAdder, supertags, supertagsFound, result); } // otherwise it has to be a Pair containing a DataItem and // an EntriesItem[] else { @SuppressWarnings("rawtypes") DataItem dItem = (DataItem)((Pair)item).a; @SuppressWarnings("rawtypes") EntriesItem[] entries = (EntriesItem[])((Pair)item).b; // store entries explicitEntries.add(entries); // do lookup getWithDataItem(w, mi, dItem, entries, targetPred, targetRel, macAdder, supertags, supertagsFound, result); } } } // for entries that are not explicitly in the lexicon file, we have to create // Signs from the open class entries with the appropriate part-of-speech Collection entrySets = (Collection)_posToEntries.get(pos); if (entrySets != null) { for (Iterator E=entrySets.iterator(); E.hasNext(); ) { EntriesItem[] entries = E.next(); // skip if entries explicitly listed if (explicitEntries != null && explicitEntries.contains(entries)) continue; // otherwise get entries with pred = targetPred, or stem if null String pred = (targetPred != null) ? targetPred : stem; getWithDataItem(w, mi, new DataItem(stem, pred), entries, targetPred, targetRel, macAdder, supertags, supertagsFound, result); } } // finally do entries for any remaining supertags if (supertags != null) { for (String supertag : supertags.keySet()) { if (supertagsFound.contains(supertag)) continue; Set entries = _stagToEntries.get(supertag+pos); if (entries == null) continue; // nb: could be a POS mismatch // get entries with pred = targetPred, or stem if null String pred = (targetPred != null) ? 
targetPred : stem; for (EntriesItem entry : entries) { if (!entry.getStem().equals(DEFAULT_VAL)) continue; getWithEntriesItem(w, mi, stem, pred, targetPred, targetRel, entry, macAdder, supertags, supertagsFound, result); } } } } // given DataItem private void getWithDataItem(Word w, MorphItem mi, DataItem item, EntriesItem[] entries, String targetPred, String targetRel, MacroAdder macAdder, Map supertags, Set supertagsFound, SignHash result) { for (int i=0; i < entries.length; i++) { EntriesItem entry = entries[i]; if (entry.getStem().equals(DEFAULT_VAL)) { getWithEntriesItem(w, mi, item.getStem(), item.getPred(), targetPred, targetRel, entry, macAdder, supertags, supertagsFound, result); } } } // given EntriesItem private void getWithEntriesItem(Word w, MorphItem mi, String stem, String pred, String targetPred, String targetRel, EntriesItem item, MacroAdder macAdder, Map supertags, Set supertagsFound, SignHash result) { // ensure apropos if (targetPred != null && !targetPred.equals(pred)) return; if (targetRel != null && !targetRel.equals(item.getIndexRel()) && !targetRel.equals(item.getCoartRel())) return; if (!item.getActive().booleanValue()) return; if (mi.excluded(item)) return; try { // copy and add macros Category cat = item.getCat().copy(); macAdder.addMacros(cat); // replace DEFAULT_VAL with pred, after first // unifying type of associated nom var(s) with sem class unifySemClass(cat, mi.getWord().getSemClass()); REPLACEMENT = pred; cat.deepMap(defaultReplacer); // check supertag // TODO: think about earlier checks for efficiency, for grammars where macros and preds don't matter //Double lexprob = null; // nb: skipping lex log probs, don't seem to be helpful if (supertags != null) { // skip if not found String stag = cat.getSupertag(); if (!supertags.containsKey(stag)) return; // otherwise update found supertags supertagsFound.add(stag); // get lex prob //lexprob = supertags.get(stag); } // propagate types of nom vars propagateTypes(cat); // handle distrib attrs and inherits-from propagateDistributiveAttrs(cat); expandInheritsFrom(cat); // merge stem, pos, sem class from morph item, plus supertag from cat Word word = Word.createFullWord(w, mi.getWord(), cat.getSupertag()); // set origin and lexprob Sign sign = new Sign(word, cat); sign.setOrigin(); //if (lexprob != null) { // sign.addData(new SupertaggerAdapter.LexLogProb((float) Math.log10(lexprob))); //} // return sign result.insert(sign); } catch (RuntimeException exc) { System.err.println( "Warning: ignoring entry: " + item.getName() + " of family: " + item.getFamilyName() + " for stem: " + stem + " b/c: " + exc.toString() ); } } // the sem class for defaultNomvarSetter private SimpleType SEMCLASS = null; // unify sem class with default nom var(s) private void unifySemClass(Category cat, String semClass) { if (semClass == null || cat.getLF() == null) return; SEMCLASS = grammar.types.getSimpleType(semClass); try { cat.getLF().deepMap(defaultNomvarUnifier); } catch (TypePropagationException tpe) { if (debugSemClasses) { System.err.println( "Warning: unable to unify types '" + tpe.st1 + "' and '" + tpe.st2 + "' in unifying sem class in cat: \n" + cat ); } } } // mod function to unify type of nom var for DEFAULT_VAL with SEMCLASS private ModFcn defaultNomvarUnifier = new ModFcn() { public void modify(Mutable m) { if (!(m instanceof SatOp)) return; SatOp satop = (SatOp) m; if (!(satop.getArg() instanceof Proposition)) return; Proposition prop = (Proposition) satop.getArg(); if (!prop.getName().equals(DEFAULT_VAL)) return; if 
(!(satop.getNominal() instanceof NominalVar)) return; NominalVar nv = (NominalVar) satop.getNominal(); SimpleType st = nv.getType(); // check equality if (st.equals(SEMCLASS)) return; // otherwise unify types, update nv try { SimpleType stU = (SimpleType) st.unify(SEMCLASS, null); nv.setType(stU); } catch (UnifyFailure uf) { throw new TypePropagationException(st, SEMCLASS); } } }; // the replacement string for defaultReplacer private String REPLACEMENT = ""; // mod function to replace DEFAULT_VAL with REPLACEMENT private ModFcn defaultReplacer = new ModFcn() { public void modify(Mutable m) { if (m instanceof Proposition) { Proposition prop = (Proposition) m; if (prop.getName().equals(DEFAULT_VAL)) prop.setAtomName(REPLACEMENT); } else if (m instanceof FeatureStructure) { FeatureStructure fs = (FeatureStructure) m; for (Iterator it = fs.getAttributes().iterator(); it.hasNext(); ) { String attr = it.next(); Object val = fs.getValue(attr); if (val instanceof SimpleType && ((SimpleType)val).getName().equals(DEFAULT_VAL)) { fs.setFeature(attr, grammar.types.getSimpleType(REPLACEMENT)); } } } } }; // a cache for macro adders private Map macAdderMap = new HashMap(); // returns a macro adder for the given morph item private MacroAdder getMacAdder(MorphItem mi) { // check map MacroAdder retval = macAdderMap.get(mi); if (retval != null) return retval; // set up macro adder IntHashSetMap macrosFromLex = new IntHashSetMap(); String[] newMacroNames = mi.getMacros(); List macroItems = new ArrayList(); for (int i=0; i < newMacroNames.length; i++) { Set featStrucs = (Set)_macros.get(newMacroNames[i]); if (featStrucs != null) { for (Iterator fsIt = featStrucs.iterator(); fsIt.hasNext();) { FeatureStructure fs = fsIt.next(); macrosFromLex.put(fs.getIndex(), fs); } } MacroItem macroItem = _macroItems.get(newMacroNames[i]); if (macroItem != null) { macroItems.add(macroItem); } else { // should be checked earlier too System.err.println("Warning: macro " + newMacroNames[i] + " not found for word '" + mi.getWord() + "'"); } } retval = new MacroAdder(macrosFromLex, macroItems); // update map and return macAdderMap.put(mi, retval); return retval; } // // type propagation // /** Propagates types of nomvars in the given category. */ public void propagateTypes(Category cat) { propagateTypes(cat, null); } /** Propagates types of nomvars in the given categories. 
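* <p>Nominal variables are keyed by name, so if the same variable occurs with
* different types within the category (or across the two categories), the types
* are unified and every occurrence is updated to the unified type; unification
* failures are only reported when sem class debugging is on. Sketch
* (variable names hypothetical):
* <pre>{@code
* Category cat = entriesItem.getCat().copy();
* Category cat2 = argItem.getCat().copy();
* propagateTypes(cat);        // single category
* propagateTypes(cat, cat2);  // or both categories involved in a lookup
* }</pre>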
*/ public void propagateTypes(Category cat, Category cat2) { try { nomvarMap.clear(); cat.deepMap(nomvarTypePropagater); if (cat2 != null) cat2.deepMap(nomvarTypePropagater); cat.deepMap(nomvarTypePropagater); if (cat2 != null) cat2.deepMap(nomvarTypePropagater); } catch (TypePropagationException tpe) { if (debugSemClasses) { System.err.println( "Warning: unable to unify types '" + tpe.st1 + "' and '" + tpe.st2 + "' in cat: \n" + cat ); if (cat2 != null) System.err.println("and cat: \n" + cat2); } } } // a map from a cat's nomvars to types, // just using the var's name for equality @SuppressWarnings("unchecked") private Map nomvarMap = new THashMap( new TObjectHashingStrategy() { private static final long serialVersionUID = 1L; public int computeHashCode(Object o) { return ((NominalVar)o).getName().hashCode(); } public boolean equals(Object o1, Object o2) { return ((NominalVar)o1).getName().equals(((NominalVar)o2).getName()); } } ); // exception for unification failures in propagating types private class TypePropagationException extends RuntimeException { private static final long serialVersionUID = 1L; SimpleType st1; SimpleType st2; TypePropagationException(SimpleType st1, SimpleType st2) { this.st1 = st1; this.st2 = st2; } } // mod function to propagate nomvar types; // needs to be called twice after clearing nomvarMap private ModFcn nomvarTypePropagater = new ModFcn() { public void modify(Mutable m) { if (m instanceof NominalVar) { NominalVar nv = (NominalVar) m; SimpleType st = nv.getType(); SimpleType st0 = nomvarMap.get(nv); // add type to map if no type found if (st0 == null) { nomvarMap.put(nv, st); return; } // check equality if (st.equals(st0)) return; // otherwise unify types, update nv and map try { SimpleType stU = (SimpleType) st.unify(st0, null); nv.setType(stU); nomvarMap.put(nv, stU); } catch (UnifyFailure uf) { throw new TypePropagationException(st, st0); } } } }; // // distributive attribute propagation // /** * Returns the list of distributive attributes, or null if none. */ public String[] getDistributiveAttrs() { return _distributiveAttrs; } /** * Gathers and propagates the unique values of each * distributive attribute. */ public void propagateDistributiveAttrs(Category cat) { propagateDistributiveAttrs(cat, null); } /** * Gathers and propagates the unique values of each * distributive attribute. 
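* <p>A value is propagated only when exactly one distinct value of a
* distributive attribute occurs in the given category (or categories); it is
* then copied onto every atomic category where that attribute is unset. For
* instance, with a hypothetical distributive attribute {@code mood}, if
* {@code dcl} is the only mood value found, {@code mood=dcl} is filled in
* wherever mood is missing; if two different values occur, nothing is propagated.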
*/ public void propagateDistributiveAttrs(Category cat, Category cat2) { if (_distributiveAttrs == null) return; resetDistrAttrVals(); cat.forall(gatherDistrAttrVals); if (cat2 != null) { cat2.forall(gatherDistrAttrVals); } cat.forall(propagateUniqueDistrAttrVals); if (cat2 != null) { cat2.forall(propagateUniqueDistrAttrVals); } } // an array of lists, one for each distributive attr @SuppressWarnings("rawtypes") private List[] distrAttrVals = null; @SuppressWarnings("rawtypes") private void resetDistrAttrVals() { if (distrAttrVals == null) { distrAttrVals = new List[_distributiveAttrs.length]; for (int i = 0; i < distrAttrVals.length; i++) { distrAttrVals[i] = new ArrayList(3); } return; } for (int i = 0; i < distrAttrVals.length; i++) { distrAttrVals[i].clear(); } } // gathers distinct values for each distributive attr private CategoryFcn gatherDistrAttrVals = new CategoryFcnAdapter() { @SuppressWarnings("unchecked") public void forall(Category c) { if (!(c instanceof AtomCat)) return; FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; for (int i = 0; i < _distributiveAttrs.length; i++) { String attr = _distributiveAttrs[i]; Object val = fs.getValue(attr); if (val != null && !distrAttrVals[i].contains(val)) { distrAttrVals[i].add(val); } } } }; // propagates unique values for each distributive attr private CategoryFcn propagateUniqueDistrAttrVals = new CategoryFcnAdapter() { public void forall(Category c) { if (!(c instanceof AtomCat)) return; FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; for (int i = 0; i < _distributiveAttrs.length; i++) { if (distrAttrVals[i].size() != 1) continue; Object distVal = distrAttrVals[i].get(0); String attr = _distributiveAttrs[i]; Object val = fs.getValue(attr); if (val == null) { fs.setFeature(attr, UnifyControl.copy(distVal)); } } } }; // // licensing features // /** * Returns the list of licensing features. */ public LicensingFeature[] getLicensingFeatures() { return _licensingFeatures; } /** * Returns the index of the given relation in the relation sort order, * or the index of "*" if the relation is not explicitly listed. */ public Integer getRelationSortIndex(String rel) { Integer retval = _relationIndexMap.get(rel); if (retval != null) return retval; retval = _relationIndexMap.get("*"); if (retval != null) return retval; return new Integer(-1); } // // access to maps (limited) // /** Returns whether the given rel (semantic feature, really) is one used to signal coarticulation. */ public boolean isCoartRel(String rel) { return _coartRelsToPreds.containsKey(rel); } // // classes for caching lex lookups during realization // // a class for caching lookups of signs from rels // nb: equality is checked just on the rel, to check for a cached lookup private static class RelLookup { String rel; Collection signs; RelLookup(String s) { rel = s; } public int hashCode() { return rel.hashCode(); } public boolean equals(Object obj) { return (obj instanceof RelLookup) && rel.equals(((RelLookup)obj).rel); } } // a class for caching lookups of signs from preds and coart rels // nb: equality is checked just on the pred and coart rels, to check for a cached lookup private static class PredLookup { String pred; List coartRels; Collection signs; PredLookup(String s, List l) { pred = s; coartRels = l; } public int hashCode() { return pred.hashCode() + ((coartRels != null) ? 
coartRels.hashCode() : 0); } public boolean equals(Object obj) { if (!(obj instanceof PredLookup)) return false; PredLookup pLook = (PredLookup) obj; if (!pred.equals(pLook.pred)) return false; if (coartRels == null) return (pLook.coartRels == null); return coartRels.equals(pLook.coartRels); } } // // XML loading routines // private class MorphScanner extends XmlScanner { List morphItems = new ArrayList(); List macroItems = new ArrayList(); public void handleElement(Element e) { // create morph item if (e.getName().equals("entry")) { try { morphItems.add(new MorphItem(e)); } catch (RuntimeException exc) { System.err.println("Skipping morph item: " + e.getAttributeValue("word")); System.err.println(exc.toString()); } } // create macro item else if (e.getName().equals("macro")) { try { macroItems.add(new MacroItem(e)); } catch (RuntimeException exc) { System.err.println("Skipping macro item: " + e.getAttributeValue("name")); System.err.println(exc.toString()); } } } }; private Pair,List> getMorph(URL url) throws IOException { // scan XML MorphScanner morphScanner = new MorphScanner(); morphScanner.parse(url); // return morph and macro items return new Pair,List>(morphScanner.morphItems, morphScanner.macroItems); } private class LexiconScanner extends XmlScanner { List lexicon = new ArrayList(); Element distrElt = null; Element licensingElt = null; Element relationSortingElt = null; public void handleElement(Element e) { // create family if (e.getName().equals("family")) { try { lexicon.add(new Family(e)); } catch (RuntimeException exc) { System.err.println("Skipping family: " + e.getAttributeValue("name")); System.err.println(exc.toString()); } } // save distributive attributes else if (e.getName().equals("distributive-features")) distrElt = e; // save licensing features else if (e.getName().equals("licensing-features")) licensingElt = e; // save relation sort order else if (e.getName().equals("relation-sorting")) relationSortingElt = e; } }; private List getLexicon(URL url) throws IOException { // scan XML, creating families LexiconScanner lexiconScanner = new LexiconScanner(); lexiconScanner.parse(url); // get distributive attributes, if any if (lexiconScanner.distrElt != null) { String distrAttrs = lexiconScanner.distrElt.getAttributeValue("attrs"); _distributiveAttrs = distrAttrs.split("\\s+"); } // load licensing features loadLicensingFeatures(lexiconScanner.licensingElt); // load relation sort order loadRelationSortOrder(lexiconScanner.relationSortingElt); // return families return lexiconScanner.lexicon; } // get licensing features, with appropriate defaults @SuppressWarnings("unchecked") private void loadLicensingFeatures(Element licensingElt) { List licensingFeats = new ArrayList(); boolean containsLexFeat = false; if (licensingElt != null) { for (Iterator it = licensingElt.getChildren("feat").iterator(); it.hasNext(); ) { Element featElt = it.next(); String attr = featElt.getAttributeValue("attr"); if (attr.equals("lex")) containsLexFeat = true; String val = featElt.getAttributeValue("val"); List alsoLicensedBy = null; String alsoVals = featElt.getAttributeValue("also-licensed-by"); if (alsoVals != null) { alsoLicensedBy = Arrays.asList(alsoVals.split("\\s+")); } boolean licenseEmptyCats = true; boolean licenseMarkedCats = false; boolean instantiate = true; byte loc = LicensingFeature.BOTH; String lmc = featElt.getAttributeValue("license-marked-cats"); if (lmc != null) { licenseMarkedCats = Boolean.valueOf(lmc).booleanValue(); // change defaults licenseEmptyCats = false; loc = 
LicensingFeature.TARGET_ONLY; instantiate = false; } String lec = featElt.getAttributeValue("license-empty-cats"); if (lec != null) { licenseEmptyCats = Boolean.valueOf(lec).booleanValue(); } String inst = featElt.getAttributeValue("instantiate"); if (inst != null) { instantiate = Boolean.valueOf(inst).booleanValue(); } String locStr = featElt.getAttributeValue("location"); if (locStr != null) { if (locStr.equals("target-only")) loc = LicensingFeature.TARGET_ONLY; if (locStr.equals("args-only")) loc = LicensingFeature.ARGS_ONLY; if (locStr.equals("both")) loc = LicensingFeature.BOTH; } licensingFeats.add( new LicensingFeature( attr, val, alsoLicensedBy, licenseEmptyCats, licenseMarkedCats, instantiate, loc ) ); } } if (!containsLexFeat) { licensingFeats.add(LicensingFeature.defaultLexFeature); } _licensingFeatures = new LicensingFeature[licensingFeats.size()]; licensingFeats.toArray(_licensingFeatures); } // default relation sort order private static String[] defaultRelationSortOrder = { "BoundVar", "PairedWith", "Restr", "Body", "Scope", "*", "GenRel", "Coord", "Append" }; // get relation sort order, or use defaults private void loadRelationSortOrder(Element relationSortingElt) { // use defaults if no order specified if (relationSortingElt == null) { for (int i = 0; i < defaultRelationSortOrder.length; i++) { _relationIndexMap.put(defaultRelationSortOrder[i], new Integer(i)); } return; } // otherwise load from 'order' attribute String orderAttr = relationSortingElt.getAttributeValue("order"); String[] relSortOrder = orderAttr.split("\\s+"); for (int i = 0; i < relSortOrder.length; i++) { _relationIndexMap.put(relSortOrder[i], new Integer(i)); } } /* * Accessor for words map */ public GroupMap getWords() { return _words; } } ================================================ FILE: src/opennlp/ccg/lexicon/LicensingFeature.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import java.util.*; /** * A licensing feature is one which controls the licensing and * instantiation of semantically null or marked categories * in the realizer. Defaults are handled in Lexicon.loadLicensingFeatures. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2009/12/21 03:27:18 $ */ public class LicensingFeature { /** * The name of the licensing feature. */ public final String attr; /** * The string value of the licensing feature, or null if any value will do. */ public final String val; /** * A list of string values of other features that suffice to license * categories with this feature. 
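* <p>In the lexicon XML this corresponds to the {@code also-licensed-by}
* attribute (a whitespace-separated list) of a {@code feat} element under
* {@code licensing-features}; the feature names and values below are hypothetical:
* <pre>{@code
* <licensing-features>
*   <feat attr="inv" val="+" license-marked-cats="true" also-licensed-by="wh"/>
* </licensing-features>
* }</pre>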
*/ public final List alsoLicensedBy; /** * A flag indicating whether semantically null categories with the * licensing feature need to be licensed. */ public final boolean licenseEmptyCats; /** * A flag indicating whether initial categories with the * licensing feature are marked and need to be licensed. */ public final boolean licenseMarkedCats; /** * A flag indicating whether semantically empty categories with the * licensing feature should be instantiated. */ public final boolean instantiate; /** * The location of the licensing feature on the category to be licensed. * The value must be one of TARGET_ONLY, ARGS_ONLY or BOTH. */ public final byte loc; /** * Location of the feature on the target category only. */ public static final byte TARGET_ONLY = 1; /** * Location of the feature on the argument categories only. */ public static final byte ARGS_ONLY = 2; /** * Location of the feature on either the target category * or the argument categories. */ public static final byte BOTH = 0; /** Constructor. */ public LicensingFeature( String attr, String val, List alsoLicensedBy, boolean licenseEmptyCats, boolean licenseMarkedCats, boolean instantiate, byte loc ) { this.attr = attr; this.val = val; List emptyList = Collections.emptyList(); this.alsoLicensedBy = (alsoLicensedBy != null) ? alsoLicensedBy : emptyList; this.licenseEmptyCats = licenseEmptyCats; this.licenseMarkedCats = licenseMarkedCats; this.instantiate = instantiate; this.loc = loc; } /** Default lex feature. */ public static final LicensingFeature defaultLexFeature = new LicensingFeature("lex", null, null, true, false, true, BOTH); /** Simple lex feature, for comparison purposes. */ public static final LicensingFeature simpleLexFeature = new LicensingFeature("lex", null, null, true, false, false, BOTH); } ================================================ FILE: src/opennlp/ccg/lexicon/ListPairWord.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import java.util.*; import opennlp.ccg.util.Pair; /** * A ListPairWord represents a word via a list of pairs of interned * attributes and values. It is intended to be a simple wrapper of * the list to make it act like a word, without requiring canonical * instances. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2010/01/17 04:49:24 $ */ public class ListPairWord extends Word { private static final long serialVersionUID = 1L; /** * The list of pairs of attributes and values. */ protected List> pairsList; /** Constructor. */ public ListPairWord(List> pairsList) { this.pairsList = pairsList; } /** Returns the surface form. 
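* <p>Sketch of constructing a ListPairWord (lookup compares attribute names
* with {@code ==}, so the interned Tokenizer constants should be used):
* <pre>{@code
* List<Pair<String,String>> pairs = new ArrayList<Pair<String,String>>();
* pairs.add(new Pair<String,String>(Tokenizer.WORD_ATTR, "dogs"));
* pairs.add(new Pair<String,String>(Tokenizer.STEM_ATTR, "dog"));
* Word w = new ListPairWord(pairs);
* String form = w.getForm(); // "dogs"
* }</pre>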
*/ public String getForm() { return getValFromInterned(Tokenizer.WORD_ATTR); } /** Returns the pitch accent. */ public String getPitchAccent() { return getValFromInterned(Tokenizer.PITCH_ACCENT_ATTR); } /** Returns the list of extra attribute-value pairs. */ protected List> getAttrValPairsList() { List> retval = null; for (Pair pair : pairsList) { if (!isKnownAttr(pair.a)) { if (retval == null) retval = new ArrayList>(5); retval.add(pair); } } return retval; } /** Returns the stem. */ public String getStem() { return getValFromInterned(Tokenizer.STEM_ATTR); } /** Returns the part of speech. */ public String getPOS() { return getValFromInterned(Tokenizer.POS_ATTR); } /** Returns the supertag. */ public String getSupertag() { return getValFromInterned(Tokenizer.SUPERTAG_ATTR); } /** Returns the semantic class. */ public String getSemClass() { return getValFromInterned(Tokenizer.SEM_CLASS_ATTR); } /** Returns the value of the attribute with the given name, or null if none. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used to retrieve the form, ..., semantic class. */ public String getVal(String attr) { String internedAttr = attr.intern(); // use == on interned attr return getValFromInterned(internedAttr); } /** Returns the value of the given interned attr, or null if none. */ protected String getValFromInterned(String attr) { for (Pair pair : pairsList) { if (pair.a == attr) return pair.b; } return null; } } ================================================ FILE: src/opennlp/ccg/lexicon/MacroAdder.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.synsem.*; import opennlp.ccg.util.*; import opennlp.ccg.unify.*; import opennlp.ccg.hylo.*; import java.util.*; /** * Adds the features from macros to a category. 
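* <p>Usage sketch, as in Lexicon.getWithEntriesItem (variable names hypothetical):
* <pre>{@code
* MacroAdder macAdder = getMacAdder(morphItem); // built from the morph item's macro names
* Category cat = entriesItem.getCat().copy();
* macAdder.addMacros(cat); // merge indexed feature structures, append LF macro preds
* }</pre>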
* * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.6 $, $Date: 2011/03/20 20:11:57 $ */ public class MacroAdder { private IntHashSetMap _specificMacros; private List _macroItems; // for LF macros public MacroAdder(IntHashSetMap sm, List macroItems) { _specificMacros = sm; _macroItems = macroItems; } public void addMacros(Category cat) { // add features cat.deepMap(addIndexedFeatures); // append preds to LF LF lf = cat.getLF(); for (int i=0; i < _macroItems.size(); i++) { MacroItem mi = _macroItems.get(i); LF[] preds = mi.getPreds(); for (int j=0; j < preds.length; j++) { LF pred = (LF) preds[j].copy(); if (!HyloHelper.isElementaryPredication(pred)) { System.out.println( "Warning: ignoring LF macro pred, which is not an elementary predication: " + pred ); continue; } lf = HyloHelper.append(lf, pred); } } // sort and reset LF HyloHelper.sort(lf); cat.setLF(lf); } private ModFcn addIndexedFeatures = new ModFcn() { @SuppressWarnings("rawtypes") public void modify(Mutable c) { if (c instanceof AtomCat) { FeatureStructure fs = ((AtomCat)c).getFeatureStructure(); int fsIndex = fs.getIndex(); Set featStrucs = (Set)_specificMacros.get(fsIndex); if (null == featStrucs) { return; } FeatureStructure $fs = fs.copy(); for (Iterator fsIt = featStrucs.iterator(); fsIt.hasNext();) { FeatureStructure macroFS = (FeatureStructure) fsIt.next(); $fs = $fs.inherit(macroFS); } ((AtomCat)c).setFeatureStructure($fs); } } }; } ================================================ FILE: src/opennlp/ccg/lexicon/MacroItem.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.HyloHelper; import org.jdom.*; import java.util.*; /** * Data structure for storing information about morphological macros. 
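* <p>A macro item is read from a {@code macro} element in the morph XML; a
* minimal sketch (the feature-structure attribute names here are assumptions):
* <pre>{@code
* <macro name="@pl">
*   <fs id="2" attr="num" val="pl"/>
* </macro>
* }</pre>
* The {@code name} (or {@code n}) attribute and the {@code fs} children are
* read directly by the constructor below; any preds under an {@code lf} child
* are appended to an entry's logical form when the macro is applied.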
* * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/12/21 03:27:18 $ */ public class MacroItem { private String name; private FeatureStructure[] featStrucs; private LF[] preds; public MacroItem() {}; @SuppressWarnings("unchecked") public MacroItem (Element e) { name = e.getAttributeValue("name"); if (name == null) { name = e.getAttributeValue("n"); } List<Element> fsEls = e.getChildren("fs"); featStrucs = new FeatureStructure[fsEls.size()]; for (int i=0; i < fsEls.size(); i++) { featStrucs[i] = new GFeatStruc(fsEls.get(i)); } Element lfElt = e.getChild("lf"); if (lfElt != null) { List<Element> predElts = lfElt.getChildren(); preds = new LF[predElts.size()]; for (int i=0; i < predElts.size(); i++) { preds[i] = HyloHelper.getLF(predElts.get(i)); } } } public void setName(String s) { name=s; } //public void setSpecs(ArrayList al) {specs = al; } public String getName() { return name; } public FeatureStructure[] getFeatureStructures() { return featStrucs; } public LF[] getPreds() { return preds; } //public void addSpec(String s) { specs.add(s); } //public void removeSpec(String s) { // specs.remove(specs.indexOf(s)); //} } ================================================ FILE: src/opennlp/ccg/lexicon/MorphItem.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import org.jdom.*; import opennlp.ccg.grammar.Grammar; /** * A data structure for morphological entries. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.11 $, $Date: 2009/10/17 20:46:20 $ */ public class MorphItem { private static final String[] emptyStringArray = new String[0]; private Word surfaceWord; private Word word; private Word coartIndexingWord = null; private String[] macros = emptyStringArray; private String[] excluded = emptyStringArray; private boolean coart = false; /** Constructor. */ public MorphItem() {}; /** Constructor from XML element.
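* <p>The recognized attributes are {@code word}, {@code stem}, {@code pos},
* {@code class}, {@code macros}, {@code excluded} and {@code coart};
* for example (values hypothetical):
* <pre>{@code
* <entry word="dogs" stem="dog" pos="NN" class="animal" macros="@pl"/>
* }</pre>
* If no stem is given, the surface form is used; the supertag is filled in
* later from the syntactic category.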
*/ public MorphItem(Element e) { String coartString = e.getAttributeValue("coart"); if ("true".equals(coartString)) coart = true; String wordString = e.getAttributeValue("word"); boolean strictFactors = coart; // parse with flag for strict factors with coart items Word tokenizedWord = Grammar.theGrammar.lexicon.tokenizer.parseToken(wordString, strictFactors); surfaceWord = Word.createSurfaceWord(tokenizedWord); String stem = e.getAttributeValue("stem"); if (stem == null) stem = surfaceWord.getForm(); String POS = e.getAttributeValue("pos"); String supertag = null; // supertag comes later from syn cat String semClass = e.getAttributeValue("class"); word = Word.createFullWord(surfaceWord, stem, POS, supertag, semClass); String macrosString = e.getAttributeValue("macros"); if (macrosString != null) { macros = macrosString.split("\\s+"); } String excludedString = e.getAttributeValue("excluded"); if (excludedString != null) { excluded = excludedString.split("\\s+"); } // index on first attr of coarts if (coart) { String indexAttr = wordString.substring(0, wordString.indexOf("-")); String indexVal = surfaceWord.getVal(indexAttr); coartIndexingWord = Word.createWord(indexAttr, indexVal); } } /** Returns whether the name, qualified name or family name of the given entries item is in the excluded list. */ public boolean excluded(EntriesItem eItem) { if (excluded.length == 0) return false; for (int i = 0; i < excluded.length; i++) { if (eItem.getName().equals(excluded[i])) return true; if (eItem.getQualifiedName().equals(excluded[i])) return true; if (eItem.getFamilyName().equals(excluded[i])) return true; } return false; } /** Returns the full word. */ public Word getWord() { return word; } /** Returns the surface word (without the stem, POS and semantic class). */ public Word getSurfaceWord() { return surfaceWord; } /** Returns the macro names. */ public String[] getMacros() { return macros; } /** Returns the names of the excluded entries. */ public String[] getExcluded() { return excluded; } /** Returns whether the morph item is a coarticulation, eg a pitch accent. */ public boolean isCoart() { return coart; } /** Returns the word for indexing this coarticulation (or null if not a coarticulation). */ public Word getCoartIndexingWord() { return coartIndexingWord; } /** Returns a string for this morph item. */ // nb: excluded not handled public String toString() { StringBuffer sb = new StringBuffer(); sb.append('['); for (int i=0; i < macros.length; i++) { sb.append(macros[i]); if (i < macros.length-1) sb.append(','); } sb.append(']'); return "{" + word + " => " + sb + "}"; } } ================================================ FILE: src/opennlp/ccg/lexicon/SimpleWord.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.util.*; import java.util.*; /** * A SimpleWord object is a surface word which stores just the word form. * SimpleWord serves as the base class for concrete instantiations of words. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/07/17 04:23:30 $ */ public class SimpleWord extends Word { private static final long serialVersionUID = 181491057498517717L; /** The surface form. */ protected String form; /** Returns the surface form. */ public String getForm() { return form; } /** Returns the pitch accent. */ public String getPitchAccent() { return null; } /** Returns the list of extra attribute-value pairs. */ protected List> getAttrValPairsList() { return null; } /** Returns the stem. */ public String getStem() { return null; } /** Returns the part of speech. */ public String getPOS() { return null; } /** Returns the supertag. */ public String getSupertag() { return null; } /** Returns the semantic class. */ public String getSemClass() { return null; } /** Returns the value of the attribute with the given name, or null if none. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used to retrieve the form, ..., semantic class. */ public String getVal(String attr) { String internedAttr = attr.intern(); // use == on interned attr if (internedAttr == Tokenizer.WORD_ATTR) return getForm(); if (internedAttr == Tokenizer.PITCH_ACCENT_ATTR) return getPitchAccent(); if (internedAttr == Tokenizer.STEM_ATTR) return getStem(); if (internedAttr == Tokenizer.POS_ATTR) return getPOS(); if (internedAttr == Tokenizer.SUPERTAG_ATTR) return getSupertag(); if (internedAttr == Tokenizer.SEM_CLASS_ATTR) return getSemClass(); List> pairs = getAttrValPairsList(); if (pairs == null) return null; for (int i = 0; i < pairs.size(); i++) { Pair p = pairs.get(i); if (p.a == internedAttr) return p.b; } return null; // not found } /** Constructor. */ protected SimpleWord(String form) { this.form = form; } } ================================================ FILE: src/opennlp/ccg/lexicon/SupertaggerAdapter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008-9 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import java.util.*; /** * The SupertaggerAdapter interface is for plugging a supertagger into the * lexicon in order to return only the desired, high probability categories * during lexical lookup. Once the supertagger has been plugged in, * using Lexicon.setSupertagger, the supertagger will be consulted during * each lexical lookup for the desired categories, using getSupertags. * Note that this entails that the supertagger must update its state between lexical * lookup calls; in this way, identical words in a sentence can have different * predicted categories. * * The supertagger returns beta-best categories for lexical lookup * according to a sequence of beta settings it maintains internally. * Associated probabilities for each supertag are also returned. * The 'include gold' option controls whether gold standard tags are included * during training. * * At present, the lexicon must contain appropriate morph items for all words. * However, the supertags assigned to a word need not be limited to those explicitly * listed in the lexicon. When there is an explicit entry, it will be used, * as doing so allows the specification of a 'pred' which differs from the stem, * as well as macros that can affect the supertag. Otherwise, when using a supertagger, * it is no longer necessary to list stems with categories in the lexicon, as the * supertagger becomes responsible for this mapping. * * Note also that at present, only one supertagger (for either parsing or realization) * may be plugged in to the lexicon at a time. * * @author Michael White * @version $Revision: 1.11 $, $Date: 2010/12/08 15:24:26 $ */ public interface SupertaggerAdapter { /** * Class for caching a lexical item's log prob in a sign. */ public static class LexLogProb { /** The log prob. */ public final float logprob; /** Constructor. */ public LexLogProb(float logprob) { this.logprob = logprob; } } /** * Returns the supertags of the desired categories for the current lexical lookup * as a map from supertags to contextual probabilities (or null to accept all). */ public Map getSupertags(); /** * Sets the flag for whether to include gold tags. */ public void setIncludeGold(boolean includeGold); /** * Resets beta to the most restrictive value. */ public void resetBeta(); /** * Resets beta to the least restrictive value. */ public void resetBetaToMax(); /** * Advances beta to the next most restrictive setting. */ public void nextBeta(); /** * Advances beta to the next less restrictive setting. */ public void previousBeta(); /** * Returns whether there are any less restrictive beta settings * remaining in the sequence. */ public boolean hasMoreBetas(); /** * Returns whether there are any more restrictive beta settings * remaining in the sequence. */ public boolean hasLessBetas(); /** Returns all the beta values. */ public double[] getBetas(); /** Sets the beta values. */ public void setBetas(double[] betas); /** Returns the current beta value. 
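* <p>Sketch of how a client typically steps through the beta settings
* (variable names hypothetical):
* <pre>{@code
* stagger.resetBeta();                        // start with the most restrictive setting
* while (true) {
*     // ... attempt lexical lookup / parsing with the current supertags ...
*     if (success || !stagger.hasMoreBetas()) break;
*     stagger.nextBeta();                     // back off to the next beta setting
* }
* double beta = stagger.getCurrentBetaValue();
* }</pre>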
*/ public double getCurrentBetaValue(); } ================================================ FILE: src/opennlp/ccg/lexicon/Tokenizer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import java.util.*; /** * The Tokenizer interface provides a way to customize tokenization * and handling of special tokens. * A custom tokenizer may be specified in the grammar file. * DefaultTokenizer provides a default implementation, which can also * be subclassed for custom behavior. * * @author Michael White * @version $Revision: 1.14 $, $Date: 2005/10/20 17:30:30 $ **/ public interface Tokenizer { /** Name used to tokenize word form attribute (usually left implicit). */ public static final String WORD_ATTR = "W"; /** Name used to tokenize stem attribute. */ public static final String STEM_ATTR = "S"; /** Name used to tokenize part-of-speech attribute. */ public static final String POS_ATTR = "P"; /** Name used to tokenize supertag attribute. */ public static final String SUPERTAG_ATTR = "T"; /** Name used to tokenize semantic class attribute. */ public static final String SEM_CLASS_ATTR = "C"; /** Name used to tokenize pitch accent attribute. */ public static final String PITCH_ACCENT_ATTR = "A"; /** Constant used to signal the substitution of a date as the pred. */ public static final String DATE_VAL = "[*DATE*]"; /** Constant used to represent the semantic class date. */ public static final String DATE_CLASS = "date"; /** Constant used to signal the substitution of a time as the pred. */ public static final String TIME_VAL = "[*TIME*]"; /** Constant used to represent the semantic class time. */ public static final String TIME_CLASS = "time"; /** Constant used to signal the substitution of a number as the pred. */ public static final String NUM_VAL = "[*NUM*]"; /** Constant used to represent the semantic class number. */ public static final String NUM_CLASS = "num"; /** Constant used to signal the substitution of an amount as the pred. */ public static final String AMT_VAL = "[*AMT*]"; /** Constant used to represent the semantic class amount. */ public static final String AMT_CLASS = "amt"; /** Constant used to signal the substitution of a duration as the pred. */ public static final String DUR_VAL = "[*DUR*]"; /** Constant used to represent the semantic class duration. */ public static final String DUR_CLASS = "dur"; /** * Constant used to signal the substitution of a named entity * (not listed in lexicon) as the pred. 
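* <p>Sketch of the special-token round trip used during lexical lookup
* (a hypothetical {@code tokenizer} instance; whether a given string is
* recognized depends on the implementation):
* <pre>{@code
* String cls = tokenizer.isSpecialToken("1/2/2004");        // e.g. Tokenizer.DATE_CLASS, or null
* String constant = tokenizer.getSpecialTokenConstant(cls); // e.g. Tokenizer.DATE_VAL, or null
* }</pre>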
*/ public static final String NE_VAL = "[*NE*]"; /** Constant used to represent the semantic class (other) named entity. */ public static final String NE_CLASS = "ne"; /** * Parses an input string into a list of words, * including any explicitly given factors, * and the semantic class of special tokens. * Tokens are parsed into words using parseToken. */ public List tokenize(String s); /** * Parses an input string into a list of words, * including any explicitly given factors, * and the semantic class of special tokens. * Tokens are parsed into words using parseToken, according to the given * flag for whether to parse factors strictly. */ public List tokenize(String s, boolean strictFactors); /** * Parses a token into a word, including any explicitly given factors * and the semantic class of special tokens. */ public Word parseToken(String token); /** * Parses a token into a word, including any explicitly given factors * and the semantic class of special tokens, according to the given * flag for whether to parse factors strictly. */ public Word parseToken(String token, boolean strictFactors); /** * Returns a string (eg Tokenizer.DATE_CLASS) indicating the semantic class * of special token, if the given token is recognized as a special * token; otherwise returns null. */ public String isSpecialToken(String token); /** * Returns the special token constant for the given special token class, * or null if none. */ public String getSpecialTokenConstant(String semClass); /** * Returns true iff the given string is a special token constant * (eg Tokenizer.DATE_VAL). */ public boolean isSpecialTokenConstant(String s); /** Returns true iff the token is recognized as a date. */ public boolean isDate(String token); /** Returns true iff the token is recognized as a time. */ public boolean isTime(String token); /** Returns true iff the token is recognized as a number. */ public boolean isNum(String token); /** Returns true iff the token is recognized as an amount. */ public boolean isAmt(String token); /** * Returns true iff the token is recognized as a named entity (not listed in lexicon). */ public boolean isNamedEntity(String token); /** * Adds a semantic class to replace words with for language models. */ public void addReplacementSemClass(String semClass); /** * Returns whether the given semantic class is one to replace words with for language models. * The sem class is assumed to have been interned. */ public boolean isReplacementSemClass(String semClass); /** * Returns a string for the given list of words. */ public String getOrthography(List words); /** * Returns a string for the given list of words, optionally with semantic class replacement. */ public String getOrthography(List words, boolean semClassReplacement); /** * Returns a string for the given word, optionally with semantic class replacement. */ public String getOrthography(Word w, boolean semClassReplacement); /** * Returns a string for the given list of words, * in the format expected by the SRILM tool for factored language models. */ public String format(List words); /** * Returns a string for the given word, * in the format expected by the SRILM tool for factored language models. */ public String format(Word word); /** * Returns a string for the given list of words, * in the format expected by the SRILM tool for factored language models, * optionally with semantic class replacement. 
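* <p>Sketch with a hypothetical {@code tokenizer} instance (the exact factored
* output depends on the implementation):
* <pre>{@code
* List words = tokenizer.tokenize("he walks");
* String flmLine = tokenizer.format(words, true); // with sem class replacement
* }</pre>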
*/ public String format(List words, boolean semClassReplacement); /** * Returns a string for the given word, * in the format expected by the SRILM tool for factored language models, * optionally with semantic class replacement. */ public String format(Word word, boolean semClassReplacement); /** * Returns one or more orthographic words for the given word. * This method is called from within Sign.getWordsInXml as * part of producing the textual output of realization. */ public List expandWord(Word word); /** * Returns one or more orthographic words for the given date token. */ public List expandDate(String date); /** * Returns one or more orthographic words for the given time token. */ public List expandTime(String time); /** * Returns one or more orthographic words for the given number token. */ public List expandNum(String num); /** * Returns one or more orthographic words for the given amount token. */ public List expandAmt(String amt); /** * Returns one or more orthographic words for the given named entity token. */ public List expandNamedEntity(String namedEntity); } ================================================ FILE: src/opennlp/ccg/lexicon/TrueCaser.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; /** * A simple class that is constructed with a file containing a list of words * that are all and only those found (presumably from a large corpus) to be * more frequently upper-cased. Any word-string passed to it (that is also * not a named entity string) is queried in this list. If the word is there * it is restored to the casing found in the list, if it is not, it is converted * to lower-case. * * There is an option, skipAlreadyLower, to skip any word that is already in lower case, * so as to avoid mistakenly uppercasing words. This option is enabled by default. * * @author Dennis N. Mehay * */ import java.util.*; import java.io.*; import java.nio.charset.Charset; import opennlp.ccg.lexicon.Word; import opennlp.ccg.lexicon.DefaultTokenizer; import java.util.zip.GZIPInputStream; public class TrueCaser { /** Flag for skipping words already in lower case (enabled by default). */ public boolean skipAlreadyLower = true; /** * A map from lower-cased keys to the true-cased forms (from the list passed in during construction). */ private Map caseMap = new HashMap(); /** * For the heuristics that determine whether something is in title case: what percentage of the sentence must be * cased to make the title-case detector go off? 
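 *
 * <p>Illustrative usage sketch, with a hypothetical path to a gzipped list of
 * true-cased words and an explicit title-case threshold:
 * <pre>{@code
 * TrueCaser tc = new TrueCaser("models/truecase-words.txt.gz", 0.5);
 * String restored = tc.trueCaseSentence("The Treaty Was Signed In London .");
 * }</pre>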
*/ private double titleCaseThreshold; /** * Constructor that just takes a path to a list of words that are only upper-case (title-case is default = 0.9). */ public TrueCaser(String pathToOnlyUCWords) { this(pathToOnlyUCWords, 0.5); } /** * Constructor that takes a path to a list of words that are only upper-case and a title-case threshold. */ public TrueCaser(String pathToOnlyUCWords, double titleCaseThreshold) { this.titleCaseThreshold = titleCaseThreshold; this.caseMap = TrueCaser.readInCaseMap(pathToOnlyUCWords); if (this.caseMap == null) { System.err.println("Something went wrong."+System.getProperty("line.separator")+ "Make sure you passed in a file of true-cased words, etc."); System.exit(-1); } else { //System.err.println("Reading in list of true-cased words: "+pathToOnlyUCWords+System.getProperty("line.separator")+ // " at a title-case heursitic detection threshold of: "+titleCaseThreshold); } } /** * Static method to read in the list of words. */ public static Map readInCaseMap(String pathToOnlyUCWords) { Map caseMap = new HashMap(); BufferedReader inRead = null; try { Charset utf8 = Charset.availableCharsets().get("UTF-8"); inRead = new BufferedReader( (pathToOnlyUCWords.toLowerCase().endsWith(".gz")) ? (new InputStreamReader(new GZIPInputStream(new FileInputStream(new File(pathToOnlyUCWords))), utf8)) : (new InputStreamReader(new FileInputStream(new File(pathToOnlyUCWords)), utf8)) ); String ln = inRead.readLine(); while(ln != null) { ln = ln.trim(); if(!ln.startsWith("#")) { caseMap.put(ln.toLowerCase(), ln); } ln = inRead.readLine(); } } catch (IOException ioe) { return null; } finally { try { inRead.close(); } catch (Exception e) { // do nothing. } } return caseMap; } /** * Truecase a candidate word. If the word is in the list of more commonly * cased words, then return this cased form. If not, normalize to lowercase if * this is the first word or the sentence is in titlecase. Else return the word as-is. * If skipAlreadyLower is enabled, skip the word if it is already in lower case. */ public String trueCase(String theWord, boolean isTitle, boolean isFirstWord) { String loweredWord = theWord.toLowerCase(); // skip word in lower case per flag if (skipAlreadyLower && theWord.equals(loweredWord)) return theWord; // look up the truecased version; if not there, and title-case or first // word in sentence, lowercase it, otherwise, don't touch it. String trueCasedTheWord = caseMap.get(loweredWord); if (trueCasedTheWord != null) { return trueCasedTheWord; } else { return (isTitle || isFirstWord) ? loweredWord : theWord; } } /** * Truecase a whole sentence. If the sentence appears to be in title-case (as determined by a heuristic * that is triggered by there being greater than 0.X of the first 10 words, if there are that many, * being cased) normalize any word that is not in the true-case list to lower-case. Otherwise, leave all * words that are not in the true-case list alone, except the first word (which is normalized to lower-case) * The title-case threshold is a creation-time parameter. * As always, if skipAlreadyLower is enabled, the word is skipped if it is already in lower case. Note that * the second word is counted as the first word if the first token is a left quote (single or double). */ public String trueCaseSentence(String sentence) { String[] parts = sentence.split("\\s+"); StringBuffer res = new StringBuffer(parts.length); boolean isTitle = isTitleCased(parts); // truecase the whole sentence (only normalizing by lowercasing if // titlecase detector went off). 
int i = 0; for (String prt : parts) { boolean isFirstWord = (i == 0 || (i == 1 && (parts[0].equals("``") || parts[0].equals("`")))); res.append(" " + trueCase(prt, isTitle, isFirstWord)); i++; } return res.toString().trim(); } /** * Returns true iff the percentage of the first 10 words (or the whole sentence if it's less than 10 words) * that have a case distinction is greater than or equal to 'titleCaseThreshold'. */ public boolean isTitleCased(String[] words) { int numCased = 0, cursor = 0; for (String wd : words) { if (cursor >= 10) { break; } if (!wd.toLowerCase().equals(wd)) { numCased++; } cursor++; } return (numCased / ((words.length < 10) ? (words.length + 0.0) : 10.0) >= titleCaseThreshold); } public String tcWordToString(String newWordForm, Word oldWord) { StringBuffer sb = new StringBuffer(); sb.append(newWordForm); if (oldWord.getPOS() != null) sb.append(":P-").append(DefaultTokenizer.escape(oldWord.getPOS())); if (oldWord.getSupertag() != null) sb.append(":T-").append(DefaultTokenizer.escape(oldWord.getSupertag())); if (oldWord.getSemClass() != null) sb.append(":C-").append(DefaultTokenizer.escape(oldWord.getSemClass())); if (sb.length() == 0) sb.append((String) null); return sb.toString(); } public static void main(String[] args) throws IOException { String newline = System.getProperty("line.separator"); String usage = newline + "java TrueCaser -t -r -i -o " + newline; if(args.length == 0 || args[0] .equals("h") || args[0] .equals("-h") || args[0] .equals("--h") || args[0] .equals("--help") || args[0] .equals("-help")) { System.err.println(usage); System.exit(0); } String truecasefile = null, inputfile = null, outputfile = null; double tcThresh = 0.5; for (int a = 0; a < args.length; a++) { if (args[a].equals("-t")) { truecasefile = args[++a]; continue; } if (args[a].equals("-r")) { tcThresh = Double.parseDouble(args[++a]); continue; } if (args[a].equals("-i")) { inputfile = args[++a]; continue; } if (args[a].equals("-o")) { outputfile = args[++a]; continue; } System.err.println("unrecognized option " + args[a] + "."); System.err.println(usage); System.exit(0); } Charset utf8 = Charset.availableCharsets().get("UTF-8"); // input of text (assumed to be tokenized utf-8-encoded text). BufferedReader in = new BufferedReader(new InputStreamReader( ((inputfile == null) ? System.in : (new FileInputStream( new File(inputfile)))), utf8)); // output stream (back to tokenized utf-8-encoded text). BufferedWriter out = new BufferedWriter(new OutputStreamWriter( ((outputfile == null) ? System.out : (new FileOutputStream( new File(outputfile)))), utf8)); // file of true-cased words is arg0. TrueCaser tc = new TrueCaser(truecasefile, tcThresh); String sent = in.readLine(); while (sent != null) { sent = tc.trueCaseSentence(sent.trim()); out.write(sent + System.getProperty("line.separator")); sent = in.readLine(); } out.close(); in.close(); } } ================================================ FILE: src/opennlp/ccg/lexicon/Word.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; import opennlp.ccg.util.*; import java.io.*; import java.util.*; import gnu.trove.*; /** * A Word object may either be a surface word or a full word. * A surface word holds a surface form, an optional pitch accent, and an * optional list of extra attribute-value pairs. * A full word additionally contains a stem, part of speech, supertag and semantic class. * A word may be a multiword consisting of multiple orthographic words, * typically separated by underscores in the surface form. * For efficient storage and equality checking, Word objects are interned by * the factory methods of the configured WordFactory. * * @author Michael White * @version $Revision: 1.21 $, $Date: 2009/12/16 22:39:20 $ */ abstract public class Word implements Serializable, Comparable { private static final long serialVersionUID = 1L; /** Returns the surface form. */ abstract public String getForm(); /** Returns the pitch accent. */ abstract public String getPitchAccent(); // empty iterator private static Iterator> emptyIterator = new ArrayList>(0).iterator(); /** Returns an iterator over the extra attribute-value pairs. */ public Iterator> getAttrValPairs() { List> pairs = getAttrValPairsList(); return (pairs != null) ? pairs.iterator() : emptyIterator; } /** Returns an iterator over the surface attribute-value pairs, including the pitch accent (if any). */ public Iterator> getSurfaceAttrValPairs() { List> pairs = getAttrValPairsList(); String pitchAccent = getPitchAccent(); if (pairs == null && pitchAccent == null) return emptyIterator; else if (pairs == null) { List> retval = new ArrayList>(1); retval.add(new Pair(Tokenizer.PITCH_ACCENT_ATTR, pitchAccent)); return retval.iterator(); } else if (pitchAccent == null) return pairs.iterator(); else { List> retval = new ArrayList>(pairs); retval.add(new Pair(Tokenizer.PITCH_ACCENT_ATTR, pitchAccent)); return retval.iterator(); } } /** Returns the list of extra attribute-value pairs. */ abstract protected List> getAttrValPairsList(); /** Returns the stem. */ abstract public String getStem(); /** Returns the part of speech. */ abstract public String getPOS(); /** Returns the supertag. */ abstract public String getSupertag(); /** Returns the semantic class. */ abstract public String getSemClass(); /** Returns the value of the attribute with the given name, or null if none. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used to retrieve the form, ..., semantic class. 
*/ abstract public String getVal(String attr); // the known attr names private static Set knownAttrs = initKnownAttrs(); @SuppressWarnings("unchecked") private static Set initKnownAttrs() { Set knownAttrs = new THashSet(new TObjectIdentityHashingStrategy()); String[] names = { Tokenizer.WORD_ATTR, Tokenizer.PITCH_ACCENT_ATTR, Tokenizer.STEM_ATTR, Tokenizer.POS_ATTR, Tokenizer.SUPERTAG_ATTR, Tokenizer.SEM_CLASS_ATTR }; for (int i = 0; i < names.length; i++) { knownAttrs.add(names[i]); } return knownAttrs; } /** Returns whether the given attr is a known one (vs an extra one). */ public static boolean isKnownAttr(String attr) { return knownAttrs.contains(attr.intern()); } /** Returns true if the form is non-null, while the stem, part of speech, supertag and semantic class are null. */ public boolean isSurfaceWord() { return getForm() != null && getStem() == null && getPOS() == null && getSupertag() == null && getSemClass() == null; } // factory methods /** Factory interface. */ public interface WordFactory { /** Creates a surface word with the given interned form. */ public Word create(String form); /** Creates a (surface or full) word with the given normalized attribute name and value. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used for the form, ..., semantic class. */ public Word create(String attr, String val); /** Creates a (surface or full) word from the given canonical factors. */ public Word create( String form, String pitchAccent, List> attrValPairs, String stem, String POS, String supertag, String semClass ); } /** The word factory to use. */ protected static WordFactory wordFactory = new FullWord.Factory(); // NB: could try different factory methods for concrete words, but // it's unclear whether it makes much difference // protected static WordFactory wordFactory = new FactorChainWord.Factory(); /** Creates a surface word with the given form. */ public static synchronized Word createWord(String form) { form = (form != null) ? form.intern() : null; return wordFactory.create(form); } /** Creates a (surface or full) word. */ public static synchronized Word createWord( String form, String pitchAccent, List> attrValPairs, String stem, String POS, String supertag, String semClass ) { // normalize factors form = (form != null) ? form.intern() : null; pitchAccent = (pitchAccent != null) ? pitchAccent.intern() : null; if (attrValPairs != null) { if (attrValPairs.isEmpty()) attrValPairs = null; else { attrValPairs = new ArrayList>(attrValPairs); sortAttrValPairs(attrValPairs); for (int i = 0; i < attrValPairs.size(); i++) { Pair p = attrValPairs.get(i); String attr = p.a.intern(); String val = (p.b != null) ? p.b.intern() : null; attrValPairs.set(i, new Pair(attr, val)); } } } stem = (stem != null) ? stem.intern() : null; POS = (POS != null) ? POS.intern() : null; supertag = (supertag != null) ? supertag.intern() : null; semClass = (semClass != null) ? semClass.intern() : null; // create word return createWordDirectly(form, pitchAccent, attrValPairs, stem, POS, supertag, semClass); } // comparator for attr-val pairs private static Comparator> attrValComparator = new Comparator>() { public int compare(Pair p1, Pair p2) { return p1.a.compareTo(p2.a); } }; /** Sorts attr-val pairs by attr name. */ private static void sortAttrValPairs(List> pairs) { Collections.sort(pairs, attrValComparator); } /** Creates a (surface or full) word directly, from the given canonical factors. 
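 *
 * <p>Illustrative usage sketch for the public factory methods, mirroring the
 * serialization test in {@code main} below:
 * <pre>{@code
 * Word surface = Word.createWord("ran");
 * Word full = Word.createFullWord(surface, "run", "VBD", "s\\np", "MOTION");
 * // factories intern words, so re-creation with the same factors is expected
 * // to yield the identical object
 * boolean sameObject = (full == Word.createFullWord(surface, "run", "VBD", "s\\np", "MOTION"));
 * }</pre>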
*/ private static synchronized Word createWordDirectly( String form, String pitchAccent, List> attrValPairs, String stem, String POS, String supertag, String semClass ) { return wordFactory.create(form, pitchAccent, attrValPairs, stem, POS, supertag, semClass); } /** Creates a (surface or full) word with the given attribute name and value. The attribute names Tokenizer.WORD_ATTR, ..., Tokenizer.SEM_CLASS_ATTR may be used for the form, ..., semantic class. */ public static synchronized Word createWord(String attr, String val) { attr = attr.intern(); val = (val != null) ? val.intern() : null; return wordFactory.create(attr, val); } /** Creates a (surface or full) word from the given one, replacing the word form with the given one. */ public static synchronized Word createWord(Word word, String form) { if (form != null) form = form.intern(); return createWordDirectly( form, word.getPitchAccent(), word.getAttrValPairsList(), word.getStem(), word.getPOS(), word.getSupertag(), word.getSemClass() ); } /** Creates a (surface or full) word from the given one, replacing the form and stem with the semantic class, uppercased. */ public static synchronized Word createWordUsingSemClass(Word word) { String form = word.getSemClass().toUpperCase().intern(); String stem = form; return createWordDirectly( form, word.getPitchAccent(), word.getAttrValPairsList(), stem, word.getPOS(), word.getSupertag(), word.getSemClass() ); } /** Creates a (surface or full) word from the given surface one, adding the second word's additional attr-val pairs. */ public static synchronized Word createWordWithAttrs(Word word, Word word2) { // get accent String accent = word.getPitchAccent(); if (accent == null) accent = word2.getPitchAccent(); // get attrs boolean mixedAttrs = false; List> pairs = word.getAttrValPairsList(); List> pairs2 = word2.getAttrValPairsList(); if (pairs == null && pairs2 != null) { pairs = pairs2; } else if (pairs2 != null) { mixedAttrs = true; pairs = new ArrayList>(pairs); for (int i = 0; i < pairs2.size(); i++) { if (!pairs.contains(pairs2.get(i))) { pairs.add(pairs2.get(i)); } } } // get rest String form = word.getForm(); String stem = word.getStem(); String POS = word.getPOS(); String supertag = word.getSupertag(); String semClass = word.getSemClass(); // with mixed attrs, need to normalize if (mixedAttrs) return createWord(form, accent, pairs, stem, POS, supertag, semClass); else return createWordDirectly(form, accent, pairs, stem, POS, supertag, semClass); } /** Creates a full word from the given surface one, adding the given stem, POS and semantic class. */ public static synchronized Word createFullWord(Word word, String stem, String POS, String supertag, String semClass) { stem = (stem != null) ? stem.intern() : null; POS = (POS != null) ? POS.intern() : null; supertag = (supertag != null) ? supertag.intern() : null; semClass = (semClass != null) ? semClass.intern() : null; return createWordDirectly(word.getForm(), word.getPitchAccent(), word.getAttrValPairsList(), stem, POS, supertag, semClass); } /** Creates a full word from the given surface one, adding the second (full) given word's stem, POS and semantic class, as well as the second word's additional attr-val pairs, plus the given supertag. 
*/ public static synchronized Word createFullWord(Word word, Word word2, String supertag) { boolean mixedAttrs = false; List> pairs = word.getAttrValPairsList(); List> pairs2 = word2.getAttrValPairsList(); if (pairs == null && pairs2 != null) { pairs = pairs2; } else if (pairs2 != null) { mixedAttrs = true; pairs = new ArrayList>(pairs); for (int i = 0; i < pairs2.size(); i++) { if (!pairs.contains(pairs2.get(i))) { pairs.add(pairs2.get(i)); } } } if (mixedAttrs) { return createWord( word.getForm(), word.getPitchAccent(), pairs, word2.getStem(), word2.getPOS(), supertag, word2.getSemClass() ); } else { supertag = (supertag != null) ? supertag.intern() : null; return createWordDirectly( word.getForm(), word.getPitchAccent(), pairs, word2.getStem(), word2.getPOS(), supertag, word2.getSemClass() ); } } /** Creates a surface word from the given one, removing the stem, POS, supertag and semantic class. */ public static synchronized Word createSurfaceWord(Word word) { return createWordDirectly(word.getForm(), word.getPitchAccent(), word.getAttrValPairsList(), null, null, null, null); } /** Creates a surface word from the given one, removing the stem, POS, supertag and semantic class, and replacing the form with the given one. */ public static synchronized Word createSurfaceWord(Word word, String form) { form = (form != null) ? form.intern() : null; return createWordDirectly(form, word.getPitchAccent(), word.getAttrValPairsList(), null, null, null, null); } /** Creates a surface word from the given one, removing the stem, POS, supertag and semantic class, and replacing the form with the semantic class, uppercased. */ public static synchronized Word createSurfaceWordUsingSemClass(Word word) { String form = word.getSemClass().toUpperCase().intern(); return createWordDirectly(form, word.getPitchAccent(), word.getAttrValPairsList(), null, null, null, null); } /** Creates a core surface word from the given one, removing all attrs in the given set. */ public static synchronized Word createCoreSurfaceWord(Word word, Set attrsSet) { String form = word.getForm(); String accent = word.getPitchAccent(); if (accent != null && attrsSet.contains(Tokenizer.PITCH_ACCENT_ATTR)) accent = null; List> pairs = word.getAttrValPairsList(); if (pairs != null) { pairs = new ArrayList>(pairs); Iterator> pairsIt = pairs.iterator(); while (pairsIt.hasNext()) { Pair pair = pairsIt.next(); if (attrsSet.contains(pair.a)) { pairsIt.remove(); } } return createWord(form, accent, pairs, null, null, null, null); } else { return createWordDirectly(form, accent, null, null, null, null, null); } } /** Returns a hash code for this word. */ public int hashCode() { int hc = System.identityHashCode(getForm()); hc = 31*hc + System.identityHashCode(getPitchAccent()); for (Iterator> it = getAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); hc = 31*hc + System.identityHashCode(p.a); hc = 31*hc + System.identityHashCode(p.b); } hc = 31*hc + System.identityHashCode(getStem()); hc = 31*hc + System.identityHashCode(getPOS()); hc = 31*hc + System.identityHashCode(getSupertag()); hc = 31*hc + System.identityHashCode(getSemClass()); return hc; } /** Returns whether this word equals the given object. 
*/ public boolean equals(Object obj) { if (this == obj) return true; // nb: can use ==, since constructor interns all factors if (!(obj instanceof Word)) return false; Word word = (Word) obj; boolean sameFields = getForm() == word.getForm() && getPitchAccent() == word.getPitchAccent() && getStem() == word.getStem() && getPOS() == word.getPOS() && getSupertag() == word.getSupertag() && getSemClass() == word.getSemClass(); if (!sameFields) return false; List> pairs = getAttrValPairsList(); List> wordPairs = word.getAttrValPairsList(); if (pairs == null && wordPairs == null) return true; if (pairs == null || wordPairs == null) return false; if (pairs.size() != wordPairs.size()) return false; for (int i = 0; i < pairs.size(); i++) { if (!pairs.get(i).equals(wordPairs.get(i))) return false; } return true; } /** Returns an int representing lexicographic sorting. */ public int compareTo(Word word) { if (this == word) return 0; int cmp = 0; cmp = compare(getForm(), word.getForm()); if (cmp != 0) return cmp; cmp = compare(getPitchAccent(), word.getPitchAccent()); if (cmp != 0) return cmp; cmp = compare(getStem(), word.getStem()); if (cmp != 0) return cmp; cmp = compare(getPOS(), word.getPOS()); if (cmp != 0) return cmp; cmp = compare(getSupertag(), word.getSupertag()); if (cmp != 0) return cmp; cmp = compare(getSemClass(), word.getSemClass()); if (cmp != 0) return cmp; List> pairs = getAttrValPairsList(); List> wordPairs = word.getAttrValPairsList(); if (pairs == null && wordPairs == null) return 0; if (pairs == null) return -1; if (wordPairs == null) return 1; if (pairs.size() < wordPairs.size()) return -1; if (pairs.size() > wordPairs.size()) return 1; for (int i = 0; i < pairs.size(); i++) { Pair p = pairs.get(i); Pair wp = wordPairs.get(i); cmp = p.a.compareTo(wp.a); if (cmp != 0) return cmp; cmp = p.b.compareTo(wp.b); if (cmp != 0) return cmp; } return 0; } // compares strings, accounting for nulls private int compare(String s1, String s2) { if (s1 == null && s2 == null) return 0; if (s1 == null) return -1; if (s2 == null) return 1; return s1.compareTo(s2); } /** Returns whether this word's surface attributes intersect with the given ones. */ public boolean attrsIntersect(Set attrsSet) { if (getPitchAccent() != null && attrsSet.contains(Tokenizer.PITCH_ACCENT_ATTR)) return true; for (Iterator> it = getAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); if (attrsSet.contains(p.a)) return true; } return false; } /** Returns a hash code for this word's restriction to a surface word. */ public int surfaceWordHashCode() { int hc = System.identityHashCode(getForm()); hc = 31*hc + System.identityHashCode(getPitchAccent()); for (Iterator> it = getAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); hc = 31*hc + System.identityHashCode(p.a); hc = 31*hc + System.identityHashCode(p.b); } return hc; } /** Returns whether this word and the given object have equal restrictions to surface words. 
*/ public boolean surfaceWordEquals(Object obj) { if (this == obj) return true; // nb: can use ==, since constructor interns all factors if (!(obj instanceof Word)) return false; Word word = (Word) obj; boolean sameFields = getForm() == word.getForm() && getPitchAccent() == word.getPitchAccent(); if (!sameFields) return false; List> pairs = getAttrValPairsList(); List> wordPairs = word.getAttrValPairsList(); if (pairs == null && wordPairs == null) return true; if (pairs == null || wordPairs == null) return false; if (pairs.size() != wordPairs.size()) return false; for (int i = 0; i < pairs.size(); i++) { if (!pairs.get(i).equals(wordPairs.get(i))) return false; } return true; } /** Returns canonical version of deserialized word. */ public Object readResolve() throws ObjectStreamException { return createWord(getForm(), getPitchAccent(), getAttrValPairsList(), getStem(), getPOS(), getSupertag(), getSemClass()); } /** Shows non-trivial fields separated by underscores. */ public String toString() { StringBuffer sb = new StringBuffer(); if (getForm() != null) sb.append(getForm()); if (getPitchAccent() != null) sb.append('_').append(getPitchAccent()); for (Iterator> it = getAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); sb.append('_').append(p.b); } if (getStem() != null && getStem() != getForm()) sb.append('_').append(getStem()); if (getPOS() != null) sb.append('_').append(getPOS()); if (getSupertag() != null) sb.append('_').append(getSupertag()); if (getSemClass() != null) sb.append('_').append(getSemClass()); if (sb.length() == 0) sb.append((String)null); return sb.toString(); } /** Tests serialization. */ public static void main(String[] argv) throws IOException, ClassNotFoundException { // create words Word w = createWord("ran"); Word fw = createFullWord(w, "run", "VBD", "s\\np", "MOTION"); Word wb = createWordWithAttrs(w, createWord("B", "L")); // write to tmp.out String filename = "tmp.ser"; ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(filename)); System.out.println("Writing w: " + w); out.writeObject(w); System.out.println("Writing fw: " + fw); out.writeObject(fw); System.out.println("Writing wb: " + wb); out.writeObject(wb); out.close(); // read from tmp.out ObjectInputStream in = new ObjectInputStream(new FileInputStream(filename)); System.out.print("Reading w2: "); Word w2 = (Word) in.readObject(); System.out.println(w2); System.out.print("Reading fw2: "); Word fw2 = (Word) in.readObject(); System.out.println(fw2); System.out.print("Reading wb2: "); Word wb2 = (Word) in.readObject(); System.out.println(wb2); in.close(); // test identity (and thus readResolve) System.out.println("w == w2?: " + (w == w2)); System.out.println("fw == fw2?: " + (fw == fw2)); System.out.println("wb == wb2?: " + (wb == wb2)); } } ================================================ FILE: src/opennlp/ccg/lexicon/WordWithPitchAccent.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.lexicon; /** * A WordWithPitchAccent object is a surface word with an optional pitch accent * but no further attributes. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/07/17 04:23:30 $ */ public class WordWithPitchAccent extends SimpleWord { private static final long serialVersionUID = 1510997962756436949L; /** The pitch accent. */ protected String pitchAccent; /** Returns the pitch accent. */ public String getPitchAccent() { return pitchAccent; } /** Constructor. */ protected WordWithPitchAccent(String form, String pitchAccent) { super(form); this.pitchAccent = pitchAccent; } } ================================================ FILE: src/opennlp/ccg/ngrams/AAnFilter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.lexicon.*; import opennlp.ccg.util.*; import java.util.*; import java.io.*; import gnu.trove.*; /** * AN n-gram filter that detects "a" followed by a word beginning with a vowel, * or "an" followed by a word beginning with a consonant. * Note that this filter only makes an approximate check, which may be augmented * with a set of exceptions. * Exceptions may be culled from a file of bigrams using cullAAnExceptions, * which may be accessed from the command line using the -c option. * An appropriate bigrams file can be produced using the SRILM ngram-count tool, * with the -text and -write2 options. * * @author Michael White * @version $Revision: 1.8 $, $Date: 2011/03/20 20:11:58 $ */ public class AAnFilter implements NgramFilter, Reversible { // exceptions private Set> exceptions = null; /** Constructor. */ public AAnFilter() {} /** Constructor that loads a/an exceptions from the given infile of bigrams. */ public AAnFilter(String infile) throws IOException { loadAAnExceptions(infile); } /** Flag for whether to reverse words before filtering. */ protected boolean reverse = false; /** Get reverse flag. */ public boolean getReverse() { return reverse; } /** Set reverse flag. */ public void setReverse(boolean reverse) { this.reverse = reverse; } /** Returns whether to filter out the given word sequence. 
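 *
 * <p>Illustrative usage sketch, as in the {@code main} test below (the exceptions
 * file is optional and its name here is hypothetical):
 * <pre>{@code
 * AAnFilter filter = new AAnFilter();  // or new AAnFilter("aan-exceptions.txt")
 * List<Word> words = new DefaultTokenizer().tokenize("she ate a apple");
 * boolean bad = filter.filterOut(words);  // true: "a" precedes a vowel-initial word
 * }</pre>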
*/ public boolean filterOut(List words) { for (int i = 0; i < words.size()-1; i++) { Word w1 = words.get(i); Word w2 = words.get(i+1); if (filterOut(w1, w2)) return true; } return false; } /** Returns whether to filter out the given word bigram (reversed if apropos). */ public boolean filterOut(Word w1, Word w2) { if (reverse) { Word tmp = w1; w1 = w2; w2 = tmp; } String f1 = w1.getForm(); if (f1 != "a" && f1 != "an") return false; String f1Alt = (f1 == "a") ? "an" : "a"; String f2 = w2.getForm(); boolean defaultRetval = filterOutByDefault(f1, f2); if (isException(f1, f2) || isException(f1Alt, f2)) return !defaultRetval; else return defaultRetval; } // returns whether to filter out the bigram by default private static boolean filterOutByDefault(String w1, String w2) { boolean w2StartsWithVowel = startsWithVowel(w2); return (w1 == "a" && w2StartsWithVowel) || (w1 == "an" && !w2StartsWithVowel); } // initial vowel private static boolean startsWithVowel(String word) { char c = word.charAt(0); return (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') || (c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'); } // words for a/an private static final Word A_WORD = Word.createWord("a"); private static final Word AN_WORD = Word.createWord("an"); // reusable list for lookup private List keyList = new ArrayListWithIdentityEquals(2); // looks up whether the bigram is an exception private boolean isException(String w1, String w2) { if (exceptions == null) return false; keyList.clear(); keyList.add((w1 == "a") ? A_WORD : AN_WORD); keyList.add(Word.createWord(w2)); return exceptions.contains(keyList); } // singletons for a/an @SuppressWarnings("unchecked") private static final List A_SINGLETON = (List) Interner.globalIntern(new SingletonList(A_WORD)); @SuppressWarnings("unchecked") private static final List AN_SINGLETON = (List) Interner.globalIntern(new SingletonList(AN_WORD)); /** Adds an a/an bigram as an exception. */ @SuppressWarnings("unchecked") public void addException(String w1, String w2) { // make sure w1 is a/an w1 = w1.intern(); if (w1 != "a" && w1 != "an") { System.err.println("Warning: ignoring exception not starting with a/an: " + w1 + " " + w2); return; } // ensure exceptions initialized if (exceptions == null) exceptions = new THashSet(); // intern and add bigram List w1Singleton = (w1 == "a") ? A_SINGLETON : AN_SINGLETON; List w2Singleton = (List) Interner.globalIntern(new SingletonList(Word.createWord(w2))); List excBigram = (List) Interner.globalIntern(new StructureSharingList(w1Singleton, w2Singleton)); exceptions.add(excBigram); } /** Culls a/an exceptions from the given infile of bigrams, writing them to the given outfile. */ public static void cullAAnExceptions(String infile, String outfile) throws IOException { Reader in = new BufferedReader(new FileReader(infile)); StreamTokenizer tokenizer = NgramScorer.initTokenizer(in); PrintWriter out = new PrintWriter(new FileWriter(outfile)); String[] tokens = new String[2]; // loop through lines while (tokenizer.ttype != StreamTokenizer.TT_EOF) { // read line into tokens NgramScorer.readLine(tokenizer, tokens); // check for blank/incomplete line if (tokens[1] == null) continue; // check for a/an exception if (tokens[0].equals("a") || tokens[0].equals("an")) { String aan = tokens[0].intern(); String word = tokens[1]; if (filterOutByDefault(aan, word)) { // write to exceptions file out.println(aan + " " + word); } } } // done in.close(); out.flush(); out.close(); } /** Loads a/an exceptions from the given infile of bigrams. 
*/ public void loadAAnExceptions(String infile) throws IOException { Reader in = new BufferedReader(new FileReader(infile)); StreamTokenizer tokenizer = NgramScorer.initTokenizer(in); String[] tokens = new String[2]; // loop through lines while (tokenizer.ttype != StreamTokenizer.TT_EOF) { // read line into tokens NgramScorer.readLine(tokenizer, tokens); // check for blank/incomplete line if (tokens[1] == null) continue; // add a/an exception addException(tokens[0], tokens[1]); } // done in.close(); } /** Test loading and filtering, or cull exceptions from bigrams. */ public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.ngrams.AAnFilter () | -c "; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } // cull exceptions with -c if (args[0].equals("-c")) { String infile = args[1]; String outfile = args[2]; System.out.println("Culling a/an exceptions from " + infile + " to " + outfile); cullAAnExceptions(infile, outfile); System.exit(0); } // otherwise optionally load exceptions, ... AAnFilter aanFilter = new AAnFilter(); String infile = null; String tokens = null; if (args.length >= 2) { infile = args[0]; tokens = args[1]; } else tokens = args[0]; if (infile != null) { System.out.println("Loading exceptions from: " + infile); System.out.println(); aanFilter.loadAAnExceptions(infile); } // then filter given tokens Tokenizer tokenizer = new DefaultTokenizer(); List words = tokenizer.tokenize(tokens); //, true); System.out.println("filtering: " + tokens); System.out.println("filter out: " + aanFilter.filterOut(words)); } } ================================================ FILE: src/opennlp/ccg/ngrams/AbstractStandardNgramModel.java ================================================ /* * $Id: AbstractStandardNgramModel.java,v 1.3 2009/12/21 03:27:18 mwhite14850 Exp $ */ package opennlp.ccg.ngrams; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import opennlp.ccg.lexicon.Word; import opennlp.ccg.util.Pair; /** * Abstract class for shared methods used by all standard ngram models. * Adapted from the original StandardNgramModel class. * * @author Scott Martin * @version $Revision: 1.3 $ * @since 0.9.2 */ abstract class AbstractStandardNgramModel extends NgramScorer { /** Reusable list of strings to score. */ protected List stringsToScore = new ArrayList(); /** * Creates a new ngram model of the given order. * @param order The order of the model. * @param useSemClasses Whether this model should use semantic classes. * @see NgramScorer#NgramScorer(int, boolean) */ protected AbstractStandardNgramModel(int order, boolean useSemClasses) { super(order, useSemClasses); numNgrams = new int[order]; } /** * Creates a new ngram model with the specified order. * @see AbstractStandardNgramModel#AbstractStandardNgramModel(int, boolean) */ protected AbstractStandardNgramModel(int order) { this(order, false); } /** * Converts the words in wordsToScore to strings in stringsToScore, before * scoring. 
 */
    @Override
    protected void prepareToScoreWords() {
        stringsToScore.clear();
        for (int i = 0; i < wordsToScore.size(); i++) {
            Word w = wordsToScore.get(i);
            String s = w.getForm();
            // check for sem class replacement
            String scr = semClassReplacement(w);
            if (scr != null) s = scr;
            // add pitch accent and attrs, if any
            String pitchAccent = w.getPitchAccent();
            Iterator<Pair<String,String>> pairs = w.getAttrValPairs();
            if (pitchAccent != null || pairs.hasNext()) {
                StringBuffer sb = new StringBuffer();
                sb.append(s);
                if (pitchAccent != null) sb.append('_').append(pitchAccent);
                for (; pairs.hasNext(); ) {
                    Pair<String,String> p = pairs.next();
                    sb.append('_').append(p.b);
                }
                s = sb.toString().intern();
            }
            // check for unknown word
            if (openVocab && trieMapRoot.getChild(s) == null) s = "<unk>";
            // add key
            stringsToScore.add(s);
        }
    }

    /**
     * Returns the log prob of the ngram starting at the given index
     * in wordsToScore and with the given order, with backoff.
     * (Assumes words in wordsToScore have already been converted to strings in
     * stringsToScore, via call to prepareToScoreWords.)
     */
    @Override
    protected float logProbFromNgram(int i, int order) {
        // skip initial start tag
        if (i == 0 && order == 1 && stringsToScore.get(0) == "<s>") return 0;
        // set keys list
        keysList.clear();
        for (int j = i; j < i+order; j++) {
            keysList.add(stringsToScore.get(j));
        }
        if (debugScore) {
            System.out.print("logp( " + keysList.get(order-1) + " | ");
            if (order > 1) { System.out.print(keysList.get(order-2) + " ... "); }
            System.out.print(") = ");
        }
        // calc log prob
        float retval = logProb(0, order);
        if (debugScore) System.out.println("" + retval);
        return retval;
    }
}


================================================
FILE: src/opennlp/ccg/ngrams/ConditionalProbabilityTable.java
================================================
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2010 Michael White
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////

package opennlp.ccg.ngrams;

import java.io.IOException;
import java.util.*;

import opennlp.ccg.lexicon.*;
import opennlp.ccg.util.*;

/**
 * A conditional probability table implemented via a FactoredNgramModelFamily,
 * where probabilities are determined by n-grams of "factors" of a single "word"
 * (whether the pairs of attributes and values have anything to do with words
 * or not).
 *
 * @author  Michael White
 * @version $Revision: 1.2 $, $Date: 2010/02/25 22:26:10 $
 */
public class ConditionalProbabilityTable {

    /**
     * The model, as a factored n-gram model family.
     */
    protected FactoredNgramModelFamily model;

    /**
     * Constructor with filename for model as a factored n-gram model family.
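 *
 * <p>Illustrative usage sketch: scoring a list of attribute-value pairs against a
 * table loaded from a hypothetical specification file, as in {@code main} below
 * (attributes and values are assumed to have been interned):
 * <pre>{@code
 * ConditionalProbabilityTable table = new ConditionalProbabilityTable("flm/table.flm");
 * List<Pair<String,String>> pairs = new ArrayList<Pair<String,String>>();
 * pairs.add(new Pair<String,String>("P".intern(), "VBD".intern()));
 * pairs.add(new Pair<String,String>("W".intern(), "ran".intern()));
 * double logprob = table.logprob(pairs);
 * }</pre>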
*/ public ConditionalProbabilityTable(String filename) throws IOException { model = new FactoredNgramModelFamily(filename, false); if (model.order != 1) { throw new RuntimeException("A conditional probability table must have n-gram order 1."); } } /** Sets the debug score flag. */ public void setDebug(boolean debugScore) { model.setDebug(debugScore); } /** * Returns a probability from the model for the given list of attribute-value * pairs, which are assumed to have already been interned, by converting * the result of the logprob method. */ public double score(List> attrValList) { return NgramScorer.convertToProb(logprob(attrValList)); } /** * Returns a log probability from the model for the given list of attribute-value * pairs, which are assumed to have already been interned. * The order of the list does not matter because the model probabilities are * defined by the factored n-gram model family specification. */ public double logprob(List> attrValList) { return model.logprob(new SingletonList(new ListPairWord(attrValList))); } /** Tests loading and scoring. */ public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.ngrams.ConditionalProbabilityTable "; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } String specfile = args[0]; String tokens = args[1]; List> pairs = new ArrayList>(); String[] tokenArray = tokens.split("\\s+"); for (int i=0; i < tokenArray.length; i+=2) { String attr = tokenArray[i].intern(); String val = tokenArray[i+1].intern(); pairs.add(new Pair(attr, val)); } System.out.println("Loading conditional probability table from: " + specfile); ConditionalProbabilityTable table = new ConditionalProbabilityTable(specfile); FactoredNgramModelFamily lmFamily = table.model; System.out.println("primary child var: " + lmFamily.primaryGroup.childName); if (lmFamily.furtherGroups != null) { for (int i = 0; i < lmFamily.furtherGroups.length; i++) { System.out.println("further child var: " + lmFamily.furtherGroups[i].childName); } } System.out.println("openVocab: " + lmFamily.openVocab); System.out.println(); System.out.println("scoring: " + tokens); System.out.println(); table.setDebug(true); double logprob = table.logprob(pairs); double score = NgramScorer.convertToProb(logprob); System.out.println("score: " + score); System.out.println("logprob: " + logprob); } } ================================================ FILE: src/opennlp/ccg/ngrams/FactoredNgramModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.lexicon.*; import opennlp.ccg.util.*; import java.util.*; import java.io.*; /** * A scorer for a factored n-gram backoff model. * The file format is the one generated by the SRILM version 1.4.1 fngram-count tool. * Only static backoff orders are supported at present, with * the most distant parent variable dropped at each backoff point. * Unknown words/factors are mapped to <unk> if the latter is present in * the model. * * @author Michael White * @version $Revision: 1.22 $, $Date: 2011/10/11 03:29:42 $ */ public class FactoredNgramModel extends NgramScorer { /** Map for caching and reusing individual models by filename. */ public static Map modelCache = new HashMap(); /** The variable to predict. */ public final ModelVariable child; /** The parent variables to condition on, backing off from the end. */ public final ModelVariable[] parents; /** A variable in a factored n-gram model. */ public class ModelVariable { /** The variable name. */ public final String name; /** The (absolute value of the) position in the history. */ public final int position; /** Makes a model variable from a string such as "W" or "W(-1)". */ public ModelVariable(String str) { int leftparen = str.indexOf("("); if (leftparen > 0) { name = str.substring(0,leftparen).intern(); int rightparen = str.indexOf(")"); position = Math.abs(Integer.parseInt(str.substring(leftparen+1,rightparen))); } else { name = str.intern(); position = 0; } } } /** * Loads a factored n-gram model for the given child variable, with the given parent variables, * from the file with the given name, in the SRILM format. * The flag for using sem classes is defaulted to true. */ public FactoredNgramModel(String child, String parents[], String filename) throws IOException { this(child, parents, filename, true); } /** * Loads a factored n-gram model for the given child variable, with the given parent variables, * from the file with the given name, in the SRILM format, * and with the given flag for using sem classes. */ public FactoredNgramModel(String child, String parents[], String filename, boolean useSemClasses) throws IOException { this.useSemClasses = useSemClasses; this.child = new ModelVariable(child); this.parents = new ModelVariable[parents.length]; order = 1; for (int i = 0; i < parents.length; i++) { this.parents[i] = new ModelVariable(parents[i]); order = Math.max(order, this.parents[i].position + 1); } this.numNgrams = new int[(int)Math.pow(2, parents.length)]; // check cache FactoredNgramModel cachedModel = modelCache.get(filename); if (cachedModel != null) { // share trie etc. numNgrams = cachedModel.numNgrams; openVocab = cachedModel.openVocab; trieMapRoot = cachedModel.trieMapRoot; cachedLogProbs = cachedModel.cachedLogProbs; } // otherwise load model else { Reader in = new BufferedReader(new FileReader(filename)); readModel(in); modelCache.put(filename, this); } } /** * Returns a list of feature keys for the ngram starting at the given index in * wordsToScore and with the given order, using the keys in keysList after * setting them appropriately with setKeysToNgram; returns null if this * operation does not succeed normally. With factored models, factor keys * are sequenced as two string keys. 
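 *
 * <p>Illustrative usage sketch: loading and scoring a factored model, following the
 * {@code main} test at the end of this class (the model file name is hypothetical):
 * <pre>{@code
 * String[] parents = { "W(-1)", "W(-2)" };
 * FactoredNgramModel lm = new FactoredNgramModel("W", parents, "flm/word.flm", true);
 * List<Word> words = new DefaultTokenizer().tokenize("<s> he ran </s>", true);
 * lm.setWordsToScore(words, true);
 * lm.prepareToScoreWords();
 * double logprob = lm.logprob();
 * }</pre>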
*/ protected List ngram(int i, int order) { boolean ok = setKeysToNgram(i, order); if (!ok) return null; featureKeysList.clear(); for (int j=0; j < keysList.size(); j++) { Object key = keysList.get(j); if (key instanceof String) { featureKeysList.add(Tokenizer.WORD_ATTR); featureKeysList.add((String)key); } else if (!(key instanceof FactorKey)) { throw new RuntimeException("Factor keys expected here! key: " + key); } else { FactorKey fkey = (FactorKey) key; featureKeysList.add(fkey.factor); featureKeysList.add(fkey.val); } } return featureKeysList; } /** Returns the log prob of the ngram starting at the given index in wordsToScore and with the given order, with backoff. If using sem classes, then words with replacement sem classes are mapped to words with the sem class replacing the form and stem. Any remaining unknown words/factors are mapped to <unk>, if the latter is present in the model. */ // extracts factor keys from full words according to parents list, // then determines log prob from the list of factor keys protected float logProbFromNgram(int i, int order) { // skip initial start tag if (i == 0 && order == 1 && ((Word)wordsToScore.get(0)).getForm() == "") return 0; // set up factor keys keysList.clear(); int i0 = i + order-1; // index of current word // determine last available parent, if full context not available int lastParentIndex = parents.length-1; if (this.order > order) { for (int j = 0; j < parents.length; j++) { if (parents[j].position >= order) { lastParentIndex = j-1; break; } } } // go through parents in reverse order, // extracting and adding factor keys for (int j = lastParentIndex; j >= 0; j--) { int pos_j = i0 - parents[j].position; if (pos_j < i) continue; // skip if pos_j past i Word w = (Word) wordsToScore.get(pos_j); keysList.add(makeFactorKey(w, parents[j].name)); } // add factor key for child Word current = (Word) wordsToScore.get(i0); keysList.add(makeFactorKey(current, child.name)); if (debugScore) { System.out.print("logp( " + keysList.get(keysList.size()-1) + " | "); for (int j = keysList.size()-2; j >= 0; j--) { System.out.print(keysList.get(j) + " "); } System.out.print(") = "); } // calc log prob from factor keys float retval = logProb(0, keysList.size()); // NB: workaround for apparent bug in SRILM 1.4.1 fngram-count tool, // whereby prob for does not use higher-order contexts for // factors other than W: just use zero if (current.getForm() == "" && child.name != "W") retval = 0; if (debugScore) System.out.println("" + retval); return retval; } // makes a factor key from the given word by extracting // the attribute with the given name, where // the delimiter tokens are treated as a special case, // and the attr val is adjusted if using sem classes private Object makeFactorKey(Word w, String attr) { // special cases for and : just return // a word with this form, regardless of the attr String form = w.getForm(); if (form == "" || form == "") { return FactorKey.getKey(attr, form); } // get val for this attr String val = w.getVal(attr); // check for sem class replacement for form or stem if (attr == Tokenizer.WORD_ATTR || attr == Tokenizer.STEM_ATTR) { String scr = semClassReplacement(w); if (scr != null) val = scr; } // make factor key Object retval = FactorKey.getKey(attr, val); // check for unknown val if (openVocab && trieMapRoot.getChild(retval) == null) { val = ""; retval = FactorKey.getKey(attr, val); } // return return retval; } // reads in model private void readModel(Reader in) throws IOException { // setup //Tokenizer wordTokenizer = 
(Grammar.theGrammar != null) // ? Grammar.theGrammar.lexicon.tokenizer // : new DefaultTokenizer(); StreamTokenizer tokenizer = initTokenizer(in); String[] tokens = new String[parents.length+3]; Object[] factorKeys = new Object[parents.length+1]; boolean foundData = false; int numParents = -1; int parentsInt = -1; List currentPrefix = new ArrayList(); List currentKeys = null; List> currentChildren = null; // loop through lines while (tokenizer.ttype != StreamTokenizer.TT_EOF) { // read line into tokens readLine(tokenizer, tokens); // check for blank line if (tokens[0] == null) continue; // check for initial delimiter if (tokens[0].equals("\\data\\")) { foundData = true; continue; } if (!foundData) continue; // read header line if (tokens[0].equals("ngram")) { int equalPos = tokens[1].indexOf("="); int n = Integer.decode(tokens[1].substring(0,equalPos)).intValue(); int total = Integer.parseInt(tokens[1].substring(equalPos+1)); numNgrams[n] = total; // init children, keys lists if (currentChildren == null) { currentChildren = new ArrayList>(total); currentKeys = new ArrayList(total); } // calc totals (not actually used anymore) if (n == numNgrams.length-1) { @SuppressWarnings("unused") int totalNgrams = 0; for (int i = 0; i < numNgrams.length; i++) { totalNgrams += numNgrams[i]; } // System.out.println("totalNgrams: " + totalNgrams); } continue; } // check for final delimiter if (tokens[0].equals("\\end\\")) { addTrieMapChildren(currentPrefix, currentKeys, currentChildren); break; } // read line starting new parents context int gramsPos = -1; if (tokens[0].startsWith("\\") && (gramsPos = tokens[0].indexOf("-grams:")) > 0) { // add current children addTrieMapChildren(currentPrefix, currentKeys, currentChildren); // update num parents // System.out.println(tokens[0]); parentsInt = Integer.decode(tokens[0].substring(1,gramsPos)).intValue(); numParents = numParents(parentsInt); continue; } if (numParents < 0) continue; // current order is num parents plus one for child var int currentOrder = numParents + 1; // parse a line of the form // ... [] // i.e. a log prob, followed by N vars, a child var, and an optional back off weight // NB: unlike the ARPA format, here the here is // associated with the backoff of ... // rather than context consisting of ... // read logprob float logprob = Float.parseFloat(tokens[0]); // unescape, intern factor keys for (int i = 1; i < currentOrder+1; i++) { String attr = (i == currentOrder) ? 
child.name : parents[(currentOrder-i)-1].name; String val = tokens[i]; int hyphenPos = val.indexOf('-'); if (hyphenPos > 0) { String attrCheck = val.substring(0, hyphenPos).intern(); if (attr != attrCheck) { System.err.println( "Warning: expected attr " + attr + " rather than " + attrCheck + " in " + tokens[i] ); } val = val.substring(hyphenPos+1); } val = DefaultTokenizer.unescape(val); if (val != null) val = val.intern(); factorKeys[i-1] = FactorKey.getKey(attr, val); } // check prefix boolean samePrefix = (currentPrefix.size() == currentOrder-1); for (int i = 0; samePrefix && i < currentOrder-1; i++) { if (factorKeys[i] != currentPrefix.get(i)) samePrefix = false; } // if changed, add current children, reset prefix if (!samePrefix) { addTrieMapChildren(currentPrefix, currentKeys, currentChildren); for (int i = 0; i < currentOrder-1; i++) { currentPrefix.add(factorKeys[i]); } } Object key = factorKeys[currentOrder-1]; currentKeys.add(key); currentChildren.add(new TrieMap(new NgramFloats(logprob, 0))); // read back-off weight, if present if (tokens[currentOrder+1] != null) { float bow = Float.parseFloat(tokens[currentOrder+1]); // add to prefix node TrieMap prefixNode = trieMapRoot.findChildFromList(currentPrefix); NgramFloats nfloats = prefixNode.data; if (nfloats != null) nfloats.bow = bow; else prefixNode.data = new NgramFloats(0, bow); } } // set openVocab according to presence of child Object unkKey = FactorKey.getKey(child.name,""); openVocab = (trieMapRoot.getChild(unkKey) != null); } // returns the number of parents present in the parentsInt spec private int numParents(int parentsInt) { int retval = 0; for (int i = 0; i < parents.length; i++) { if ((parentsInt & 1) != 0) retval++; parentsInt = parentsInt >> 1; } return retval; } /** Test loading and scoring. */ // NB: This produces the same scores as the SRILM fngram tool when both // and tags are used, except that ... // NB: There is a workaround for an apparent bug in SRILM 1.4.1 fngram-count tool, // whereby prob for does not use higher-order contexts for // factors other than W, and thus a log prob of zero is just used instead. 
public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.ngrams.FactoredNgramModel "; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } String child = args[0]; String[] parents = args[1].split("\\s+"); String lmfile = args[2]; String tokens = args[3]; System.out.println("Loading n-gram model from: " + lmfile); FactoredNgramModel lm = new FactoredNgramModel(child, parents, lmfile, true); System.out.println("child var: " + lm.child.name); for (int i = 0; i < lm.parents.length; i++) { System.out.println("parent var: (" + lm.parents[i].name + "," + lm.parents[i].position + ")"); } System.out.println("order: " + lm.order); System.out.println("openVocab: " + lm.openVocab); System.out.println(); // System.out.println("trie map: "); // System.out.println(lm.trieMapRoot.toString()); // System.out.println(); Tokenizer tokenizer = new DefaultTokenizer(); List words = tokenizer.tokenize(tokens, true); System.out.println("scoring: "); for (int i = 0; i < words.size(); i++) { System.out.println(words.get(i).toString()); } System.out.println(); lm.debugScore = true; lm.setWordsToScore(words, true); lm.prepareToScoreWords(); double logprob = lm.logprob(); double score = convertToProb(logprob); System.out.println(); System.out.println("score: " + score); System.out.println("logprob: " + logprob); System.out.println("ppl: " + NgramScorer.convertToPPL(logprob / (words.size()-1))); } } ================================================ FILE: src/opennlp/ccg/ngrams/FactoredNgramModelFamily.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.lexicon.*; import opennlp.ccg.perceptron.Alphabet; import opennlp.ccg.perceptron.FeatureMap; import java.util.*; import java.io.*; /** * A scorer consisting of a family of factored n-gram backoff models. * The family of models is specified using the factored language model specification * file format given as input to the SRILM version 1.4.1 fngram-count tool. * Each individual model is loaded as a FactoredNgramModel instance, and thus * only static backoff orders are supported at present, with * the most distant parent variable dropped at each backoff point. * The first model in the specification file should be the primary one. * It may be followed by any number of models for the same child variable * but with lower history orders; these models will be used when the * full history is unavailable, if their order matches the available * number of words in the history. 
For example, if the primary model is * a trigram model, then a bigram model may also be given (with a potentially * different backoff order) for scoring bigram word sequences. * Following these models, there may also be further (sequences of) models for scoring * different child variables. If present, the scores calculated for these * child variables will be multipled with the score calculated for the primary * model's child variable (typically the word form). For example, one or more * models may be given to calculate the probability of the word's pitch accent, * independently of the word form (though potentially looking at some of the same history). * With each individual model, the parents are assumed to be listed in backoff order. * Unknown words are mapped to <unk> if the latter is present in * the first model. * * @author Michael White * @version $Revision: 1.15 $, $Date: 2010/02/25 22:26:11 $ */ public class FactoredNgramModelFamily extends NgramScorer { /** The primary model group. */ public final ModelGroup primaryGroup; /** Any additional model groups, for different child variables. */ public final ModelGroup[] furtherGroups; /** A factored n-gram model and any secondary ones for the same child variable. */ public class ModelGroup { /** The child variable name. */ public final String childName; /** The primary model. */ public final FactoredNgramModel primaryModel; /** The secondary models. */ public final FactoredNgramModel[] secondaryModels; /** Makes a model group from the given primary model. */ public ModelGroup(FactoredNgramModel primaryModel, FactoredNgramModel[] secondaryModels) { this.childName = primaryModel.child.name; this.primaryModel = primaryModel; this.secondaryModels = secondaryModels; } /** Returns the appropriate model for the given order. */ public FactoredNgramModel getModel(int order) { if (secondaryModels == null) return primaryModel; for (int i = 0; i < secondaryModels.length; i++) { if (secondaryModels[i].order == order) return secondaryModels[i]; } return primaryModel; } /** Propagates the reverse flag. */ public void setReverse(boolean reverse) { primaryModel.setReverse(reverse); if (secondaryModels == null) return; for (int i = 0; i < secondaryModels.length; i++) { secondaryModels[i].setReverse(reverse); } } /** Propagates the debug score flag. */ public void setDebug(boolean debugScore) { primaryModel.setDebug(debugScore); if (secondaryModels == null) return; for (int i = 0; i < secondaryModels.length; i++) { secondaryModels[i].setDebug(debugScore); } } /** Propagates wordsToScore to the given list, for sharing purposes. */ protected void shareWordsToScore(List wordsToScore) { primaryModel.shareWordsToScore(wordsToScore); if (secondaryModels == null) return; for (int i = 0; i < secondaryModels.length; i++) { secondaryModels[i].shareWordsToScore(wordsToScore); } } /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet) { primaryModel.setAlphabet(alphabet); if (secondaryModels == null) return; for (int i = 0; i < secondaryModels.length; i++) { secondaryModels[i].setAlphabet(alphabet); } } } /** * Loads a family of factored n-gram models * from the file with the given name, in the SRILM format. * The flag for using sem classes is defaulted to true. */ public FactoredNgramModelFamily(String filename) throws IOException { this(filename, true); } /** * Loads a family of factored n-gram models * from the file with the given name, in the SRILM format, * and with the given flag for using sem classes. 
*/ public FactoredNgramModelFamily(String filename, boolean useSemClasses) throws IOException { this.useSemClasses = useSemClasses; List modelGroups = readModel(filename); this.primaryGroup = modelGroups.get(0); if (modelGroups.size() == 1) this.furtherGroups = null; else { this.furtherGroups = new ModelGroup[modelGroups.size()-1]; for (int i = 1; i < modelGroups.size(); i++) { this.furtherGroups[i-1] = modelGroups.get(i); } } order = primaryGroup.primaryModel.order; openVocab = primaryGroup.primaryModel.openVocab; } /** Sets reverse flag, and propagates to component models. */ public void setReverse(boolean reverse) { super.setReverse(reverse); primaryGroup.setReverse(reverse); if (furtherGroups == null) return; for (int i = 0; i < furtherGroups.length; i++) { furtherGroups[i].setReverse(reverse); } } /** Sets debug score flag, and propagates to component models. */ public void setDebug(boolean debugScore) { super.setDebug(debugScore); primaryGroup.setDebug(debugScore); if (furtherGroups == null) return; for (int i = 0; i < furtherGroups.length; i++) { furtherGroups[i].setDebug(debugScore); } } /** Sets wordsToScore to the given list, for sharing purposes. */ protected void shareWordsToScore(List wordsToScore) { this.wordsToScore = wordsToScore; primaryGroup.shareWordsToScore(wordsToScore); if (furtherGroups == null) return; for (int i = 0; i < furtherGroups.length; i++) { furtherGroups[i].shareWordsToScore(wordsToScore); } } /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet) { super.setAlphabet(alphabet); primaryGroup.setAlphabet(alphabet); if (furtherGroups == null) return; for (int i = 0; i < furtherGroups.length; i++) { furtherGroups[i].setAlphabet(alphabet); } } /** * Increments ngram counts for the ngrams starting at the given index in * wordsToScore and with the given order. */ protected void incNgrams(FeatureMap featmap, int i, int order) { // do primary group List ngram = ngram(primaryGroup, i, order); if (ngram != null) { Alphabet.Feature f = alphabet.index(ngram); if (f != null) featmap.inc(f); } // then any further ones if (furtherGroups != null) { for (int j = 0; j < furtherGroups.length; j++) { List ngram2 = ngram(furtherGroups[j], i, order); if (ngram2 == null) continue; Alphabet.Feature f2 = alphabet.index(ngram2); if (f2 != null) featmap.inc(f2); } } } // get ngram from a model group private List ngram(ModelGroup modelGroup, int i, int order) { FactoredNgramModel modelToUse = modelGroup.primaryModel; // with less than full history, get possibly different model to use if (order < modelToUse.order) { modelToUse = modelGroup.getModel(order); } return modelToUse.ngram(i, order); } /** Returns the log prob of the ngram starting at the given index in wordsToScore and with the given order, with backoff. 
*/ protected float logProbFromNgram(int i, int order) { float logProbTotal = 0; // do primary group logProbTotal += logProbFromNgram(primaryGroup, i, order); // then any further ones if (furtherGroups != null) { for (int j = 0; j < furtherGroups.length; j++) logProbTotal += logProbFromNgram(furtherGroups[j], i, order); } return logProbTotal; } // calculate the log prob from a model group private float logProbFromNgram(ModelGroup modelGroup, int i, int order) { FactoredNgramModel modelToUse = modelGroup.primaryModel; // with less than full history, get possibly different model to use if (order < modelToUse.order) { modelToUse = modelGroup.getModel(order); if (debugScore && modelToUse != modelGroup.primaryModel) { int modelNum = Arrays.asList(modelGroup.secondaryModels).indexOf(modelToUse); System.out.print("[2ndary model " + modelNum + "] "); } } return modelToUse.logProbFromNgram(i, order); } /** The max number of tokens to allow per line in the spec file. */ public static int MAX_TOKENS_PER_LINE = 64; // reads in model, returning model groups private List readModel(String filename) throws IOException { // setup File infile = new File(filename); Reader in = new BufferedReader(new FileReader(infile)); StreamTokenizer tokenizer = initTokenizer(in); String[] tokens = new String[MAX_TOKENS_PER_LINE]; // read in models FactoredNgramModel[] models = null; int numModels = -1; int currentModel = 0; // loop through lines while (tokenizer.ttype != StreamTokenizer.TT_EOF) { // read line into tokens readLine(tokenizer, tokens); // check for blank line if (tokens[0] == null) continue; // check for comment if (tokens[0].charAt(0) == '#') continue; // read num models, if not yet found if (numModels < 0) { numModels = Integer.parseInt(tokens[0]); models = new FactoredNgramModel[numModels]; continue; } // skip rest if already read in numModels if (currentModel >= numModels) break; // read model spec if second token is a colon // line format is : ... 
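// Editor's note (illustrative, not in the original source): given the token positions read
// below, a spec file is laid out roughly as
//
//   # comment lines start with '#'
//   2                                    <- number of models (first non-comment line)
//   W : 2 P(-1) W(-1) w.count w.flm      <- child : numParents parent... countFile lmFile
//   W : 1 W(-1) w2.count w2.flm          <- lower-order model for the same child
//
// The parent specifiers, counts and file names shown are placeholders; the count file token
// is skipped, and the LM file path is resolved relative to the spec file's directory.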
if (tokens[1] != null && tokens[1].equals(":")) { // read child String child = tokens[0]; // read parents int numParents = Integer.parseInt(tokens[2]); String[] parents = new String[numParents]; for (int i = 0; i < numParents; i++) parents[i] = tokens[i+3]; // read lm filename (skipping count file name) String lmfn = tokens[numParents+4]; // make filename relative to spec file dir File lmfile = new File(infile.getParentFile(), lmfn); lmfn = lmfile.getPath(); // load current model models[currentModel] = new FactoredNgramModel(child, parents, lmfn, useSemClasses); // share wordsToScore models[currentModel].shareWordsToScore(wordsToScore); // inc current model currentModel++; } } // ensure models found if (models == null) { throw new IOException("No models found in: " + filename); } // check num models int actualNumModels = 0; for (int i = 0; i < numModels; i++) { if (models[i] != null) actualNumModels++; } if (actualNumModels != numModels) { System.err.println("Warning: Only found " + actualNumModels + "/" + numModels + " in " + filename); numModels = actualNumModels; } // assign models to groups List modelGroups = new ArrayList(); int modelIndex = 0; while (modelIndex < numModels) { // get primary model, remember child name FactoredNgramModel primaryModel = models[modelIndex]; String childName = primaryModel.child.name; modelIndex++; List secondaryModelsList = new ArrayList(); // get secondary models with same child name while (modelIndex < numModels && models[modelIndex].child.name == childName) { secondaryModelsList.add(models[modelIndex]); modelIndex++; } FactoredNgramModel[] secondaryModels = new FactoredNgramModel[secondaryModelsList.size()]; secondaryModelsList.toArray(secondaryModels); // make, add model group modelGroups.add(new ModelGroup(primaryModel, secondaryModels)); } // done return modelGroups; } /** Test loading and scoring. 
*/ public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.ngrams.FactoredNgramModelFamily "; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } String specfile = args[0]; String tokens = args[1]; System.out.println("Loading n-gram model family from: " + specfile); FactoredNgramModelFamily lmFamily = new FactoredNgramModelFamily(specfile); System.out.println("primary child var: " + lmFamily.primaryGroup.childName); if (lmFamily.furtherGroups != null) { for (int i = 0; i < lmFamily.furtherGroups.length; i++) { System.out.println("further child var: " + lmFamily.furtherGroups[i].childName); } } System.out.println("order: " + lmFamily.order); System.out.println("openVocab: " + lmFamily.openVocab); System.out.println(); Tokenizer tokenizer = new DefaultTokenizer(); List words = tokenizer.tokenize(tokens, true); System.out.println("scoring: "); for (int i = 0; i < words.size(); i++) { System.out.println(words.get(i).toString()); } System.out.println(); lmFamily.setDebug(true); lmFamily.setWordsToScore(words, true); lmFamily.prepareToScoreWords(); double logprob = lmFamily.logprob(); double score = convertToProb(logprob); System.out.println("score: " + score); System.out.println("logprob: " + logprob); System.out.println("ppl: " + NgramScorer.convertToPPL(logprob / (words.size()-1))); } } ================================================ FILE: src/opennlp/ccg/ngrams/KenNgramModel.java ================================================ package opennlp.ccg.ngrams; import opennlp.ccg.ngrams.kenlm.jni.KenLM; import opennlp.ccg.ngrams.kenlm.MurmurHash; import java.io.*; import java.util.List; import java.util.ArrayList; import java.util.Map; import java.util.HashMap; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Tokenizer; import opennlp.ccg.lexicon.Word; public class KenNgramModel extends AbstractStandardNgramModel { // the nuts-n-bolts JNI class. private KenLM kenlm = null; // Map of hash of word to integer representation (integerized word used by KenLM). private Map hash2ID = new HashMap(); // Map of String (word to score) to hash of word. // private Map word2Hash = new HashMap(); // Map from hashes to tokens (to see whether we have encountered a new token). private Map hash2String = new HashMap(); // List of vocabulary tokens. New items are added and assigned their index as a representation. private List vocabList = new ArrayList(); // Whether to lowercase text before querying to the language model (e.g., "Pierre Vinken" => "pierre vinken"). private boolean lowercaseText = false; // Whether to split up named entities before querying the language model (e.g., "Pierre_Vinken" => "Pierre Vinken") private boolean splitNEs = false; // What character delimter to use to split NEs on. private char neDelim = '_'; // A reusable container for scoring strings. // private List someStringsToScore = null; // Whether to print out messages that trace the scoring process. public boolean debugScore = false; public KenNgramModel(int order, String lmFile, boolean useSemClasses, boolean lowercaseText, boolean splitNEs, char neDelim, boolean useNgramFeatures) throws IOException { super(order, useSemClasses); this.lowercaseText = lowercaseText; this.splitNEs = splitNEs; this.neDelim = neDelim; kenlm = new KenLM(order, lmFile); // someStringsToScore = new ArrayList(order); this.useNgramFeatures = useNgramFeatures; } /** Construct with order and filename. (Delegates to superclass for these flags). 
*/ public KenNgramModel(int order, String lmFile) throws IOException { this(order, lmFile, false); } /** Construct with order and filename and an indication of whether to use semantic classes. (Delegates to superclass for these flags). */ public KenNgramModel(int order, String lmFile, boolean useSemClasses) throws IOException { this(order, lmFile, useSemClasses, false, false, '_', false); } /** * Integerize a word and register it with the LM, if needed. */ public int id(String token) { synchronized (this) { long hash = 0; try { hash = MurmurHash.hash64(token); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } String hash_word = hash2String.get(hash); if (hash_word != null) { return hash2ID.get(hash); } else { int id = vocabList.size(); // let kenlm know about this word's ID. kenlm.registerWord(token, id); vocabList.add(token); hash2String.put(hash, token); hash2ID.put(hash, id); return id; } } } /** * Lowercase each token, if desired, and split each token into a list of tokens * (splitting on NE delim token), if desired. */ protected List splitAndLowercase(List words) { List tmp = new ArrayList(words.size()); if(!(lowercaseText || splitNEs)) { return words; } else { for(Word w : words) { String wdString = w.getForm(); String[] parts = wdString.replace(neDelim,' ').split("\\s+"); // ArrayList subTmp = new ArrayList(parts.length); for(String part : parts) { String newWdForm = (lowercaseText) ? part.toLowerCase() : part; // add null attr/val list, since it is not accessible. tmp.add(Word.createWord(newWdForm, w.getPitchAccent(), null, newWdForm, w.getPOS(), w.getSupertag(), w.getSemClass())); } } return tmp; } } /** * Resets wordsToScore to the given ones, reversing them when the reverse * flag is true, and adding sentence delimiters if not already present, when * the completeness flag is true. Delegates to the superclass */ @Override protected void setWordsToScore(List words, boolean complete) { wordsToScore.clear(); tagsAdded = false; List tmp = splitAndLowercase(words); words = tmp; super.setWordsToScore(words, complete); } /** * Calculates a log probability of a delineated substring of the strings * to score using KenLM. * @param pos The start position (inclusive) within the strings to score. * @param len The length, starting from pos, of the string * that should be used. */ @Override public float logProb(int pos, int len) { try { List range = new ArrayList(keysList.size()); for(Object wts : keysList.subList(pos, pos + len)) range.add((String)wts); int rangeSize = range.size(); if(rangeSize == 0) { throw new IllegalArgumentException("empty range specified for log prob"); } // Get hashes of words. int[] wds = new int[range.size()]; int cursor = 0; for(String s : range) wds[cursor++] = id(s); // call KenLM float result = kenlm.prob(wds); if(debugScore) { String wd = range.get(range.size()-1); String context = ""; for(String contextWord : range.subList(0,range.size()-1)) context += " " + contextWord; context = context.trim(); System.out.println("logp(" + wd + " | " + context + ") = " + result); } return result; } catch(IndexOutOfBoundsException e) { return 0.0f; } } /** Test loading and scoring. */ // NB: This produces the same scores as the SRILM ngram tool when both // and tags are used. 
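// Editor's sketch (not part of the original source): programmatic use mirrors the test in
// main() below; the model path and sentence are placeholders. The public entry point
// inherited from NgramScorer takes a word list directly:
//
//   KenNgramModel lm = new KenNgramModel(3, "models/example.arpa");   // order 3, other flags defaulted
//   List<Word> words = new DefaultTokenizer().tokenize("pierre vinken joined the board");
//   double logprob = lm.logprob(words);               // base-10 log prob via NgramScorer.logprob(List)
//   double prob = NgramScorer.convertToProb(logprob);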
public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.ngrams.KenLM "; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } long start = System.currentTimeMillis(); String order = args[0]; String lmfile = args[1]; String tokens = args[2]; boolean lowercase = true, splitNEs = false; // we want to prove that there are NEs to split and that there are uppercase chars to preserve. for(char c : tokens.toCharArray()) { if (c == '_') { splitNEs = true; if (!lowercase) break; } if (Character.isUpperCase(c)) { lowercase = false; if(splitNEs) break; } } System.out.println("Loading n-gram model with order " + order + " from: " + lmfile); KenNgramModel lm = new KenNgramModel(Integer.parseInt(order), lmfile, false, lowercase, splitNEs, '_', false); lm.debugScore = true; int secs = (int) (System.currentTimeMillis() - start) / 1000; System.out.println("secs: " + secs); System.out.println(); Tokenizer tokenizer = new DefaultTokenizer(); List words = tokenizer.tokenize(tokens); System.out.println("scoring: " + tokens); System.out.println(); lm.setWordsToScore(words, true); lm.prepareToScoreWords(); double logprob = lm.logprob(); double score = convertToProb(logprob); System.out.println(); System.out.println("score: " + score); System.out.println("logprob: " + logprob); // Find out how many words there are here. int size = lm.splitAndLowercase(words).size(); System.out.println("ppl: " + NgramScorer.convertToPPL(logprob / (size-1))); } } ================================================ FILE: src/opennlp/ccg/ngrams/LinearNgramScorerCombo.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import java.util.*; import opennlp.ccg.perceptron.*; import opennlp.ccg.synsem.Sign; import opennlp.ccg.lexicon.Word; /** * Linear combination of n-gram probability models, * interpolated at the word level; can also be used * as a feature extractor in a perceptron model. * The models must have the same direction. * * @author Michael White * @version $Revision: 1.17 $, $Date: 2009/06/22 04:32:47 $ */ public class LinearNgramScorerCombo extends NgramScorer implements FeatureExtractor { /** The component models. */ protected NgramScorer[] models; /** The weights. */ protected double weights[]; /** * Creates a new linear combo model with the given component models * and with the combination weights determined by the rank order * centroid method. The models are assumed to be ordered from * most to least important. 
*/ public LinearNgramScorerCombo(NgramScorer[] models) { this(models, rankOrderCentroidWeights(models.length)); } /** * Creates a new linear combo model with the given component models * and combination weights. The weights are assumed to sum to 1, * and the number of weights is assumed to match the number of models. * The wordsToScore list is shared across the component models. */ public LinearNgramScorerCombo(NgramScorer[] models, double[] weights) { this.models = models; this.weights = weights; for (int i = 0; i < models.length; i++) { models[i].shareWordsToScore(wordsToScore); order = Math.max(order, models[i].order); } } /** Set reverse flag, and propagate to component models. */ public void setReverse(boolean reverse) { super.setReverse(reverse); for (int i = 0; i < models.length; i++) { models[i].setReverse(reverse); } } /** Sets wordsToScore to the given list, for sharing purposes. */ protected void shareWordsToScore(List wordsToScore) { this.wordsToScore = wordsToScore; for (int i = 0; i < models.length; i++) { models[i].shareWordsToScore(wordsToScore); } } /** Does further preparation before scoring words for each component model. */ protected void prepareToScoreWords() { for (int i = 0; i < models.length; i++) { models[i].prepareToScoreWords(); } } /** Returns the log prob of the ngram starting at the given index in wordsToScore and with the given order, with backoff. In particular, returns the linear combination using the established weights of the probabilities given by the component models, converted back to a log prob (base 10). */ protected float logProbFromNgram(int i, int order) { double prob = 0; for (int j = 0; j < models.length; j++) { prob += convertToProb(models[j].logProbFromNgram(i, order)) * weights[j]; } return (float) convertToLogProb(prob); } /** * Increments ngram counts for the ngrams starting at the given index in * wordsToScore and with the given order. The implementation * delegates to the component models. */ protected void incNgrams(FeatureMap featmap, int i, int order) { for (int j = 0; j < models.length; j++) { if (models[j].useNgramFeatures) models[j].incNgrams(featmap, i, order); } } /** Flag for including the interpolated log prob as a feature. */ protected boolean useInterpLogProb = true; /** Sets the flag for including interpolated log prob as a feature. */ public void setInterpLogProb(boolean useInterpLogProb) { this.useInterpLogProb = useInterpLogProb; } /** * Returns a feature vector with the log prob features * for the given sign and completeness flag. * In particular, returns the log prob from each model as * the value of a feature named '$ngramN'. * The interpolated log prob is also returned as a feature '$ngram' if * the flag for including the interpolated log prob as a feature is set (the default). */ public FeatureVector extractLogProbs(Sign sign, boolean complete) { FeatureList retval = new FeatureList(models.length+1); if (useInterpLogProb) { Alphabet.Feature f = alphabet.index("$ngram"); if (f != null) retval.add(f, (float)logprob(sign, complete)); } for (int i=0; i < models.length; i++) { Alphabet.Feature f = alphabet.index("$ngram"+i); if (f != null) retval.add(f, (float)models[i].logprob(sign, complete)); } return retval; } /** Sets the alphabet. 
*/ public void setAlphabet(Alphabet alphabet) { super.setAlphabet(alphabet); for (NgramScorer scorer : models) { scorer.setAlphabet(alphabet); } } } ================================================ FILE: src/opennlp/ccg/ngrams/NgramDiversityPruningStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.lexicon.Word; import opennlp.ccg.realize.*; import opennlp.ccg.synsem.Sign; import java.util.*; /** * A diversity pruning strategy that defines signs to be * notCompellinglyDifferent if the n-1 initial and final words * are the same, where n is the n-gram order of interest. * The single arg constructor defaults the singleBestPerGroup flag * to true, which can increase efficiency with no loss in quality * when only the single best output is of interest (as long as the * reduction in the search space outweighs the extra time necessary * to check for the same initial and final words). * * @author Michael White * @version $Revision: 1.3 $, $Date: 2009/12/21 03:27:18 $ */ public class NgramDiversityPruningStrategy extends DiversityPruningStrategy { /** The n-gram order. */ protected int order; /** Constructor that defaults singleBestPerGroup to true. */ public NgramDiversityPruningStrategy(int order) { this(order, true); } /** Full constructor. */ public NgramDiversityPruningStrategy(int order, boolean singleBestPerGroup) { this.order = order; this.singleBestPerGroup = singleBestPerGroup; } /** Returns true iff the given signs are not compellingly different. In particular, returns true iff the n-1 initial and final words are the same. 
*/ public boolean notCompellinglyDifferent(Sign sign1, Sign sign2) { List words1 = sign1.getWords(); List words2 = sign2.getWords(); int words1Len = words1.size(); int words2Len = words2.size(); for (int i = 0; i < order-1 && i < words1Len && i < words2Len; i++) { if (words1.get(i) != words2.get(i)) return false; } int wordsLenDiff = words1Len-words2Len; for (int i = words1Len-1; i > words1Len-order && i >= 0 && i >= wordsLenDiff; i--) { int j = i - wordsLenDiff; if (words1.get(i) != words2.get(j)) return false; } return true; } } ================================================ FILE: src/opennlp/ccg/ngrams/NgramFilter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.lexicon.Word; import java.util.*; /** * Interface for objects that filter unhappy n-grams. * * @author Michael White * @version $Revision: 1.3 $, $Date: 2005/10/20 18:49:42 $ */ public interface NgramFilter { /** Returns whether to filter out the given list of words. */ public boolean filterOut(List words); } ================================================ FILE: src/opennlp/ccg/ngrams/NgramPrecisionModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.grammar.*; import opennlp.ccg.lexicon.*; import opennlp.ccg.perceptron.FeatureVector; import opennlp.ccg.synsem.Sign; import opennlp.ccg.util.*; import gnu.trove.*; import java.util.*; /** * N-gram precision scoring model, using a linear combination of * n-grams with rank order centroid weights, and optionally replacing word forms with * their semantic classes. 
* Words in the target strings are assumed to contain any desired delimiters. * With the exact matches flag set, only exact matches count. * * @author Michael White * @version $Revision: 1.18 $, $Date: 2011/05/15 20:35:06 $ */ public class NgramPrecisionModel extends NgramScorer implements SelfParaphraseBiaser { // n-grams in the target phrases @SuppressWarnings("unchecked") private Set> targetNgrams = new THashSet(); // weights private double[] weights = null; // exact matches flag private boolean exactMatches = false; /** Reusable list of reduced words. */ protected List reducedWords = new ArrayList(); /** Reusable word list, with identity equals. */ protected List wordList = new ArrayListWithIdentityEquals(); /** * Creates a new 4-gram precision model with no initial target strings * and with the combination weights determined by the rank order centroid method. * Word forms are not replaced by their semantic classes. */ public NgramPrecisionModel() { this(new String[]{}, false); } /** * Creates a new 4-gram precision model from the given target strings * and with the combination weights determined by the rank order centroid method. * Word forms are not replaced by their semantic classes. */ public NgramPrecisionModel(String[] targets) { this(targets, false); } /** * Creates a new n-gram precision model of the given order from the given target strings * and with the combination weights determined by the rank order centroid method. * Word forms are not replaced by their semantic classes. */ public NgramPrecisionModel(String[] targets, int order) { this(targets, order, false); } /** * Creates a new 4-gram precision model from the given target strings, * with the given flag controlling whether word forms are replaced by their semantic classes, * and with the combination weights determined by the rank order centroid method. */ public NgramPrecisionModel(String[] targets, boolean useSemClasses) { this(targets, 4, useSemClasses); } /** * Creates a new n-gram precision model of the given order from the given target strings, * with the given flag controlling whether word forms are replaced by their semantic classes, * and with the combination weights determined by the rank order centroid method. */ public NgramPrecisionModel(String[] targets, int order, boolean useSemClasses) { this(targets, order, useSemClasses, rankOrderCentroidWeights(order)); } /** * Creates a new n-gram precision model of the given order from the given target strings, * with the given flag controlling whether word forms are replaced by their semantic classes, * and with the given combination weights, beginning with the * highest-order weight and ending with the lowest-order (unigram) weight. */ public NgramPrecisionModel(String[] targets, int order, boolean useSemClasses, double[] weights) { this.useSemClasses = useSemClasses; this.order = order; this.weights = new double[order]; for (int i = 0; i < order; i++) { this.weights[order-(i+1)] = weights[i]; } initTargetNgrams(targets); } /** Sets the exact matches flag. */ public void setExactMatches(boolean exactMatches) { this.exactMatches = exactMatches; } /** Returns the exact matches flag. */ public boolean getExactMatches() { return exactMatches; } /** Reduces the words in wordsToScore to reducedWords, before scoring. 
*/ protected void prepareToScoreWords() { reducedWords.clear(); for (int i = 0; i < wordsToScore.size(); i++) { Word w = wordsToScore.get(i); reducedWords.add(reduceWord(w)); } } /** Returns the given word reduced to a surface word, using the sem class, if apropos. */ protected Word reduceWord(Word w) { if (useSemClasses && isReplacementSemClass(w.getSemClass())) return Word.createSurfaceWordUsingSemClass(w); else return Word.createSurfaceWord(w); } /** * Returns a score between 0 (worst) and 1 (best) for the given sign * and completeness flag, based on the n-gram score of the sign's words. * If the sign is complete, sentence delimiters are added before * scoring the words, if not already present. * Returns 0 if any filter flags the n-gram for filtering, or if * the sign has no words. * Otherwise, sets signToScore, calls prepareToScoreWords, * and then calculates and returns the n-gram precision score. * In particular, returns the linear combination using the established weights * of the various n-gram precision scores (from unigram up to the configured order), * where the n-gram precision is the number of n-grams with a match in the target * strings divided by the number of n-grams in the word sequence. * With the exact matches flag set, only exact matches count. * With short sequences (less than the order), the score is adjusted * proportionally to the max score. */ public synchronized double score(Sign sign, boolean complete) { // setup List words = sign.getWords(); if (words == null) return 0; signToScore = sign; setWordsToScore(words, complete); if (ngramFilters != null) { for (int i = 0; i < ngramFilters.size(); i++) { NgramFilter filter = ngramFilters.get(i); if (filter.filterOut(wordsToScore)) return 0; } } prepareToScoreWords(); // calc weighted precision score double retval = 0; for (int i = 0; i < order; i++) { retval += weights[i] * ngramPrecision(i+1); } signToScore = null; // adjust score for short sequences int numWords = wordsToScore.size(); if (numWords < order) { double max = 0.0; for (int i=0; i < numWords; i++) max += weights[i]; retval = retval / max; } // done return retval; } /** * Returns the features as counts of each ngram for the given sign and completeness flag. * This method returns the feature map as a feature vector. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { return extractFeatureMap(sign, complete); } /** Not supported; throws an UnsupportedOperationException. */ protected float logProbFromNgram(int i, int order) { throw new UnsupportedOperationException(); } // returns the n-gram precision of the given order, or zero if too few words private double ngramPrecision(int order) { int numWords = reducedWords.size(); int numNgrams = numWords - (order-1); if (numNgrams <= 0) return 0; int matches = 0; for (int i=0; i < numNgrams; i++) { setNgram(reducedWords, i, order); if (targetNgrams.contains(wordList)) matches++; } if (exactMatches) return (matches == numNgrams) ? 1.0 : 0.0; else return (matches * 1.0) / numNgrams; } /** Sets wordList to be the n-gram of the given order using words starting at pos i. */ protected synchronized void setNgram(List words, int i, int order) { wordList.clear(); for (int j = 0; j < order; j++) { wordList.add(words.get(i+j)); } } /** * Sets the keys in keysList to hold the ngram starting at the given index in * wordsToScore and with the given order; returns true if the operation * succeeds normally. The implementation uses reducedWords. 
*/ protected boolean setKeysToNgram(int i, int order) { keysList.clear(); for (int j = 0; j < order; j++) { keysList.add(reducedWords.get(i+j).getForm()); } return true; } /** Makes a canonical n-gram of the given order using words starting at pos i. Sublists are shared, a la a trie data structure. */ @SuppressWarnings("unchecked") protected List makeNgram(List words, int i, int order) { // check for one already interned setNgram(words, i, order); List alreadyInterned = (List) Interner.getGlobalInterned(wordList); if (alreadyInterned != null) return alreadyInterned; // if order is 1, intern new singleton list if (order == 1) { return (List) Interner.globalIntern(new SingletonList(words.get(i))); } // otherwise, extend list for the first word with suffix list List firstOneList = makeNgram(words, i, 1); List suffixList = makeNgram(words, i+1, order-1); return (List) Interner.globalIntern(new StructureSharingList(firstOneList, suffixList)); } // initializes the n-grams from the target phrases private void initTargetNgrams(String[] targets) { for (int j = 0; j < targets.length; j++) { if (targets[j].length() == 0) continue; // parse or tokenize target phrase into words List words; if (useSemClasses) // use parsed words to get sem classes words = Grammar.theGrammar.getParsedWords(targets[j]); else words = Grammar.theGrammar.lexicon.tokenizer.tokenize(targets[j]); // add sentence delimiters, if not already present setWordsToScore(words, true); // reduce each word to a surface word, using the sem class if apropos int numWords = wordsToScore.size(); for (int i = 0; i < numWords; i++) { Word w = wordsToScore.get(i); wordsToScore.set(i, reduceWord(w)); } // make and store target n-grams for (int k=0; k < order; k++) { for (int i=0; i < numWords - k; i++) { targetNgrams.add(makeNgram(wordsToScore, i, k+1)); } } } } /** Sets the target strings for implementing the self-paraphrase bias. */ @SuppressWarnings("unchecked") public void setTargets(String[] targets) { targetNgrams = new THashSet(); initTargetNgrams(targets); } } ================================================ FILE: src/opennlp/ccg/ngrams/NgramScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-8 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.synsem.Sign; import opennlp.ccg.synsem.SignScorer; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.lexicon.*; import opennlp.ccg.util.*; import opennlp.ccg.perceptron.*; import java.util.*; import java.io.*; /** * Super class for n-gram scoring models. 
* * @author Michael White * @version $Revision: 1.37 $, $Date: 2010/02/25 22:26:11 $ */ public abstract class NgramScorer implements SignScorer, Reversible, FeatureExtractor { protected NgramScorer() {} protected NgramScorer(int order) { this(order, false); } protected NgramScorer(int order, boolean useSemClasses) { this.order = order; this.useSemClasses = useSemClasses; } /** The n-gram order of the model. */ protected int order; /** Returns the n-gram order of the model. */ public int getOrder() { return order; } /** Flag for whether to reverse words before scoring (defaults to false). */ protected boolean reverse = false; /** Get reverse flag. */ public boolean getReverse() { return reverse; } /** Set reverse flag, and propagate to any reversible filters. */ public void setReverse(boolean reverse) { this.reverse = reverse; if (ngramFilters != null) { for (int i = 0; i < ngramFilters.size(); i++) { NgramFilter filter = ngramFilters.get(i); if (filter instanceof Reversible) ((Reversible)filter).setReverse(reverse); } } } /** Root of the n-gram trie. Nodes store NgramFloats instances. */ protected TrieMap trieMapRoot = new TrieMap(null); /** An ngram data object, for holding the log prob and backoff weight. */ public static class NgramFloats { /** The log prob. */ public float logprob; /** The backoff weight. */ public float bow; /** Constructor. */ public NgramFloats(float logprob, float bow) { this.logprob = logprob; this.bow = bow; } @Override public String toString() { return "logprob: " + logprob + ", bow: " + bow; } } /** The n-gram totals for different histories. */ protected int[] numNgrams = null; /** Flag for open vocabulary, ie whether the unknown word <unk> is in the model. */ protected boolean openVocab = false; /** Flag for whether to show scoring breakdown. */ protected boolean debugScore = false; /** Sets the debug score flag. */ public void setDebug(boolean debugScore) { this.debugScore = debugScore; } /** List of n-gram filters, for identifying unhappy sequences. */ protected List ngramFilters = null; /** Adds an n-gram filter. */ public void addFilter(NgramFilter filter) { if (ngramFilters == null) { ngramFilters = new ArrayList(); } ngramFilters.add(filter); } /** The alphabet, for filtering features to the relevant ones (when present). */ protected Alphabet alphabet = null; /** Sets the alphabet, so that features can be filtered to the relevant ones (when present). */ public void setAlphabet(Alphabet alphabet) { this.alphabet = alphabet; } /** Weak hash map for cached log probs, keyed from a sign's words. */ protected Map,Float> cachedLogProbs = null; /** Reference to current sign to score. */ protected Sign signToScore = null; /** Reusable list of words to score. */ protected List wordsToScore = new ArrayList(); /** Flag for whether start/end tags were added with the current words. */ protected boolean tagsAdded = false; /** Reusable list of keys for n-gram lookups. */ protected List keysList = new ArrayList(); /** Reusable list of keys for n-gram feature lookups. */ protected List featureKeysList = new ArrayList(); /** Gets a cached log prob for the given list of words (or null if none). */ protected Float getCachedLogProb(List words) { if (cachedLogProbs == null) return null; return cachedLogProbs.get(words); } /** Caches a log prob for the given list of words. 
*/ protected void putCachedLogProb(List words, Float logprob) { if (cachedLogProbs == null) cachedLogProbs = new WeakHashMap,Float>(); cachedLogProbs.put(words, logprob); } /** * Returns a score between 0 (worst) and 1 (best) for the given sign * and completeness flag, based on the n-gram score of the sign's words. * If the sign is complete, sentence delimiters are added before * scoring the words, if not already present. * Returns 0 if any filter flags the n-gram for filtering, or if * the sign has no words. * Otherwise, sets signToScore, calls prepareToScoreWords, * and then returns the result of logprob() converted to a probability. */ public synchronized double score(Sign sign, boolean complete) { return convertToProb(logprob(sign, complete)); } /** * Returns a log prob for the given sign and completeness flag, * based on the n-gram log prob of the sign's words. * If the sign is complete, sentence delimiters are added before * scoring the words, if not already present. * Returns the log prob for zero probability if any filter flags the n-gram for filtering, or if * the sign has no words. * Otherwise, sets signToScore, calls prepareToScoreWords, * and then returns the result of logProb(). */ public synchronized double logprob(Sign sign, boolean complete) { List words = sign.getWords(); if (words == null) return 0; if (!complete) { // check cache Float logprob = getCachedLogProb(words); if (logprob != null) return logprob; } signToScore = sign; setWordsToScore(words, complete); if (ngramFilters != null) { for (int i = 0; i < ngramFilters.size(); i++) { NgramFilter filter = ngramFilters.get(i); if (filter.filterOut(wordsToScore)) return convertToLogProb(0); } } prepareToScoreWords(); double retval = logprob(); signToScore = null; return retval; } /** * Returns an n-gram probability from the given list of words, * by converting the result of the logprob method. */ public synchronized double score(List words) { return convertToProb(logprob(words)); } /** * Returns an n-gram log prob for the given list of words. * This method is a simplified version of scoring a sign's words * that does not cache results, filter n-grams or ever add * sentence delimiters. */ public synchronized double logprob(List words) { setWordsToScore(words, false); prepareToScoreWords(); return logprob(); } /** Sets wordsToScore to the given list, for sharing purposes. */ protected void shareWordsToScore(List wordsToScore) { this.wordsToScore = wordsToScore; } /** * Resets wordsToScore to the given ones, reversing them when the reverse * flag is true, and adding sentence delimiters if not already present, when * the completeness flag is true. Also sets the tagsAdded flag. */ protected void setWordsToScore(List words, boolean complete) { wordsToScore.clear(); tagsAdded = false; if (complete && (reverse || words.get(0).getForm() != "")) { wordsToScore.add(Word.createWord("")); tagsAdded = true; } if (reverse) { for (int j = words.size()-1; j >= 0; j--) { Word w = words.get(j); if (w.getForm() == "" || w.getForm() == "") continue; // skip or wordsToScore.add(w); } } else wordsToScore.addAll(words); if (complete && (reverse || words.get(words.size()-1).getForm() != "")) { wordsToScore.add(Word.createWord("")); tagsAdded = true; } } /** Optional step to do further preparation before scoring words. */ protected void prepareToScoreWords() {} /** Returns a feature map with counts of each ngram for the given sign and completness flag. 
*/ public FeatureMap extractFeatureMap(Sign sign, boolean complete) { FeatureMap featmap = new FeatureMap(); // do setup as with scoring List words = sign.getWords(); if (words == null) return featmap; signToScore = sign; setWordsToScore(words, complete); prepareToScoreWords(); // count ngrams int numWords = wordsToScore.size(); for (int k=1; k <= order; k++) { int numNgrams = numWords - (k-1); if (numNgrams <= 0) continue; for (int i = 0; i < numNgrams; i++) { incNgrams(featmap, i, k); } } // return signToScore = null; return featmap; } /** * Returns a feature vector with the log prob features * for the given sign and completeness flag. * The default implementation returns the log prob as * the value of a feature named '$ngram' plus the counts of each ngram. */ public FeatureVector extractLogProbs(Sign sign, boolean complete) { FeatureList retval = new FeatureList(1); Alphabet.Feature f = alphabet.index("$ngram"); if (f != null) retval.add(f, (float)logprob(sign, complete)); return retval; } /** * Returns the features for the given sign and completeness flag. * The default implementation returns the log prob as * the value of a feature named '$ngram' plus the counts of each ngram. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { FeatureVector logprob = extractLogProbs(sign, complete); if (useNgramFeatures) { FeatureMap map = extractFeatureMap(sign, complete); return new ComposedFeatureVector(logprob, map); } else return logprob; } /** * Increments ngram counts for the ngrams starting at the given index in * wordsToScore and with the given order. The default implementation * uses ngram(i, order). */ protected void incNgrams(FeatureMap featmap, int i, int order) { List ngram = ngram(i, order); if (ngram == null) return; Alphabet.Feature f = alphabet.index(ngram); if (f != null) featmap.inc(f); } /** * Returns a list of feature keys for the ngram starting at the given index in * wordsToScore and with the given order, using the keys in keysList after * setting them appropriately with setKeysToNgram; returns null if this * operation does not succeed normally. */ protected List ngram(int i, int order) { boolean ok = setKeysToNgram(i, order); if (!ok) return null; featureKeysList.clear(); for (int j=0; j < keysList.size(); j++) { Object key = keysList.get(j); if (!(key instanceof String)) { throw new RuntimeException("Feature keys must be strings!"); } else featureKeysList.add((String)key); } return featureKeysList; } /** * Returns a log prob for the words in wordsToScore. * The default method returns the log prob of the word sequence * as determined by this language model's logProbFromNgram method. * The probabilities for the first n-1 words are backed off to the * lower order probabilities. * If the tagsAdded flag is false, the cache is checked to see whether * the log prob of the words of signToScore's initial sign has already * been calculated, and at the end the log prob of signToScore's words * is stored in the cache. */ protected double logprob() { float logProbTotal = 0; int numCached = 0; if (!tagsAdded && signToScore != null) { // check cache for initial words Sign[] inputs = signToScore.getDerivationHistory().getInputs(); if (inputs != null) { Sign initialSign = (!reverse) ? 
inputs[0] : inputs[inputs.length-1]; List initialWords = initialSign.getWords(); Float logprob = getCachedLogProb(initialWords); if (logprob != null) { logProbTotal = logprob.floatValue(); numCached = initialWords.size(); } } } for (int i = numCached; i < wordsToScore.size(); i++) { int orderToUse = Math.min(order, i+1); int startPos = i - (orderToUse-1); logProbTotal += logProbFromNgram(startPos, orderToUse); } if (!tagsAdded && signToScore != null) { // add log prob to cache putCachedLogProb(signToScore.getWords(), new Float(logProbTotal)); } return logProbTotal; } /** * Returns the log prob of the ngram starting at the given index in * wordsToScore and with the given order, with backoff. */ abstract protected float logProbFromNgram(int i, int order); /** * Sets the keys in keysList to hold the ngram starting at the given index in * wordsToScore and with the given order; returns true if the operation * succeeds normally. The default implementation invokes * logProbFromNgram, and returns false if the log prob is zero. */ protected boolean setKeysToNgram(int i, int order) { float logprob = logProbFromNgram(i, order); return logprob != 0; } /** Flag for using ngrams as features. */ protected boolean useNgramFeatures = true; /** Sets the the flag for using ngrams as features. */ public void setNgramFeatures(boolean useNgramFeatures) { this.useNgramFeatures = useNgramFeatures; } /** * Flag whether to use sem classes in place of words. * Defaults to false. */ protected boolean useSemClasses = false; // tokenizer reference private Tokenizer tokenizer = null; private Tokenizer getTokenizer() { if (tokenizer != null) return tokenizer; if (Grammar.theGrammar != null) tokenizer = Grammar.theGrammar.lexicon.tokenizer; else tokenizer = new DefaultTokenizer(); return tokenizer; } /** Returns whether the given semantic class is a replacement one. */ protected boolean isReplacementSemClass(String semClass) { return semClass != null && getTokenizer().isReplacementSemClass(semClass); } /** * Returns the semantic class replacement value (the semantic class * uppercased and interned) for the given word, if apropos, otherwise null. */ protected String semClassReplacement(Word w) { if (useSemClasses) { String semClass = w.getSemClass(); if (isReplacementSemClass(semClass)) return semClass.toUpperCase().intern(); } // otherwise null return null; } /** * Adds the TrieMap children, with their keys, under the given prefix, then * resets the lists. */ protected void addTrieMapChildren(List prefix, List keys, List> children) { if (!keys.isEmpty()) { TrieMap prefixNode = trieMapRoot.findChildFromList(prefix); prefixNode.addChildren(keys, children); } prefix.clear(); keys.clear(); children.clear(); } /** Returns the TrieMap node for the given sublist of keysList. */ protected TrieMap getNode(int pos, int len) { return trieMapRoot.getChildFromList(keysList.subList(pos, pos+len)); } // from CMU-Cambridge Statistical Language Modeling Toolkit // // p(wd3|wd1,wd2)= if(trigram exists) p_3(wd1,wd2,wd3) // else if(bigram w1,w2 exists) bo_wt_2(w1,w2)*p(wd3|wd2) // else p(wd3|w2) // // p(wd2|wd1)= if(bigram exists) p_2(wd1,wd2) // else bo_wt_1(wd1)*p_1(wd2) /** * Returns the log prob (base 10) of the given sublist of keysList, with * backoff, or -99 if not found. 
*/ protected float logProb(int pos, int len) { TrieMap node = getNode(pos, len); if (node != null && node.data != null) return node.data.logprob; if (len == 1) return -99; float retval = logProb(pos+1, len-1); if (debugScore) System.out.print("(" + (len-1) + "-gram: " + retval + ") "); if (retval > -99) retval += backoffWeight(pos, len-1); return retval; } /** * Returns the back-off weight (log base 10) of the given sublist of * keysList, or 0 if not found. */ protected float backoffWeight(int pos, int len) { TrieMap node = getNode(pos, len); if (node != null && node.data != null) { float retval = node.data.bow; // if (debugScore && retval != 0) System.out.print("(bow: " + retval + ") "); return retval; } return 0; } /** * Returns the rank order centroid weights for a ranked list of the given length. * The weights go from highest to lowest, and sum to 1. */ // ex: // weight 1 0.5208333333333333 // weight 2 0.2708333333333333 // weight 3 0.14583333333333331 // weight 4 0.0625 public static double[] rankOrderCentroidWeights(int length) { double[] retval = new double[length]; for (int i = 0; i < length; i++) { double weight_i = 0; for (int j = i; j < length; j++) { weight_i += 1 / (double) (j+1); } weight_i = weight_i / (double) length; retval[i] = weight_i; } return retval; } /** Converts a base 10 log prob to an actual probability, checking for -99 (not found). */ public static double convertToProb(double logProb) { if (logProb <= -99) { return 0; } else return Math.pow(10, logProb); } /** Converts a probability to a base 10 log prob, returning -99 if zero. */ public static double convertToLogProb(double prob) { if (prob == 0) return -99; else return Math.log(prob) / Math.log(10); } /** Converts a base 10 log prob to the corresponding perplexity. */ public static double convertToPPL(double logProb) { return Math.exp(- logProb * Math.log(10)); } /** Sets up tokenizer for reading in language models. */ protected static StreamTokenizer initTokenizer(Reader in) { StreamTokenizer tokenizer = new StreamTokenizer(in); tokenizer.resetSyntax(); tokenizer.wordChars(0,255); tokenizer.whitespaceChars(' ',' '); tokenizer.whitespaceChars('\t','\t'); tokenizer.whitespaceChars('\n','\n'); tokenizer.whitespaceChars('\r','\r'); tokenizer.eolIsSignificant(true); return tokenizer; } /** * Reads a line of up to tokens.length tokens using the given tokenizer, * with the remaining array elements set to null. */ protected static void readLine(StreamTokenizer tokenizer, String[] tokens) throws IOException { int index = 0; int ttype; while ( (ttype = tokenizer.nextToken()) != StreamTokenizer.TT_EOF && ttype != StreamTokenizer.TT_EOL ) { if (index < tokens.length && ttype == StreamTokenizer.TT_WORD) { tokens[index] = tokenizer.sval; index++; } } for (int i = index; i < tokens.length; i++) { tokens[i] = null; } } } ================================================ FILE: src/opennlp/ccg/ngrams/RepetitionScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.synsem.Sign; import opennlp.ccg.synsem.SignScorer; import opennlp.ccg.lexicon.Word; import java.util.*; import gnu.trove.*; /** * Scores a sign according to how repetitive its words are given the * observed context. Relevant repeated items (eg stems) are counted, * with full counts given to items in the previous words or recent context, * and fractional counts to older items. The score is then assigned according * to the number of repeated items and the configured penalty, as * 10 to the minus (penalty times repeated items). * * @author Michael White * @version $Revision: 1.6 $, $Date: 2011/03/20 20:11:58 $ */ @SuppressWarnings({"unchecked","rawtypes"}) public class RepetitionScorer implements SignScorer { /** The repetition penalty (defaults to 1.0). */ public double penalty = 1.0; /** The fractional count for the older items (defaults to 0.5). */ public double olderCount = 0.5; /** The fractional count for the even older items (defaults to 0.25). */ public double evenOlderCount = 0.25; /** The fractional count for the oldest items (defaults to 0.125). */ public double oldestCount = 0.125; /** The interned POS values to use for repetition scoring purposes. */ protected Set posValsToUse = new THashSet(new TObjectIdentityHashingStrategy()); /** The interned stems to ignore for repetition scoring purposes. */ protected Set stemsToIgnore = new THashSet(new TObjectIdentityHashingStrategy()); /** The interned items (eg stems) seen in the previous words. */ protected Set previousItems = new THashSet(new TObjectIdentityHashingStrategy()); /** The interned items (eg stems) seen in the recent context. */ protected Set contextItems = new THashSet(new TObjectIdentityHashingStrategy()); /** The interned items (eg stems) seen in the older context. */ protected Set olderContextItems = new THashSet(new TObjectIdentityHashingStrategy()); /** The interned items (eg stems) seen in the even older context. */ protected Set evenOlderContextItems = new THashSet(new TObjectIdentityHashingStrategy()); /** The interned items (eg stems) seen in the oldest context. */ protected Set oldestContextItems = new THashSet(new TObjectIdentityHashingStrategy()); /** * Default constructor. * Adds "NNP", "N", "V", "Adj" and "Adv" to posValsToUse, * and "do" and "not" to stemsToIgnore. */ public RepetitionScorer() { String[] posVals = { "NNP", "N", "V", "Adj", "Adv" }; posValsToUse.addAll(Arrays.asList(posVals)); String[] stems = { "do", "not" }; stemsToIgnore.addAll(Arrays.asList(stems)); } /** Resets all the context items. */ public void resetContext() { contextItems.clear(); olderContextItems.clear(); evenOlderContextItems.clear(); oldestContextItems.clear(); } /** Ages the context items, clearing the recent ones. 
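* Specifically, items cascade one step older per call: contextItems move to
* olderContextItems, older items to evenOlderContextItems, and even older
* items to oldestContextItems (whose previous contents are dropped), so a
* repeated stem's count decays from 1.0 through olderCount, evenOlderCount
* and finally oldestCount as the discourse moves on.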
*/ public void ageContext() { oldestContextItems.clear(); oldestContextItems.addAll(evenOlderContextItems); evenOlderContextItems.clear(); evenOlderContextItems.addAll(olderContextItems); olderContextItems.clear(); olderContextItems.addAll(contextItems); contextItems.clear(); } /** Adds the items (eg stems) from the given sign's words to the context items. */ public void updateContext(Sign sign) { List words = sign.getWords(); if (words == null) return; for (int i = 0; i < words.size(); i++) { Word word = (Word) words.get(i); updateItems(word, contextItems); } } /** * Adds the items (eg stems) from the given word to the given set. * By default, adds the relevant stems, per the relevantStem method. */ protected void updateItems(Word word, Set set) { String stem = relevantStem(word); if (stem != null) set.add(stem); } /** * Returns the stem of the given word if its POS is in posValsToUse, * unless the stem is in stemsToIgnore; otherwise returns null. */ protected String relevantStem(Word word) { if (!(posValsToUse.contains(word.getPOS()))) return null; String stem = word.getStem(); if (!(stemsToIgnore.contains(stem))) return stem; return null; } /** * Returns a score between 0 (worst) and 1 (best) for the given sign * and completeness flag, according to how repetitive its word are compared to * the observed context. * In particular, returns 10 to the minus (penalty times repeated items), * or zero if there are no words. */ public double score(Sign sign, boolean complete) { List words = sign.getWords(); if (words == null) return 0; return Math.pow(10, -1 * penalty * repeatedItems(words)); } /** * Returns the number of repeated items (eg stems) in the given word list, * using fractional counts for repetitions of older items. * The previous items set is cleared, and then the repeated items * are summed for each word, updating the previous items along the way. */ protected double repeatedItems(List words) { previousItems.clear(); double retval = 0; for (int i = 0; i < words.size(); i++) { Word word = (Word) words.get(i); retval += repeatedItems(word); updateItems(word, previousItems); } return retval; } /** * Returns the number of repeated items (eg stems) in the given word, * using fractional counts for repetitions of older items. * By default, returns 1 (or a fractional count) if the stem is relevant, * per the relevantStem method. */ protected double repeatedItems(Word word) { String stem = relevantStem(word); if (stem == null) return 0; if (contextItems.contains(stem) || previousItems.contains(stem)) return 1; if (olderContextItems.contains(stem)) return olderCount; if (evenOlderContextItems.contains(stem)) return evenOlderCount; if (oldestContextItems.contains(stem)) return oldestCount; return 0; } } ================================================ FILE: src/opennlp/ccg/ngrams/Reversible.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; /** * Interface for reversible n-gram classes. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2005/05/26 22:18:05 $ */ public interface Reversible { /** Get reverse flag. */ public boolean getReverse(); /** Set reverse flag. */ public void setReverse(boolean reverse); } ================================================ FILE: src/opennlp/ccg/ngrams/SRILMNgramModel.java ================================================ /* * $Id: SRILMNgramModel.java,v 1.5 2008/11/09 03:29:36 mwhite14850 Exp $ */ package opennlp.ccg.ngrams; import java.io.File; import java.io.IOException; import java.util.List; import java.util.ListIterator; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Tokenizer; import opennlp.ccg.lexicon.Word; /** * A language model that uses the * SRI language modeling * toolkit. * @author Scott Martin * @see SRILM * @version $Revision: 1.5 $ * @since 0.9.2 */ public class SRILMNgramModel extends AbstractStandardNgramModel { /** * Load the binary, platform-dependent library containing the SRILM JNI * bridge code. See ${OPENCCG_HOME}/src/srilmbridge. * @throws UnsatisfiedLinkError If Java can't find the srilmbridge library. */ static { System.loadLibrary("srilmbridge"); } /** * Creates a SRILM language model with the specified ngram order and model * type. * @param order The ngram order to use. * @param lmFile The file to read the langauge model from. * @param useSemClasses Whether or not to use semantic classes. * @param modelType The type of language model. * @throws IOException If a problem occurs reading the language model file. * These include non-existent or unreadable files, file format problems, * etc. */ public SRILMNgramModel(int order, File lmFile, boolean useSemClasses, SRILMNgramModelType modelType) throws IOException { super(order, useSemClasses); loadLMFromFile(order, lmFile, modelType); } /** * Creates a new SRILM language model. * @see SRILMNgramModel#SRILMNgramModel(int, File, boolean, * SRILMNgramModelType) */ public SRILMNgramModel(int order, File lmFile, SRILMNgramModelType modelType) throws IOException { this(order, lmFile, false, modelType); } /** * Loads an LM from a file. * @param ngramOrder The ngram order to use. * @param lmFile The file containing the language model. * @param lmType The type of langauge model to expect. * @throws IOException If the language model file is non-existent or * null, or if a problem occurs loading or parsing the file. */ protected void loadLMFromFile(int ngramOrder, File lmFile, SRILMNgramModelType lmType) throws IOException { if(lmFile == null) { throw new IOException("null file"); } if(!lmFile.exists()) { throw new IOException("file does not exist: " + lmFile); } if(lmFile.isDirectory()) { throw new IOException("file is a directory: " + lmFile); } if(!lmFile.canRead()) { throw new IOException("unable to read file: " + lmFile); } loadLM(ngramOrder, lmFile.getAbsolutePath(), lmType.ordinal()); } /** * Calculates a log probability of a delineated substring of the strings * to score using SRILM. This method reverses the context before passing * the string to SRILM, as this is the format SRILM expects. * @param pos The start position (inclusive) within the strings to score. 
* @param len The length, starting from pos, of the string * that should be used. */ @Override public float logProb(int pos, int len) { try { // create new because reversing list affects keysList List range = keysList.subList(pos, pos + len); int rangeSize = range.size(); if(rangeSize == 0) { throw new IllegalArgumentException( "empty range specified for log prob"); } // only allocate context array if we have to String[] context = (rangeSize > 1) ? new String[rangeSize - 1] : null; if(context != null) { // reverse for SRILM ListIterator contextIterator = range.listIterator(rangeSize - 1); int i = 0; while(contextIterator.hasPrevious()) { context[i++] = contextIterator.previous().toString(); } } // call SRILM to get word in reversed context return doLogProb(range.get(rangeSize - 1).toString(), context); } catch(IndexOutOfBoundsException e) { return 0.0f; } } /** * Invokes SRILM to load a language model. * @param ngramOrder The order of the language model * @param fileAbsolutePath The absolute path of the file containing the * language model. * @param lmType The language model type. * @throws IOException If a problem happens with SRILM while trying to * load the language model. */ private native void loadLM(int ngramOrder, String fileAbsolutePath, int lmType) throws IOException; /** * Invokes SRILM to calculate the log probability of a string in the * given context. SRILM will make its calculations based on the language * model loaded in {@link #loadLM(int, String, int)}. * @param word The word to calculate a probability for. * @param context The context, in reverse order. For example, to calculate * the probability of the word "rain" in the context of the * string "in the rain", the context should be the array * {the, in}. If the context is null or * zero-length, SRILM will assume this means no context should be used. * @return The log probability of the given word in the given (reversed) * context, as determined by SRILM. */ private native float doLogProb(String word, String[] context); /** * Invokes SRILM to clean up any initialized objects. */ @Override protected native void finalize() throws Throwable; public static void main(String[] args) throws Exception { String usage = "Usage: java opennlp.ccg.ngrams.SRILMNgramModel" + " (-reverse)"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); return; } long start = System.currentTimeMillis(); String order = args[0]; String lmfile = args[1]; String lmType = args[2]; String tokens = args[3]; String reversed = (args.length >= 5 && args[4].equals("-reverse")) ? 
"reversed " : ""; System.out.println("Loading " + reversed + "n-gram model with order " + order + " from: " + lmfile); SRILMNgramModel lm = new SRILMNgramModel(Integer.parseInt(order), new File(lmfile), SRILMNgramModelType.valueOf(lmType)); if (reversed.length() > 0) lm.setReverse(true); System.out.println("openVocab: " + lm.openVocab); int secs = (int) (System.currentTimeMillis() - start) / 1000; System.out.println("secs: " + secs); System.out.println(); // System.out.println("trie map: "); // System.out.println(lm.trieMapRoot.toString()); // System.out.println(); Tokenizer tokenizer = new DefaultTokenizer(); List words = tokenizer.tokenize(tokens); System.out.println("scoring: " + tokens); System.out.println(); lm.debugScore = true; lm.setWordsToScore(words, true); lm.prepareToScoreWords(); double logprob = lm.logprob(); double score = convertToProb(logprob); System.out.println(); System.out.println("score: " + score); System.out.println("logprob: " + logprob); System.out.println("ppl: " + NgramScorer.convertToPPL(logprob / (words.size()-1))); } } ================================================ FILE: src/opennlp/ccg/ngrams/SRILMNgramModelType.java ================================================ /* * $Id: SRILMNgramModelType.java,v 1.2 2007/05/30 22:53:17 coffeeblack Exp $ */ package opennlp.ccg.ngrams; /** * Used by {@link SRILMNgramModel} to specify the type of language model that * should be used. * @author Scott Martin * @see SRILM * @version $LastChangedRevision$ */ public enum SRILMNgramModelType { /*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!* * It is very important that the order of these does not * * change, as {@link SRILMNgramModel#loadLM(int, String, int)}* * relies on the ordinal. * *!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/ /** * A "standard" ngram model, of the type normally created by * the SRILM binary ngram-count. */ STANDARD, /** * For ngram models based on count LMs. The Google LM format is one of * these. * @see Web 1T 5-gram Version 1 */ COUNT; } ================================================ FILE: src/opennlp/ccg/ngrams/SRILM_FactoredScorerMaker.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.synsem.SignScorer; import java.io.*; /** * A custom scorer maker that builds and loads factored n-gram models using the * SRILM toolkit, which must be separately installed. * Most parameters are set in the FLM spec file. 
* * @author Michael White * @version $Revision: 1.6 $, $Date: 2007/12/21 05:13:37 $ */ public class SRILM_FactoredScorerMaker extends SRILM_ScorerMaker { /** The base of the model FLM spec file name. Defaults to "test.flm". */ public String flmSpecFileBase = "test.flm"; /** Returns the model FLM spec filename, extending flmSpecFileBase with the discount options string and n-gram order, eg "test.flm.n4". */ protected String flmSpecFilename() { return flmSpecFileBase + "." + discountOptionsStr + getOrder(); } /** Returns the root of the perplexities filename. */ protected String pplFileRoot() { return flmSpecFilename(); } /** Creates fold-specific FLM spec filenames from fold numbers, eg "test.flm.n4.fold1". */ protected String filename(int foldNum) { return flmSpecFilename() + "." + "fold" + foldNum; } /** Writes a fold-specific FLM spec file, given the tmp dir and fold num. The fold-specific file is created by simply replacing ".count" with ".foldN.count" and ".lm" with ".foldN.lm", where N is the fold num, in the model FLM spec file. */ protected void writeFoldSpecFile(File tmpDir, int foldNum) throws IOException { BufferedReader br = new BufferedReader(new FileReader(flmSpecFilename())); File foldSpecFile = new File(tmpDir, filename(foldNum)); PrintWriter out = new PrintWriter(new FileWriter(foldSpecFile)); String dotFoldN = ".fold" + foldNum; String line = null; while ((line = br.readLine()) != null) { int countIndex = line.indexOf(".count"); if (countIndex > 0) { String foldLine = line.substring(0, countIndex); foldLine += dotFoldN; int lmIndex = line.indexOf(".lm", countIndex); foldLine += line.substring(countIndex, lmIndex); foldLine += dotFoldN; foldLine += line.substring(lmIndex); out.println(foldLine); } else { out.println(line); } } out.close(); br.close(); } /** * Prepares a scoring model from the training data, * by exec-ing the SRILM fngram-count tool with the FLM spec file, * and computes perplexity on the test data. * The training/test data are written to foldN-train.txt and foldN-test.txt, * if not already present. * The fold-specific FLM spec file's name is determined by filename(N). */ public void prepScorer(File tmpDir, int foldNum, File trainFile, File testFile) throws IOException { // write fold spec file writeFoldSpecFile(tmpDir, foldNum); // do rest much like standard n-gram scorers super.prepScorer(tmpDir, foldNum, trainFile, testFile); } /** Writes training/test targets. */ protected void writeTargets(File tbFile, String textfile) throws IOException { if (!useSemClasses) cvr.tester.writeTargetsF(tbFile, textfile); else cvr.tester.writeTargetsFSC(tbFile, textfile); } /** Returns the command for making an ngram model. */ protected String countNgrams(int foldNum) { String cmd = "fngram-count -nonull -write-counts -lm " + ((unk) ? "-unk " : "") + "-factor-file " + filename(foldNum) + " " + "-text " + trainingfile(foldNum) + " " + "-debug " + debugLevel; return cmd; } /** Returns the command for calculating perplexity. NB: At present, only the perplexity from the first model is collected. */ protected String scoreNgrams(int foldNum) { String lmfile = filename(foldNum); String cmd2 = "fngram -nonull " + ((unk) ? "-unk " : "") + "-factor-file " + lmfile + " " + "-ppl " + testfile(foldNum); return cmd2; } /** * Loads a scoring model created from the training data. 
*/ public SignScorer loadScorer(File tmpDir, int foldNum, File trainFile) throws IOException { File foldSpecFile = new File(tmpDir, filename(foldNum)); String foldSpecPath = foldSpecFile.getCanonicalPath(); return new FactoredNgramModelFamily(foldSpecPath, useSemClasses); } } ================================================ FILE: src/opennlp/ccg/ngrams/SRILM_ScorerMaker.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.synsem.SignScorer; import opennlp.ccg.test.*; import java.io.*; import java.util.*; /** * A custom scorer maker that builds and loads standard n-gram models using the * SRILM toolkit, which must be separately installed. * This class may be subclassed to set different parameters in the constructor, * for use in cross-validation tests with the realizer. * * @author Michael White * @version $Revision: 1.10 $, $Date: 2007/12/21 05:13:37 $ */ public class SRILM_ScorerMaker implements ScorerMaker { /** Flag specifying whether to use semantic class replacement. */ public boolean useSemClasses = true; /** Flag specifying whether to keep <unk> in the LM. */ public boolean unk = true; /** String specifying the min counts to use in building the n-gram model. */ public String minCountOptions = "-gt1min 1 -gt2min 1 -gt3min 1 -gt4min 1 -gt5min 1 -gt6min 1"; /** String specifying the discounting parameters. */ public String discountOptions = N_DISCOUNT_PARAMS; /** String specifying natural discounting parameters. */ public static final String N_DISCOUNT_PARAMS = "-ndiscount1 -ndiscount2 -ndiscount3 -ndiscount4 -ndiscount5 -ndiscount6"; /** String specifying Witten-Bell discounting parameters. */ public static final String WB_DISCOUNT_PARAMS = "-wbdiscount1 -wbdiscount2 -wbdiscount3 -wbdiscount4 -wbdiscount5 -wbdiscount6"; /** String specifying modified Kneser-Ney natural discounting parameters. */ public static final String KN_DISCOUNT_PARAMS = "-kndiscount1 -kndiscount2 -kndiscount3 -kndiscount4 -kndiscount5 -kndiscount6"; /** String indicating the discounting option in filenames. */ public String discountOptionsStr = "n"; /** Debug level to use. */ public int debugLevel = 1; /** The context for this scorer maker. */ public CrossValidateRealizer cvr = null; /** Sets the context for this scorer maker. */ public void setCVR(CrossValidateRealizer cvr) { this.cvr = cvr; } /** Stores perplexities for each fold, after calls to prepScorer. */ protected List perplexities = new ArrayList(); /** Gets the n-gram order from the context, defaulting to 3. 
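* (That is, cvr.tester.ngramOrder when it is positive, otherwise 3.)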
*/ protected int getOrder() { int order = cvr.tester.ngramOrder; return (order > 0) ? order : 3; } /** Creates LM filenames from fold numbers. */ protected String filename(int foldNum) { String retval = "fold" + foldNum + "-" + discountOptionsStr; if (useSemClasses) retval += "-sc"; retval += "." + getOrder() + "bo"; return retval; } /** * Prepares a scoring model from the training data, * by exec-ing the SRILM ngram-count tool with the current options, * and computes perplexity on the test data. * The training/test data are written to foldN-train.txt and foldN-test.txt, * if not already present. * The LM file's name is determined by filename(N). */ public void prepScorer(File tmpDir, int foldNum, File trainFile, File testFile) throws IOException { // write training/test files, if not already present writeTrainingAndTestFiles(tmpDir, foldNum, trainFile, testFile); // make counting command String cmd = countNgrams(foldNum); // exec command System.out.print("Writing " + getOrder() + "-gram model: " + filename(foldNum) + "\n"); Process makeLM = Runtime.getRuntime().exec(cmd, null, tmpDir); try { InputStream istr = makeLM.getErrorStream(); //.getInputStream(); int b; while ((b = istr.read()) != -1) { System.out.print((char)b); } int exitval = makeLM.waitFor(); if (exitval != 0) { System.out.println("(f)ngram-count exitval: " + exitval); } } catch (InterruptedException exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } System.out.println(); // make scoring command, for perplexity String cmd2 = scoreNgrams(foldNum); // exec command System.out.print("Measuring perplexity with: " + filename(foldNum) + "\n"); Process measurePPL = Runtime.getRuntime().exec(cmd2, null, tmpDir); try { InputStream istr = measurePPL.getInputStream(); int b; StringBuffer sb = new StringBuffer(); while ((b = istr.read()) != -1) { System.out.print((char)b); sb.append((char)b); } int exitval = measurePPL.waitFor(); // extract perplexity following "ppl= " String pplOut = sb.toString(); int pplStart = pplOut.indexOf("ppl= ") + "ppl= ".length(); int pplEnd = pplOut.indexOf(" ", pplStart); String pplStr = pplOut.substring(pplStart, pplEnd); try { perplexities.add(new Double(pplStr)); } catch (NumberFormatException exc) { System.out.println("Warning, unable to extract perplexity from: " + pplStr); System.out.println(exc.toString()); } if (exitval != 0) { System.out.println("(f)ngram exitval: " + exitval); } } catch (InterruptedException exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } System.out.println(); } /** Writes the training and test files, if not already present. */ protected void writeTrainingAndTestFiles(File tmpDir, int foldNum, File trainFile, File testFile) throws IOException { File trainingFoldFile = new File(tmpDir, trainingfile(foldNum)); if (!trainingFoldFile.exists()) { String trainingFoldPath = trainingFoldFile.getCanonicalPath(); writeTargets(trainFile, trainingFoldPath); } File testFoldFile = new File(tmpDir, testfile(foldNum)); if (!testFoldFile.exists()) { String testFoldPath = testFoldFile.getCanonicalPath(); writeTargets(testFile, testFoldPath); } } /** Returns the name of the training file. */ protected String trainingfile(int foldNum) { return "fold" + foldNum + "-train.txt"; } /** Returns the name of the test file. */ protected String testfile(int foldNum) { return "fold" + foldNum + "-test.txt"; } /** Writes training/test targets. 
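* Delegates to the tester's writeTargets or writeTargetsSC method, depending
* on whether semantic class replacement is enabled.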
*/ protected void writeTargets(File tbFile, String textfile) throws IOException { if (!useSemClasses) cvr.tester.writeTargets(tbFile, textfile); else cvr.tester.writeTargetsSC(tbFile, textfile); } /** Returns the command for making an ngram model. */ protected String countNgrams(int foldNum) { String lmfile = filename(foldNum); String cmd = "ngram-count -order " + getOrder() + " " + ((unk) ? "-unk " : "") + minCountOptions + " " + discountOptions + " " + "-text " + trainingfile(foldNum) + " " + "-lm " + lmfile + " " + "-debug " + debugLevel; return cmd; } /** Returns the command for calculating perplexity. */ protected String scoreNgrams(int foldNum) { String lmfile = filename(foldNum); String cmd2 = "ngram -order " + getOrder() + " " + ((unk) ? "-unk " : "") + "-ppl " + testfile(foldNum) + " " + "-lm " + lmfile; return cmd2; } /** Returns the root of the perplexities filename. */ protected String pplFileRoot() { return discountOptionsStr + getOrder(); } /** * Summarizes perplexities after all calls to prepScorer. */ public void prepScorersSummary(File tmpDir) throws IOException { // summarize to sysout double sum = 0; System.out.print("Perplexities: "); for (int i = 0; i < perplexities.size(); i++) { double ppl = perplexities.get(i).doubleValue(); System.out.print(ppl + " "); sum += ppl; } System.out.println(); double avg = sum / perplexities.size(); System.out.println("Avg: " + avg); // then to xml String filename = "ppl"; if (useSemClasses) filename += "-sc"; filename += "." + pplFileRoot() + ".xml"; System.out.println("Writing perplexities: " + filename); PrintWriter pw = new PrintWriter(new FileWriter(new File(tmpDir, filename))); pw.println(""); for (int i = 0; i < perplexities.size(); i++) { pw.println(" "); } pw.println(""); pw.close(); } /** * Loads a scoring model created from the training data. */ public SignScorer loadScorer(File tmpDir, int foldNum, File trainFile) throws IOException { String lmfile = filename(foldNum); String lmPath = new File(tmpDir, lmfile).getCanonicalPath(); return new StandardNgramModel(getOrder(), lmPath, useSemClasses); } } ================================================ FILE: src/opennlp/ccg/ngrams/SelfParaphraseBiaser.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; /** * Interface for scoring models that implement a self-paraphrase bias. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2011/05/15 20:35:06 $ */ public interface SelfParaphraseBiaser { /** Sets the target strings for implementing the self-paraphrase bias. 
*/ public void setTargets(String[] targets); } ================================================ FILE: src/opennlp/ccg/ngrams/SignScorerInterpolation.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.perceptron.*; import opennlp.ccg.synsem.Sign; import opennlp.ccg.synsem.SignScorer; /** * Linear interpolation of sign scorers, some of which may be feature extractors. * * @author Michael White * @version $Revision: 1.3 $, $Date: 2011/01/15 17:52:59 $ */ public class SignScorerInterpolation implements SignScorer, FeatureExtractor { /** The component models. */ protected SignScorer[] models; /** The weights. */ protected double weights[]; /** The composed feature extractor. */ protected ComposedFeatureExtractor composedFeatureExtractor; /** * Constructor with component models, which are given uniform weights. */ public SignScorerInterpolation(SignScorer[] models) { this.models = models; this.weights = new double[models.length]; for (int i = 0; i < models.length; i++) { weights[i] = 1.0 / models.length; } } /** * Constructor with component models and weights. * The weights are assumed to sum to 1, * and the number of weights is assumed to match the number of models. */ public SignScorerInterpolation(SignScorer[] models, double[] weights) { this.models = models; this.weights = weights; this.composedFeatureExtractor = new ComposedFeatureExtractor(models); } /** * Returns a score between 0 (worst) and 1 (best) for the given sign * and completeness flag, as the interpolation of the scores assigned * by the component models. * In particular, returns the linear combination using the established weights * of the scores given by the component models. */ public double score(Sign sign, boolean complete) { double retval = 0; for (int i = 0; i < models.length; i++) { retval += models[i].score(sign, complete) * weights[i]; } return retval; } /** Sets the alphabet for the component feature extractors. */ public void setAlphabet(Alphabet alphabet) { composedFeatureExtractor.setAlphabet(alphabet); } /** Returns the features for the given sign and completeness flag from the component feature extractors. 
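* <p>
* A hedged construction sketch (the model file name is hypothetical and
* exception handling is omitted): a trigram model interpolated with a
* repetition scorer, weighted 0.8 and 0.2, could be set up as
* <pre>
*   SignScorer[] models = { new StandardNgramModel(3, "lm.3bo"), new RepetitionScorer() };
*   SignScorer scorer = new SignScorerInterpolation(models, new double[] { 0.8, 0.2 });
* </pre>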
*/ public FeatureVector extractFeatures(Sign sign, boolean complete) { return composedFeatureExtractor.extractFeatures(sign, complete); } } ================================================ FILE: src/opennlp/ccg/ngrams/SignScorerProduct.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import opennlp.ccg.perceptron.*; import opennlp.ccg.synsem.Sign; import opennlp.ccg.synsem.SignScorer; // import java.util.*; /** * Product of sign scorers, some of which may be feature extractors.. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2011/01/15 17:52:59 $ */ public class SignScorerProduct implements SignScorer, FeatureExtractor { /** The component models. */ protected SignScorer[] models; /** The composed feature extractor. */ protected ComposedFeatureExtractor composedFeatureExtractor; /** * Constructor with component models. */ public SignScorerProduct(SignScorer[] models) { this.models = models; this.composedFeatureExtractor = new ComposedFeatureExtractor(models); } /** * Returns a score between 0 (worst) and 1 (best) for the given sign * and completeness flag, as the product of the scores assigned * by the component models. */ public double score(Sign sign, boolean complete) { double retval = 1.0; for (int i = 0; i < models.length; i++) { retval *= models[i].score(sign, complete); } return retval; } /** Sets the alphabet for the component feature extractors. */ public void setAlphabet(Alphabet alphabet) { composedFeatureExtractor.setAlphabet(alphabet); } /** Returns the features for the given sign and completeness flag from the component feature extractors. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { return composedFeatureExtractor.extractFeatures(sign, complete); } } ================================================ FILE: src/opennlp/ccg/ngrams/StandardNgramModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.ngrams; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StreamTokenizer; import java.util.ArrayList; import java.util.List; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Tokenizer; import opennlp.ccg.lexicon.Word; import opennlp.ccg.util.TrieMap; /** * A scorer for a standard n-gram backoff model. * Unknown words are mapped to <unk> if the latter is present in * the model. * * @author Michael White * @version $Revision: 1.19 $, $Date: 2011/10/11 03:29:42 $ */ public class StandardNgramModel extends AbstractStandardNgramModel { /** * Loads an n-gram model of the given order in ARPA (Doug Paul) format from * the given reader, with the given flag controlling whether words are * replaced by their semantic classes. */ public StandardNgramModel(int order, Reader in, boolean useSemClasses) throws IOException { super(order, useSemClasses); this.numNgrams = new int[order]; readModel(in); } /** * Loads an n-gram model of the given order in ARPA (Doug Paul) format from * the given reader. Words are not replaced by their semantic classes. */ public StandardNgramModel(int order, Reader in) throws IOException { this(order, in, false); } /** * Loads an n-gram model of the given order in ARPA (Doug Paul) format from * the given file, with the given flag controlling whether words are * replaced by their semantic classes. */ public StandardNgramModel(int order, String filename, boolean useSemClasses) throws IOException { this(order, new BufferedReader(new FileReader(filename)), useSemClasses); } /** * Loads an n-gram model of the given order in ARPA (Doug Paul) format from * the given file. Words are not replaced by their semantic classes. */ public StandardNgramModel(int order, String filename) throws IOException { this(order, filename, false); } // reads in model private void readModel(Reader in) throws IOException { // setup //Tokenizer wordTokenizer = (Grammar.theGrammar != null) // ? 
Grammar.theGrammar.lexicon.tokenizer // : new DefaultTokenizer(); StreamTokenizer tokenizer = initTokenizer(in); String[] tokens = new String[order+2]; boolean foundData = false; int currentOrder = 0; List currentPrefix = new ArrayList(); List currentKeys = null; List> currentChildren = null; // loop through lines while (tokenizer.ttype != StreamTokenizer.TT_EOF) { // read line into tokens readLine(tokenizer, tokens); // check for blank line if (tokens[0] == null) continue; // check for initial delimiter if (tokens[0].equals("\\data\\")) { foundData = true; continue; } if (!foundData) continue; // read header line if (tokens[0].equals("ngram")) { int n = Integer.parseInt(tokens[1].substring(0,1)); int total = Integer.parseInt(tokens[1].substring(2)); if (n > order) continue; numNgrams[n-1] = total; // init children, keys lists if (currentChildren == null) { currentChildren = new ArrayList>(total); currentKeys = new ArrayList(total); } // calc totals (not actually used anymore) if (n == order) { @SuppressWarnings("unused") int totalNgrams = 0; for (int i = 0; i < order; i++) { totalNgrams += numNgrams[i]; } // System.out.println("totalNgrams: " + totalNgrams); } continue; } // check for final delimiter if (tokens[0].equals("\\end\\")) { // add current children addTrieMapChildren(currentPrefix, currentKeys, currentChildren); break; } // read line starting new order if (tokens[0].equals("\\" + (currentOrder+1) + "-grams:")) { // add current children addTrieMapChildren(currentPrefix, currentKeys, currentChildren); // System.out.println(tokens[0]); currentOrder++; continue; } if (currentOrder == 0) continue; if (currentOrder > order) break; // read logprob float logprob = Float.parseFloat(tokens[0]); // read back-off weight (except with last order) float bow = 0; if (currentOrder < order && tokens[currentOrder+1] != null) { bow = Float.parseFloat(tokens[currentOrder+1]); } // intern string tokens for (int i = 1; i < currentOrder+1; i++) { tokens[i] = tokens[i].intern(); } // check prefix boolean samePrefix = (currentPrefix.size() == currentOrder-1); for (int i = 1; samePrefix && i < currentOrder; i++) { if (tokens[i] != currentPrefix.get(i-1)) samePrefix = false; } // if changed, add current children, reset prefix if (!samePrefix) { addTrieMapChildren(currentPrefix, currentKeys, currentChildren); for (int i = 1; i < currentOrder; i++) { currentPrefix.add(tokens[i]); } } String key = tokens[currentOrder]; currentKeys.add(key); currentChildren.add(new TrieMap(new NgramFloats(logprob, bow))); } // set openVocab according to presence of openVocab = (trieMapRoot.getChild("") != null); } /** Test loading and scoring. */ // NB: This produces the same scores as the SRILM ngram tool when both // and tags are used. public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.ngrams.StandardNgramModel (-reverse)"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } long start = System.currentTimeMillis(); String order = args[0]; String lmfile = args[1]; String tokens = args[2]; String reversed = (args.length >= 4 && args[3].equals("-reverse")) ? 
"reversed " : ""; System.out.println("Loading " + reversed + "n-gram model with order " + order + " from: " + lmfile); StandardNgramModel lm = new StandardNgramModel(Integer.parseInt(order), lmfile); if (reversed.length() > 0) lm.setReverse(true); System.out.println("openVocab: " + lm.openVocab); int secs = (int) (System.currentTimeMillis() - start) / 1000; System.out.println("secs: " + secs); System.out.println(); // System.out.println("trie map: "); // System.out.println(lm.trieMapRoot.toString()); // System.out.println(); Tokenizer tokenizer = new DefaultTokenizer(); List words = tokenizer.tokenize(tokens); System.out.println("scoring: " + tokens); System.out.println(); lm.debugScore = true; lm.setWordsToScore(words, true); lm.prepareToScoreWords(); double logprob = lm.logprob(); double score = convertToProb(logprob); System.out.println(); System.out.println("score: " + score); System.out.println("logprob: " + logprob); System.out.println("ppl: " + NgramScorer.convertToPPL(logprob / (words.size()-1))); } } ================================================ FILE: src/opennlp/ccg/ngrams/kenlm/MurmurHash.java ================================================ package opennlp.ccg.ngrams.kenlm; import java.io.UnsupportedEncodingException; /** * MurmurHash 2.0. * * The murmur hash is a relative fast hash function from http://murmurhash.googlepages.com/ for * platforms with efficient multiplication. * * This is a re-implementation of the original C code plus some additional features. * * Public domain. * * @author Viliam Holub * @version 1.0.2 * */ public final class MurmurHash { private final static String ENCODING = "UTF-16"; /** * Generates 32 bit hash from byte array of the given length and seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 32 bit hash of the given array */ public static int hash32(final byte[] data, int length, int seed) { // 'm' and 'r' are mixing constants generated offline. // They're not really 'magic', they just happen to work well. final int m = 0x5bd1e995; final int r = 24; // Initialize the hash to a random value int h = seed ^ length; int length4 = length / 4; for (int i = 0; i < length4; i++) { final int i4 = i * 4; int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8) + ((data[i4 + 2] & 0xff) << 16) + ((data[i4 + 3] & 0xff) << 24); k *= m; k ^= k >>> r; k *= m; h *= m; h ^= k; } // Handle the last few bytes of the input array switch (length % 4) { case 3: h ^= (data[(length & ~3) + 2] & 0xff) << 16; case 2: h ^= (data[(length & ~3) + 1] & 0xff) << 8; case 1: h ^= (data[length & ~3] & 0xff); h *= m; } h ^= h >>> 13; h *= m; h ^= h >>> 15; return h; } /** * Generates 32 bit hash from byte array with default seed value. * * @param data byte array to hash * @param length length of the array to hash * @return 32 bit hash of the given array */ public static int hash32(final byte[] data, int length) { return hash32(data, length, 0x9747b28c); } /** * Generates 32 bit hash from a string. * * @param text string to hash * @return 32 bit hash of the given string * @throws UnsupportedEncodingException */ public static int hash32(final String text) throws UnsupportedEncodingException { final byte[] bytes = text.getBytes(ENCODING); return hash32(bytes, bytes.length); } /** * Generates 32 bit hash from a substring. 
* * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 32 bit hash of the given string * @throws UnsupportedEncodingException */ public static int hash32(final String text, int from, int length) throws UnsupportedEncodingException { return hash32(text.substring(from, from + length)); } /** * Generates 64 bit hash from byte array of the given length and seed. * * @param data byte array to hash * @param length length of the array to hash * @param seed initial seed value * @return 64 bit hash of the given array */ public static long hash64(final byte[] data, int length, int seed) { final long m = 0xc6a4a7935bd1e995L; final int r = 47; long h = (seed & 0xffffffffl) ^ (length * m); int length8 = length / 8; for (int i = 0; i < length8; i++) { final int i8 = i * 8; long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8) + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24) + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40) + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56); k *= m; k ^= k >>> r; k *= m; h ^= k; h *= m; } switch (length % 8) { case 7: h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48; case 6: h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40; case 5: h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32; case 4: h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24; case 3: h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16; case 2: h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8; case 1: h ^= (long) (data[length & ~7] & 0xff); h *= m; }; h ^= h >>> r; h *= m; h ^= h >>> r; return h; } /** * Generates 64 bit hash from byte array with default seed value. * * @param data byte array to hash * @param length length of the array to hash * @return 64 bit hash of the given string */ public static long hash64(final byte[] data, int length) { return hash64(data, length, 0xe17a1465); } /** * Generates 64 bit hash from a string. * * @param text string to hash * @return 64 bit hash of the given string * @throws UnsupportedEncodingException */ public static long hash64(final String text) throws UnsupportedEncodingException { byte[] bytes; bytes = text.getBytes(ENCODING); return hash64(bytes, bytes.length); } /** * Generates 64 bit hash from a substring. * * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 64 bit hash of the given array * @throws UnsupportedEncodingException */ public static long hash64(final String text, int from, int length) throws UnsupportedEncodingException { return hash64(text.substring(from, from + length)); } } ================================================ FILE: src/opennlp/ccg/ngrams/kenlm/jni/KenLM.java ================================================ package opennlp.ccg.ngrams.kenlm.jni; // TODO(Joshua devs): include my state object with your LM state then // update this API to pass state instead of int[]. public class KenLM { /** * Load the binary, platform-dependent library containing the KenLM JNI * bridge code. * @throws UnsatisfiedLinkError If Java can't find the 'ken' library. 
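* The library is resolved via java.library.path, so the JVM typically needs
* to be started with an option along the lines of -Djava.library.path=DIR,
* where DIR is the (installation-dependent) directory holding the built
* 'ken' JNI bridge.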
*/ static { System.loadLibrary("ken"); } private final long pointer; // this is read from the config file, used to set maximum order private final int ngramOrder; // inferred from model file (may be larger than ngramOrder) private final int N; private final static native long construct(String file_name, float fake_oov); private final static native void destroy(long ptr); private final static native int order(long ptr); private final static native boolean registerWord(long ptr, String word, int id); private final static native float prob(long ptr, int words[]); private final static native float probString(long ptr, int words[], int start); public KenLM(int order, String file_name) { float lm_ceiling_cost = 99.0f; ngramOrder = order; pointer = construct(file_name, -lm_ceiling_cost); N = order(pointer); } public void destroy() { destroy(pointer); } public int getOrder() { return N; } public boolean registerWord(String word, int id) { return registerWord(pointer, word, id); } public float prob(int words[]) { return prob(pointer, words); } } ================================================ FILE: src/opennlp/ccg/parse/Chart.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-10 Jason Baldridge, Gann Bierner and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import gnu.trove.*; import java.io.*; import java.util.*; /** * An implementation of the table (or chart) used for chart parsers like CKY. * Special functions are provided for combining cells of the chart into another * cell. Time or edge or cell limits can be placed on initial chart construction. * A pruning value applies to unpacking, which also limits the number of equivalent * edges kept during chart construction. 
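* <p>
* A hedged sketch of the intended fill pattern (not the actual parser loop):
* lexical signs are added with insert(x, y, sign), unary rules are applied
* within a cell via insertCell(x, y), and two cells are combined into a third
* with insertCell(x1, y1, x2, y2, x3, y3); setPruneVal, setCellLimit,
* setEdgeLimit and setTimeLimit bound the work done during construction and
* unpacking.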
* * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.41 $, $Date: 2011/11/16 03:25:27 $ */ public class Chart { // maps edges to representative edges, according to their headwords and cats, sans LFs // NB: using unfilled dependencies in equiv relation appears to unacceptably slow down parsing, // with a significant drop in complete parses @SuppressWarnings("unchecked") private static Map createEdgeMap() { return new THashMap(11, representativeEdgeStrategy); } private static TObjectHashingStrategy representativeEdgeStrategy = new TObjectHashingStrategy() { private static final long serialVersionUID = 1L; public int computeHashCode(Object o) { Sign sign = ((Edge)o).sign; int headpos = Edge.getEdge(sign.getLexHead()).wordPos; return 31*headpos + sign.getCategory().hashCodeNoLF(); //return 31*headpos + sign.getCategory().hashCodeNoLF() + 17*sign.getUnfilledDeps().hashCode(); } public boolean equals(Object o1, Object o2) { if (!(o1 instanceof Edge) || !(o2 instanceof Edge)) return false; Sign sign1 = ((Edge)o1).sign; Sign sign2 = ((Edge)o2).sign; return Edge.getEdge(sign1.getLexHead()).wordPos == Edge.getEdge(sign2.getLexHead()).wordPos && sign1.getCategory().equalsNoLF(sign2.getCategory()); //&& sign1.getUnfilledDeps().equals(sign2.getUnfilledDeps()); } }; // a cell pairs a sorted list with an edge map private class Cell implements Serializable { private static final long serialVersionUID = 1L; final List list = new ArrayList(); final Map map = createEdgeMap(); int size() { return list.size(); } Edge get(Edge edge) { return map.get(edge); } // add edge, preserving cell limit; return true iff given edge added boolean add(Edge edge) { if (map.containsKey(edge)) return false; return addEdgeSorted(edge, list, map, _cellLimit); } List getSignsSorted() { List retval = new ArrayList(list.size()); for (Edge e : list) retval.add(e.sign); return retval; } SignHash getSigns() { SignHash retval = new SignHash(); for (Edge e : list) retval.insert(e.sign); return retval; } }; // adds edge to sorted list and optional map, preserving limit; returns true iff edge added // nb: all lexical edges kept private boolean addEdgeSorted(Edge edge, List list, Map map, int limit) { int index = Collections.binarySearch(list, edge, edgeComparator); // convert index to insertion point index = Math.abs(index) - 1; // if somehow negative, use last position if (index < 0) index = list.size(); // check if last and at limit boolean limitActive = limit > 0 && !edge.sign.isLexical(); if (limitActive && index >= limit) return false; // otherwise add edge list.add(index, edge); if (map != null) map.put(edge, edge); // remove last if over limit if (limitActive && list.size() > limit) { Edge last = list.remove(list.size()-1); if (map != null) map.remove(last); } return true; } /** Compares edges based on their relative score, in descending order, then their signs. */ public static final Comparator edgeComparator = new Comparator() { public int compare(Edge edge1, Edge edge2) { if (edge1.score != edge2.score) return -1 * Double.compare(edge1.score, edge2.score); else return SignHash.compareTo(edge1.sign, edge2.sign); } }; /** The chart. */ protected Cell[][] _table; /** Its size. */ protected int _size; /** The count of edges created before unpacking. */ protected int _numEdges = 0; /** The count of edges created while unpacking. */ protected int _numUnpackingEdges = 0; /** The max cell size before unpacking. */ protected int _maxCellSize = 0; /** The rules. 
*/ protected RuleGroup _rules; /** The sign scorer (defaults to the null scorer). */ protected SignScorer _signScorer = SignScorer.nullScorer; /** The "n" for n-best pruning (or 0 if none). */ protected int _pruneVal = 0; /** The time limit (0 if none). */ protected int _timeLimit = 0; /** The start time. */ protected long _startTime = 0; /** The edge limit (0 if none). */ protected int _edgeLimit = 0; /** The cell limit on non-lexical edges (0 if none). */ protected int _cellLimit = 0; /** Constructor. */ public Chart(int s, RuleGroup _R) { _rules = _R; _size = s; _table = new Cell[_size][_size]; } /** Sets the sign scorer. */ public void setSignScorer(SignScorer signScorer) { _signScorer = signScorer; } /** Sets the n-best pruning val. */ public void setPruneVal(int n) { _pruneVal = n; } /** Sets the time limit. */ public void setTimeLimit(int timeLimit) { _timeLimit = timeLimit; } /** Sets the start time. */ public void setStartTime(long startTime) { _startTime = startTime; } /** Sets the edge limit. */ public void setEdgeLimit(int edgeLimit) { _edgeLimit = edgeLimit; } /** Sets the cell limit on non-lexical edges. */ public void setCellLimit(int cellLimit) { _cellLimit = cellLimit; } /** Returns the edge count prior to unpacking. */ public int edgeCount() { return _numEdges; } /** Returns the edge count while unpacking. */ public int unpackingEdgeCount() { return _numUnpackingEdges; } /** Returns the max cell size prior to unpacking. */ public int maxCellSize() { return _maxCellSize; } //----------------------------------------------------------- // Chart construction /** * Inserts a sign at the given cell (modulo pruning). * Returns true if an edge for the sign is added as a new equiv class. */ public boolean insert(int x, int y, Sign w) { Cell cell = get(x, y); boolean retval = false; // make edge Edge edge = new Edge(w); if (w.isLexical()) edge.setWordPos(x); // get representative edge Edge rep = cell.get(edge); // if none, add as representative if (rep == null) { edge.initAltEdges(); retval = cell.add(edge); } // otherwise add as an alternative else { addEdgeSorted(edge, rep.altEdges, null, _pruneVal); } // update edge count, max cell size _numEdges++; if (cell.size() > _maxCellSize) _maxCellSize = cell.size(); // done return retval; } /** Returns the given cell (ensuring non-null). */ protected Cell get(int x, int y) { if (_table[x][y] == null) _table[x][y] = new Cell(); return _table[x][y]; } /** Returns the signs for a given cell (ensuring non-null). */ protected SignHash getSigns(int x, int y) { Cell cell = get(x, y); return cell.getSigns(); } /** Inserts edges into (x,y) that result from applying unary rules to those already in (x,y). 
* @throws ParseException */ protected void insertCell(int x, int y) throws ParseException { if (_table[x][y] == null) return; List inputs = _table[x][y].getSignsSorted(); List nextInputs = new ArrayList(inputs.size()); // repeat until no more inputs while (inputs.size() > 0) { // apply rules for (Sign sign : inputs) { checkLimits(); List results = _rules.applyUnaryRules(sign); for (Sign result : results) { // check for unary rule cycle; skip result if found if (!result.getDerivationHistory().containsCycle()) { // insert result boolean newEdgeClass = insert(x, y, result); // add to next inputs if it yielded a new equiv class if (newEdgeClass) nextInputs.add(result); } } } // move all results to inputs inputs.clear(); inputs.addAll(nextInputs); nextInputs.clear(); } } /** Inserts edges into (x3,y3) resulting from combining those in (x1,y1) and (x2,y2). * @throws ParseException */ protected void insertCell(int x1, int y1, int x2, int y2, int x3, int y3) throws ParseException { if (_table[x1][y1] == null) return; if (_table[x2][y2] == null) return; List inputs1 = _table[x1][y1].getSignsSorted(); List inputs2 = _table[x2][y2].getSignsSorted(); for (Sign sign1 : inputs1) { for (Sign sign2 : inputs2) { checkLimits(); List results = _rules.applyBinaryRules(sign1, sign2); for (Sign result : results) insert(x3, y3, result); } } } /** * Inserts fragmentary edges into (x3,y3), if non-empty, resulting from combining * those in (x1,y1) and (x2,y2) using the glue rule. * @throws ParseException */ protected void insertCellFrag(int x1, int y1, int x2, int y2, int x3, int y3) throws ParseException { if (_table[x1][y1] == null) return; if (_table[x2][y2] == null) return; if (!cellIsEmpty(x3, y3)) return; List inputs1 = _table[x1][y1].getSignsSorted(); List inputs2 = _table[x2][y2].getSignsSorted(); for (Sign sign1 : inputs1) { for (Sign sign2 : inputs2) { checkLimits(); List results = _rules.applyGlueRule(sign1, sign2); for (Sign result : results) insert(x3, y3, result); } } } // check edge and time limit private void checkLimits() throws ParseException { if (_edgeLimit > 0 && _numEdges > _edgeLimit) { throw new ParseException(ParseException.EDGE_LIMIT_EXCEEDED); } if (_timeLimit > 0) { int timeSoFar = (int) (System.currentTimeMillis() - _startTime); if (timeSoFar > _timeLimit) { throw new ParseException(ParseException.TIME_LIMIT_EXCEEDED); } } } /** Returns whether the given cell is empty. */ public boolean cellIsEmpty(int x, int y) { Cell cell = get(x, y); return cell.list.isEmpty(); } //----------------------------------------------------------- // Unpacking /** Unpacks the edges in the given cell as an n-best list. 
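 * The returned list is sorted by score and, when a pruning value is set,
 * limited to the n best edges. For instance (a schematic fragment only,
 * assuming a filled chart):
 * <pre>{@code
 * List<Edge> nbest = chart.unpack(0, size - 1);   // unpack the top cell
 * for (Edge e : nbest)
 *     System.out.println(e.getScore() + "\t" + e.getSign());
 * }</pre>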
*/ public List unpack(int x, int y) { Cell cell = get(x, y); // recursively unpack each edge @SuppressWarnings("unchecked") Set unpacked = new THashSet(new TObjectIdentityHashingStrategy()); @SuppressWarnings("unchecked") Set startedUnpacking = new THashSet(new TObjectIdentityHashingStrategy()); for (Edge edge : cell.list) unpack(edge, unpacked, startedUnpacking); // collect and sort results EdgeHash merged = new EdgeHash(); for (Edge edge : cell.list) { merged.addAll(edge.altEdges); } List retval = new ArrayList(merged.asEdgeSet()); Collections.sort(retval, edgeComparator); // prune if (_pruneVal > 0) { while (retval.size() > _pruneVal) retval.remove(retval.size()-1); } // restore alts for (Edge edge : cell.list) edge.restoreAltEdges(); // return return retval; } // recursively unpack edge, unless already visited private void unpack(Edge edge, Set unpacked, Set startedUnpacking) { // check visited if (unpacked.contains(edge)) return; if (startedUnpacking.contains(edge)) { System.err.println("Warning, revisiting edge before unpacking complete: " + edge); System.err.println(edge.sign.getDerivationHistory().toString()); return; } startedUnpacking.add(edge); // OR: recursively unpack alts, merging resulting alts EdgeHash merged = new EdgeHash(); for (Edge alt : edge.altEdges) { // AND: unpack inputs, make alts, add to merged unpackAlt(alt, unpacked, startedUnpacking, merged); } // score boolean complete = (edge.sign.getWords().size() == _size); for (Edge m : merged.asEdgeSet()) { m.setScore(_signScorer.score(m.sign, complete)); } // sort List mergedList = new ArrayList(merged.asEdgeSet()); Collections.sort(mergedList, edgeComparator); // prune if (_pruneVal > 0) { while (mergedList.size() > _pruneVal) mergedList.remove(mergedList.size()-1); } // replace edge's alts edge.replaceAltEdges(mergedList); // add to unpacked set unpacked.add(edge); } // recursively unpack inputs, make alt combos and add to merged private void unpackAlt(Edge alt, Set unpacked, Set startedUnpacking, EdgeHash merged) { // unpack via input signs DerivationHistory history = alt.sign.getDerivationHistory(); Sign[] inputSigns = history.getInputs(); // base case: no inputs if (inputSigns == null) { merged.insert(alt); return; } // otherwise recursively unpack Edge[] inputEdges = new Edge[inputSigns.length]; for (int i = 0; i < inputSigns.length; i++) { inputEdges[i] = Edge.getEdge(inputSigns[i]); unpack(inputEdges[i], unpacked, startedUnpacking); } // then make edges for new combos, using same rule, and add to merged (if unseen) Rule rule = history.getRule(); List altCombos = inputCombos(inputEdges, 0); List results = new ArrayList(1); for (Sign[] combo : altCombos) { // use this alt for same combo if (sameSigns(inputSigns, combo)) { merged.insert(alt); continue; } results.clear(); ((AbstractRule)rule).applyRule(combo, results); // TODO: bypass rule app for efficiency? (requires doing something about var subst) if (results.isEmpty()) continue; // (rare?) 
Sign sign = results.get(0); // assuming single result merged.insert(new Edge(sign)); // make edge for new alt _numUnpackingEdges++; } } // returns a list of sign arrays, with each array of length inputEdges.length - i, // representing all combinations of alt signs from i onwards private List inputCombos(Edge[] inputEdges, int index) { Edge edge = inputEdges[index]; // base case, inputEdges[last] if (index == inputEdges.length-1) { List altEdges = edge.altEdges; List retval = new ArrayList(altEdges.size()); for (Edge alt : altEdges) { retval.add(new Sign[] { alt.sign }); } return retval; } // otherwise recurse on index+1 List nextCombos = inputCombos(inputEdges, index+1); // and make new combos List altEdges = edge.altEdges; List retval = new ArrayList(altEdges.size() * nextCombos.size()); for (Edge alt : altEdges) { for (int i = 0; i < nextCombos.size(); i++) { Sign[] nextSigns = nextCombos.get(i); Sign[] newCombo = new Sign[nextSigns.length+1]; newCombo[0] = alt.sign; System.arraycopy(nextSigns, 0, newCombo, 1, nextSigns.length); retval.add(newCombo); } } return retval; } // checks for same signs private boolean sameSigns(Sign[] a, Sign[] b) { if (a.length != b.length) return false; for (int i=0; i < a.length; i++) if (a[i] != b[i]) return false; return true; } //----------------------------------------------------------- // Lazy Unpacking /** * Lazily unpacks the edges in the given cell as an n-best list * using a variant of "cube pruning". The algorithm essentially * follows Algorithm 2 of Huang and Chiang (2005), with checking * for spurious ambiguity. */ @SuppressWarnings("unchecked") public List lazyUnpack(int x, int y) { // if no pruning value set, use basic unpacking algorithm if (_pruneVal <= 0) return unpack(x, y); // recursively sort edge alts Cell cell = get(x, y); // make top-level candidate list and derivs map List topcands = new ArrayList(_pruneVal); Map> derivsmap = new THashMap(new TObjectIdentityHashingStrategy()); for (Edge edge : cell.list) { List cands = getCandidates(edge, derivsmap); topcands.addAll(cands); } sortAndPrune(topcands); // NB: no single edge for top cell, so must treat it as a special case of findKBest List retval = new ArrayList(_pruneVal); EdgeHash merged = new EdgeHash(); while (merged.size() < _pruneVal && !topcands.isEmpty()) { appendNext(topcands, merged, derivsmap); } retval.addAll(merged.asEdgeSet()); // rescore edges if apropos if (_signScorer instanceof ReRankingScorer) { ReRankingScorer rescorer = (ReRankingScorer) _signScorer; rescorer.setFullModel(true); for (Edge e : retval) { e.score = rescorer.score(e.sign, true); } rescorer.setFullModel(false); } Collections.sort(retval, edgeComparator); // done return retval; } // lazily find k-best derivations, if edge not already visited private void findKBest(Edge edge, Map> derivsmap) { if (derivsmap.containsKey(edge)) return; List cands = getCandidates(edge, derivsmap); EdgeHash merged = new EdgeHash(); while (merged.size() < _pruneVal && !cands.isEmpty()) { appendNext(cands, merged, derivsmap); } List derivs = new ArrayList(_pruneVal); derivs.addAll(merged.asEdgeSet()); Collections.sort(derivs, edgeComparator); derivsmap.put(edge, derivs); } // appends next candidate, expands frontier private void appendNext(List cands, EdgeHash merged, Map> derivsmap) { // append next Candidate cand = cands.remove(0); merged.add(cand.edge); // check for lex cand if (cand.indices == null) return; // enumerate frontier for (int i=0; i < cand.indices.length; i++) { // inc nextIndices at i int[] nextIndices = new 
int[cand.indices.length]; for (int m=0; m < nextIndices.length; m++) nextIndices[m] = cand.indices[m]; nextIndices[i]++; Edge next = getEdgeForIndices(cand.edge, cand.inputReps, nextIndices, derivsmap); // add next candidate, if any, if not already there if (next != null) { Candidate nextCand = new Candidate(next, cand.inputReps, nextIndices); if (!cands.contains(nextCand)) { int index = Collections.binarySearch(cands, nextCand); index = Math.abs(index) - 1; // convert index to insertion point if (index >= 0) cands.add(index, nextCand); else cands.add(nextCand); } } } } // candidate is an edge plus an array of indices for keeping track of // where to pull candidates from next (or null if lexical), // using the input representatives private static class Candidate implements Comparable { Edge edge; Edge[] inputReps; int[] indices; Candidate(Edge edge, Edge[] inputReps, int[] indices) { this.edge = edge; this.inputReps = inputReps; this.indices = indices; } public int compareTo(Candidate c) { int retval = edgeComparator.compare(edge, c.edge); if (retval != 0) return retval; if (indices == null && c.indices == null) return 0; if (indices == null && c.indices != null) return -1; if (indices != null && c.indices == null) return 1; if (indices.length < c.indices.length) return -1; if (indices.length > c.indices.length) return 1; for (int i=0; i < indices.length; i++) { if (indices[i] < c.indices[i]) return -1; if (indices[i] > c.indices[i]) return 1; } return 0; } public boolean equals(Object o) { if (!(o instanceof Candidate)) return false; Candidate c = (Candidate)o; if (indices != null && c.indices == null) return false; if (indices == null && c.indices != null) return false; if (indices != null && c.indices != null) { if (indices.length != c.indices.length) return false; for (int i=0; i < indices.length; i++) { if (indices[i] != c.indices[i]) return false; } } return edge.equals(c.edge); } } // get candidates for unpacking an edge private List getCandidates(Edge edge, Map> derivsmap) { List retval = new ArrayList(_pruneVal); // make initial candidate for each alt // nb: should only get initial candidates for representative edges, // but may as well ensure that at least this edge is included List alts = new ArrayList(edge.getAltEdges()); if (alts.isEmpty()) alts.add(edge); for (Edge alt : alts) { Sign[] inputs = alt.sign.getDerivationHistory().getInputs(); // lex case: no indices if (inputs == null) { retval.add(new Candidate(alt, null, null)); continue; } // otherwise get edge for best inputs Edge[] inputReps = new Edge[inputs.length]; int[] indices = new int[inputs.length]; for (int i=0; i < inputs.length; i++) { inputReps[i] = Edge.getEdge(inputs[i]); indices[i] = 0; } Edge e = getEdgeForIndices(alt, inputReps, indices, derivsmap); if (e != null) { retval.add(new Candidate(e, inputReps, indices)); } } // sort and prune sortAndPrune(retval); // done return retval; } // returns the edge for the given input indices, or null if none private Edge getEdgeForIndices(Edge edge, Edge[] inputReps, int[] indices, Map> derivsmap) { DerivationHistory history = edge.sign.getDerivationHistory(); Sign[] combo = new Sign[inputReps.length]; for (int i = 0; i < inputReps.length; i++) { Edge inputEdge = inputReps[i]; // recurse findKBest(inputEdge, derivsmap); // get derivs List inputDerivs = derivsmap.get(inputEdge); // check index, return null if out of bounds if (indices[i] < inputDerivs.size()) combo[i] = inputDerivs.get(indices[i]).sign; else return null; } // return edge if combo is same as input signs 
Sign[] inputSigns = history.getInputs(); if (sameSigns(inputSigns, combo)) return edge; // otherwise return new edge for combo Rule rule = history.getRule(); List results = new ArrayList(1); ((AbstractRule)rule).applyRule(combo, results); // TODO: bypass rule app for efficiency? (requires doing something about var subst) if (results.isEmpty()) return null; // (rare?) Sign sign = results.get(0); // assuming single result Edge retval = new Edge(sign); // make edge for new combo _numUnpackingEdges++; // score it boolean complete = (sign.getWords().size() == _size); retval.setScore(_signScorer.score(sign, complete)); // done return retval; } // sort and prune candidate list private void sortAndPrune(List cands) { Collections.sort(cands); while (cands.size() > _pruneVal) cands.remove(cands.size()-1); } //----------------------------------------------------------- /** Saves the chart entries to the given file. */ public void saveChartEntries(File file) throws IOException { ObjectOutputStream out = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(file))); out.writeObject(_table); out.flush(); out.close(); } /** Loads the chart entries from the given file. */ public void loadChartEntries(File file) throws IOException { ObjectInputStream in = new ObjectInputStream(new BufferedInputStream(new FileInputStream(file))); try { // read entries _table = (Cell[][]) in.readObject(); // restore size, unpacking edge count _size = _table.length; _numUnpackingEdges = 0; } catch (ClassNotFoundException e) { in.close(); throw (RuntimeException) new RuntimeException().initCause(e); } in.close(); } //----------------------------------------------------------- /** Returns the number of entries in each cell in the chart. */ public String toString() { StringBuffer sb = new StringBuffer(); for (int i = 0; i < _size; i++) { for (int j = 0; j < _size; j++) { sb.append(get(i, j).size()).append('\t'); } sb.append('\n'); } return sb.toString(); } /** Prints the signs in the chart to System.out. 
*/ public void printChart() { int[] sizes = new int[_size]; int rows = 0; for (int i = 0; i < _size; i++) { for (int j = i; j < _size; j++) if (get(i, j).size() > sizes[i]) sizes[i] = get(i, j).size(); rows += sizes[i]; } String[][] toprint = new String[rows][_size]; String[] words = new String[_size]; int maxwidth = 0; for (int i = 0, row = 0; i < _size; row += sizes[i++]) { for (int j = 0; j < _size; j++) for (int s = 0; s < sizes[i]; s++) { SignHash cell = getSigns(i, j); if (i == j) words[i] = cell.asSignSet().iterator().next().getOrthography(); if (cell.size() >= s + 1) { toprint[row + s][j] = ((Sign) cell.toArray()[s]) .getCategory().toString(); if (toprint[row + s][j].length() > maxwidth) maxwidth = toprint[row + s][j].length(); } } } int fullwidth = _size * (maxwidth + 3) - 1; System.out.print(" "); for (String w : words) { System.out.print(w); int pad = (maxwidth + 3) - w.length(); for (int p = 0; p < pad; p++) System.out.print(" "); } System.out.print("|"); System.out.println(); for (int p = 0; p < fullwidth; p++) System.out.print("-"); System.out.print("| "); System.out.println(); for (int i = 0, entry = sizes[0], e = 0; i < rows; i++) { if (i == entry) { System.out.print("|"); for (int p = 0; p < fullwidth; p++) System.out.print("-"); System.out.print("|"); System.out.println(); entry += sizes[++e]; } System.out.print("| "); for (int j = 0; j < _size; j++) { int pad = 1 + maxwidth; if (toprint[i][j] != null) { System.out.print(toprint[i][j]); pad -= toprint[i][j].length(); } for (int p = 0; p < pad; p++) System.out.print(" "); System.out.print("| "); } System.out.println(); } System.out.print("|"); for (int p = 0; p < fullwidth; p++) System.out.print("-"); System.out.print("| "); System.out.println(); } } ================================================ FILE: src/opennlp/ccg/parse/DerivationHistory.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse; import opennlp.ccg.synsem.*; import opennlp.ccg.grammar.*; import java.io.Serializable; import java.util.*; /** * Record the steps taken in a derivation. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.12 $, $Date: 2009/12/16 22:39:32 $ */ public class DerivationHistory implements Serializable, Comparable { private static final long serialVersionUID = 2867339743258182859L; private Sign[] _inputs; private Sign _output; private Rule _rule; private boolean _noHistory = false; private transient int _complexity = -1; /** Constructor for a sign with no prior history. 
*/ public DerivationHistory(Sign output) { _noHistory = true; _output = output; } /** Constructor for a sign created by rule. */ public DerivationHistory(Sign[] inputs, Sign output, Rule rule) { _inputs = new Sign[inputs.length]; for (int i=0; i < inputs.length; i++) { _inputs[i] = inputs[i]; } _output = output; _rule = rule; } /** Returns true iff the history is empty. */ public boolean isEmpty() { return _noHistory; } /** Returns the inputs (or null if none). */ public Sign[] getInputs() { return _inputs; } /** Returns the output. */ public Sign getOutput() { return _output; } /** Returns the rule. */ public Rule getRule() { return _rule; } /** Returns the derivation history in vertical list form. */ public String toString() { return toString(maxRuleLen()); } // returns the derivation history given the max rule len, for alignment private String toString(int maxRuleLen) { StringBuffer sb = new StringBuffer(); // lex item if (_noHistory) { sb.append("(lex) "); for (int i = 5; i < maxRuleLen; i++) { sb.append(' '); } sb.append(_output.toString()).append('\n'); return sb.toString(); } // inputs for (int i=0; i < _inputs.length; i++) { sb.append(_inputs[i].getDerivationHistory().toString(maxRuleLen)); } // type-changing rule (possibly) String ruleName = _rule.name(); TypeChangingRule tcr = Grammar.theGrammar.rules.getTypeChangingRule(ruleName); if (tcr != null) { sb.append("(gram) "); for (int i = 6; i < maxRuleLen; i++) { sb.append(' '); } sb.append(tcr.toString()).append('\n'); } // this rule and result sb.append('(').append(ruleName).append(") "); for (int i = (ruleName.length() + 2); i < maxRuleLen; i++) { sb.append(' '); } sb.append(_output.toString()).append('\n'); // done return sb.toString(); } // returns the max length of rule names (including parens) private int maxRuleLen() { if (_noHistory) { return 6; } int max = 0; for (int i=0; i < _inputs.length; i++) { max = Math.max(max, _inputs[i].getDerivationHistory().maxRuleLen()); } max = Math.max(max, _rule.name().length() + 2); return max; } /** Returns the complexity of the derivation, as the sum of the number of steps, plus the number of composition or substitution steps, plus the number of crossing steps. */ public int complexity() { if (_complexity > 0) return _complexity; if (_noHistory) return 0; int retval = 1; String ruleName = _rule.name(); if (ruleName.length() > 1 && (ruleName.charAt(0) == '>' || ruleName.charAt(0) == '<')) { if (ruleName.charAt(1) == 'B' || ruleName.charAt(1) == 'S') { retval++; if (ruleName.length() == 3 && ruleName.charAt(2) == 'x') retval++; } } for (int i=0; i < _inputs.length; i++) { retval += _inputs[i].getDerivationHistory().complexity(); } _complexity = retval; return retval; } /** Returns whether the derivation contains a unary rule cycle. */ public boolean containsCycle() { if (_noHistory || _inputs.length != 1) return false; List rulesSeen = new ArrayList(4); rulesSeen.add(_rule); return _inputs[0].getDerivationHistory().containsCycle(rulesSeen); } // recursive cycle check private boolean containsCycle(List rulesSeen) { if (_noHistory || _inputs.length != 1) return false; if (rulesSeen.contains(_rule)) return true; rulesSeen.add(_rule); return _inputs[0].getDerivationHistory().containsCycle(rulesSeen); } /** Recursively compares derivation histories by their complexity. 
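 * Lower complexity sorts first. Under the weighting used by complexity(),
 * for example, a lexical leaf contributes 0, a plain application step such as
 * {@code >} or {@code <} contributes 1, a harmonic composition or substitution
 * step such as {@code >B} or {@code <S} contributes 2, and a crossed variant
 * such as {@code >Bx} contributes 3.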
*/ public int compareTo(DerivationHistory dh) { int c1 = complexity(); int c2 = dh.complexity(); if (c1 < c2) return -1; if (c1 > c2) return 1; if (_noHistory) return 0; if (_inputs.length < dh._inputs.length) return -1; if (_inputs.length > dh._inputs.length) return 1; for (int i=0; i < _inputs.length; i++) { int cmp = _inputs[i].getDerivationHistory().compareTo(dh._inputs[i].getDerivationHistory()); if (cmp != 0) return cmp; } return 0; } } ================================================ FILE: src/opennlp/ccg/parse/Edge.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2007 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse; import opennlp.ccg.synsem.*; import java.util.*; import java.io.Serializable; import java.text.*; /** *

    * An edge is a wrapper for a sign, ie a sign together * with a score, and optionally a list of alternative edges. * A representative edge is an edge that represents (stands in for) * other edges with the same category (but different LFs) during the * chart construction process, stored in the list of alternative edges; * it is considered disjunctive when there is more than one alternative. * Note that initially a representative edge will be in its list * of alternatives, but it can be removed during pruning. *
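     * <p>
     * For instance (an illustrative sketch only):
     * <pre>{@code
     * Edge rep = new Edge(sign);   // wrap a sign; the score defaults to 0.0
     * rep.initAltEdges();          // rep becomes a representative containing itself
     * rep.isRepresentative();      // true
     * rep.isDisjunctive();         // false until a second alternative is added
     * }</pre>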

    * * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/12/22 22:19:00 $ */ public class Edge implements Serializable { private static final long serialVersionUID = 1L; /** Class for storing back-refs from signs. */ public static class EdgeRef implements Serializable { private static final long serialVersionUID = 1L; /** The edge. */ public final Edge edge; /** Constructor. */ public EdgeRef(Edge edge) { this.edge = edge; } } /** Returns the edge associated with this sign, or null if none. */ public static Edge getEdge(Sign sign) { EdgeRef eref = (EdgeRef) sign.getData(EdgeRef.class); return (eref != null) ? eref.edge : null; } /** The sign. */ protected Sign sign; /** The edge score. */ protected double score; /** Word position, for lexical edges (otherwise -1). */ protected int wordPos = -1; /** The alternative edges (none initially). */ protected List altEdges = null; /** Saved list of alternative edges, for restoring chart after unpacking. */ protected transient List savedAltEdges = null; /** Constructor (score defaults to 0.0). */ public Edge(Sign sign) { this(sign, 0.0); } /** Constructor with score. */ public Edge(Sign sign, double score) { this.sign = sign; this.score = score; sign.addData(new EdgeRef(this)); } /** Returns the sign. */ public Sign getSign() { return sign; } /** Returns the score. */ public double getScore() { return score; } /** Sets the score. */ public void setScore(double score) { this.score = score; } /** Returns the word position of a lexical edge (otherwise -1). */ public int getWordPos() { return wordPos; } /** Sets the word position of a lexical edge. */ public void setWordPos(int pos) { wordPos = pos; } /** Returns whether this edge is a representative. */ public boolean isRepresentative() { return altEdges != null; } /** Returns whether this edge is disjunctive. */ public boolean isDisjunctive() { return altEdges != null && altEdges.size() > 1; } /** Returns the list of alt edges, or the empty list if none. */ public List getAltEdges() { if (altEdges == null) return Collections.emptyList(); else return altEdges; } /** Initializes the alt edges list with a default capacity, adding this edge. */ public void initAltEdges() { initAltEdges(3); } /** Initializes the alt edges list with the given capacity, adding this edge. */ public void initAltEdges(int capacity) { // check uninitialized if (altEdges != null) throw new RuntimeException("Alt edges already initialized!"); altEdges = new ArrayList(capacity); altEdges.add(this); } /** Replaces the alt edges, saving the current ones for later restoration. */ public void replaceAltEdges(List newAlts) { savedAltEdges = altEdges; altEdges = newAlts; } /** Recursively restores saved alt edges, if any. */ public void restoreAltEdges() { if (savedAltEdges != null) { // restore altEdges = savedAltEdges; savedAltEdges = null; // recurse for (Edge alt : altEdges) { Sign[] inputs = alt.sign.getDerivationHistory().getInputs(); if (inputs != null) { for (Sign s : inputs) getEdge(s).restoreAltEdges(); } } } } /** Returns a hash code for this edge, based on its sign. (Alternatives and the score are not considered.) */ public int hashCode() { return sign.hashCode() * 23; } /** Returns a hash code for this edge based on the sign's surface words. (Alternatives and the score are not considered.) */ public int surfaceWordHashCode() { return sign.surfaceWordHashCode() * 23; } /** Returns whether this edge equals the given object. (Alternatives and the score are not considered.) 
*/ public boolean equals(Object obj) { if (obj == this) return true; if (!(obj instanceof Edge)) return false; Edge edge = (Edge) obj; return sign.equals(edge.sign); } /** Returns whether this edge equals the given object based on the sign's surface words. (Alternatives and the score are not considered.) */ public boolean surfaceWordEquals(Object obj) { if (obj == this) return true; if (!(obj instanceof Edge)) return false; Edge edge = (Edge) obj; return sign.surfaceWordEquals(edge.sign); } /** * Returns a string for the edge in the format * [score] orthography :- category. */ public String toString() { StringBuffer sbuf = new StringBuffer(); if (score >= 0.001 || score == 0.0) sbuf.append("[" + nf3.format(score) + "] "); else sbuf.append("[" + nfE.format(score) + "] "); sbuf.append(sign.toString()); return sbuf.toString(); } // formats to three decimal places private static final NumberFormat nf3 = initNF3(); private static NumberFormat initNF3() { NumberFormat f = NumberFormat.getInstance(); f.setMinimumIntegerDigits(1); f.setMinimumFractionDigits(3); f.setMaximumFractionDigits(3); return f; } // formats to "0.##E0" private static final NumberFormat nfE = new DecimalFormat("0.##E0"); } ================================================ FILE: src/opennlp/ccg/parse/EdgeHash.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-7 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse; import gnu.trove.*; import java.util.*; /** * A set of edges, unique up to surface words. * Edges whose signs have lower derivational complexity are kept during insertion. * NB: This is just like EdgeHash in the realize package, except that * it deals with parse edges. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2007/12/20 05:51:10 $ */ public class EdgeHash extends THashSet { private static final long serialVersionUID = 1L; /** Hashing strategy that uses Edge's surfaceWordHashCode and surfaceWordEquals methods. */ protected static TObjectHashingStrategy surfaceWordHashingStrategy = new TObjectHashingStrategy() { private static final long serialVersionUID = 1L; public int computeHashCode(java.lang.Object o) { return ((Edge)o).surfaceWordHashCode(); } public boolean equals(java.lang.Object o1, java.lang.Object o2) { return ((Edge)o1).surfaceWordEquals((Edge)o2); } }; /** Default constructor. */ public EdgeHash() { super(surfaceWordHashingStrategy); } /** * Returns this as a set of edges. 
*/ @SuppressWarnings("unchecked") public Set asEdgeSet() { return (Set) this; } /** * Adds an edge, keeping the one whose sign has lower derivational complexity * if there is an equivalent one there already; returns the old * edge if it was displaced, the new edge if there was no equivalent * old edge, or null if the edge was not actually added. * iff the edge is actually inserted. */ public Edge insert(Edge edge) { int pos = index(edge); if (pos >= 0) { Edge oldEdge = (Edge) _set[pos]; if (oldEdge == edge) return null; int complexity = edge.sign.getDerivationHistory().complexity(); int oldComplexity = oldEdge.sign.getDerivationHistory().complexity(); if (complexity < oldComplexity) { _set[pos] = edge; return oldEdge; } else return null; } else { add(edge); return edge; } } } ================================================ FILE: src/opennlp/ccg/parse/ParseException.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-9 Jason Baldridge, Gann Bierner and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse; /** * Any exception having to do with reading the lexicon or rules, etc. * * @author Gann Bierner * @version $Revision: 1.3 $, $Date: 2009/12/20 18:54:41 $ */ public class ParseException extends Exception { private static final long serialVersionUID = 1L; /** Time limit exceeded message. */ public static final String TIME_LIMIT_EXCEEDED = "Time limit exceeded"; /** Edge limit exceeded message. */ public static final String EDGE_LIMIT_EXCEEDED = "Edge limit exceeded"; /** * Class constructor * * @param s * the error message */ public ParseException(String s) { super(s); } public String toString() { return getMessage(); } } ================================================ FILE: src/opennlp/ccg/parse/Parser.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-9 Jason Baldridge, Gann Bierner and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse; import opennlp.ccg.TextCCG; import opennlp.ccg.lexicon.*; import opennlp.ccg.synsem.*; import opennlp.ccg.grammar.*; import opennlp.ccg.hylo.EPsScorer; import opennlp.ccg.hylo.HyloHelper; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.unify.*; import opennlp.ccg.util.Pair; import java.util.*; import java.util.prefs.Preferences; /** * The parser is a CKY chart parser for CCG, optionally * with iterative beta-best supertagging and n-best output. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.38 $, $Date: 2011/08/27 19:27:00 $ */ public class Parser { /** Preference key for time limit on parsing. */ public static final String PARSE_TIME_LIMIT = "Parse Time Limit"; /** A constant indicating no time limit on parsing. */ public static final int NO_TIME_LIMIT = 0; /** Preference key for edge limit on parsing. */ public static final String PARSE_EDGE_LIMIT = "Parse Edge Limit"; /** A constant indicating no edge limit on parsing. */ public static final int NO_EDGE_LIMIT = 0; /** Preference key for pruning the number of signs kept per equivalence class. */ public static final String PARSE_PRUNING_VALUE = "Parse Pruning Value"; /** Preference key for pruning the number of edges kept per cell. */ public static final String PARSE_CELL_PRUNING_VALUE = "Parse Cell Pruning Value"; /** A constant indicating no pruning of signs per equivalence class. */ public static final int NO_PRUNING = 0; /** Preference key for whether to use lazy unpacking. */ public static final String PARSE_LAZY_UNPACKING = "Parse Lazy Unpacking"; /** The grammar. */ public final Grammar grammar; /** The lexicon used to create edges. */ public final Lexicon lexicon; /** The rules used to create edges. */ public final RuleGroup rules; /** Flag for whether to show the chart for failed parses. */ public boolean debugParse = false; /** The sign scorer (or null if none). */ protected SignScorer signScorer = null; /** The "n" for n-best pruning. (Default is none.) */ protected int pruneVal = -1; /** The cell pruning value. (Default is none.) */ protected int cellPruneVal = -1; /** The lazy unpacking flag. (Default is none.) */ protected Boolean lazyUnpacking = null; /** Supertagger to use. (Default is none.) */ protected Supertagger supertagger = null; /** Flag for whether to use the supertagger in the most-to-least restrictive direction. */ protected boolean stMostToLeastDir = true; /** Time limit in milliseconds. (Default is none.) */ protected int timeLimit = -1; /** Edge limit. (Default is none.) 
*/ protected int edgeLimit = -1; // start time for chart construction private long startTime = 0; // lex lookup time private int lexTime = 0; // parse time private int parseTime = 0; // chart construction time private int chartTime = 0; // unpacking time private int unpackingTime = 0; // time limit to use private int timeLimitToUse = NO_TIME_LIMIT; // edge limit to use private int edgeLimitToUse = NO_EDGE_LIMIT; // pruning value to use private int pruneValToUse = NO_PRUNING; // pruning value to use private int cellPruneValToUse = NO_PRUNING; // lazy unpacking flag to use private boolean lazyUnpackingToUse = true; // current chart private Chart chart = null; // parse results private ArrayList result; // parse scores private ArrayList scores; // flag for whether to glue fragments currently private boolean gluingFragments = false; /** Constructor. */ public Parser(Grammar grammar) { this.grammar = grammar; this.lexicon = grammar.lexicon; this.rules = grammar.rules; } /** Sets the sign scorer. */ public void setSignScorer(SignScorer signScorer) { this.signScorer = signScorer; } /** Sets the time limit. */ public void setTimeLimit(int timeLimit) { this.timeLimit = timeLimit; } /** Sets the edge limit. */ public void setEdgeLimit(int edgeLimit) { this.edgeLimit = edgeLimit; } /** Sets the n-best pruning val. */ public void setPruneVal(int n) { pruneVal = n; } /** Sets the cell pruning val. */ public void setCellPruneVal(int n) { cellPruneVal = n; } /** Sets the lazy unpacking flag. */ public void setLazyUnpacking(Boolean b) { this.lazyUnpacking = b; } /** Sets the supertagger. */ public void setSupertagger(Supertagger supertagger) { this.supertagger = supertagger; } /** Sets the supertagger most-to-least restrictive direction flag. */ public void setSupertaggerMostToLeastRestrictiveDirection(boolean bool) { stMostToLeastDir = bool; } /** * Parses a string. * * @param s the string * @exception ParseException thrown if a parse can't be found for the * entire string */ public void parse(String s) throws ParseException { // tokenize List words = lexicon.tokenizer.tokenize(s); // parse words parse(words); } /** * Parses a list of words. 
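 * Results can subsequently be retrieved via getResult and getScores.
 * A minimal usage sketch (illustrative only, assuming a loaded grammar):
 * <pre>{@code
 * Parser parser = new Parser(grammar);
 * parser.setPruneVal(10);                  // keep a 10-best list
 * parser.parse("he saw the man with the telescope");
 * List<Sign> parses = parser.getResult();
 * List<Double> scores = parser.getScores();
 * }</pre>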
*/ public void parse(List words) throws ParseException { // set up timing: use limit from prefs unless explicitly set Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); if (timeLimit >= 0) timeLimitToUse = timeLimit; else timeLimitToUse = prefs.getInt(PARSE_TIME_LIMIT, NO_TIME_LIMIT); if (edgeLimit >= 0) edgeLimitToUse = edgeLimit; else edgeLimitToUse = prefs.getInt(PARSE_EDGE_LIMIT, NO_EDGE_LIMIT); if (pruneVal >= 0) pruneValToUse = pruneVal; else pruneValToUse = prefs.getInt(PARSE_PRUNING_VALUE, NO_PRUNING); if (cellPruneVal >= 0) cellPruneValToUse = cellPruneVal; else cellPruneValToUse = prefs.getInt(PARSE_CELL_PRUNING_VALUE, NO_PRUNING); if (lazyUnpacking != null) lazyUnpackingToUse = lazyUnpacking; else lazyUnpackingToUse = prefs.getBoolean(PARSE_LAZY_UNPACKING, true); // supertagger case: iterative beta-best if (supertagger != null) { parseWithSupertagger(words); return; } // otherwise just once try { // init long lexStartTime = System.currentTimeMillis(); UnifyControl.startUnifySequence(); // get entries for each word List entries = new ArrayList(words.size()); for (Word w : words) { entries.add(lexicon.getSignsFromWord(w)); } lexTime = (int) (System.currentTimeMillis() - lexStartTime); // do parsing parseEntries(entries); } catch (LexException e) { setGiveUpTime(); String msg = "Unable to retrieve lexical entries:\n\t" + e.toString(); if (debugParse) System.out.println(msg); throw new ParseException(msg); } catch (ParseException e) { setGiveUpTime(); // show chart for failed parse if apropos if (debugParse) { System.out.println(e); System.out.println("Chart for failed parse:"); chart.printChart(); } // rethrow throw e; } } // iterative beta-best parsing private void parseWithSupertagger(List words) throws ParseException { // set supertagger in lexicon grammar.lexicon.setSupertagger(supertagger); // ensure gluing off gluingFragments = false; // reset beta if (stMostToLeastDir) supertagger.resetBeta(); else supertagger.resetBetaToMax(); // loop boolean done = false; while (!done) { try { // init long lexStartTime = System.currentTimeMillis(); UnifyControl.startUnifySequence(); // get filtered entries for each word List entries = new ArrayList(words.size()); supertagger.mapWords(words); for (int i=0; i < words.size(); i++) { supertagger.setWord(i); Word word = words.get(i); entries.add(lexicon.getSignsFromWord(word)); } lexTime = (int) (System.currentTimeMillis() - lexStartTime); // do parsing parseEntries(entries); // done done = true; // reset supertagger in lexicon, turn gluing off grammar.lexicon.setSupertagger(null); gluingFragments = false; } catch (LexException e) { // continue if more betas if (stMostToLeastDir && supertagger.hasMoreBetas()) { supertagger.nextBeta(); } // otherwise give up else { setGiveUpTime(); // reset supertagger in lexicon, turn gluing off grammar.lexicon.setSupertagger(null); gluingFragments = false; // throw parse exception String msg = "Unable to retrieve lexical entries:\n\t" + e.toString(); if (debugParse) System.out.println(msg); throw new ParseException(msg); } } catch (ParseException e) { // check if limits exceeded boolean outwith = e.getMessage() == ParseException.EDGE_LIMIT_EXCEEDED || e.getMessage() == ParseException.TIME_LIMIT_EXCEEDED; // continue if more betas and limits not exceeded if (stMostToLeastDir && supertagger.hasMoreBetas() && !outwith) supertagger.nextBeta(); // or if limits exceeded and moving in the opposite direction else if (!stMostToLeastDir && supertagger.hasLessBetas() && outwith) 
supertagger.previousBeta(); // otherwise try glue rule, unless already on else if (!gluingFragments) { supertagger.resetBeta(); // may as well use most restrictive supertagger setting with glue rule gluingFragments = true; } // otherwise give up else { setGiveUpTime(); // show chart for failed parse if apropos if (debugParse) { System.out.println(e); System.out.println("Chart for failed parse:"); chart.printChart(); } // reset supertagger in lexicon, turn gluing off grammar.lexicon.setSupertagger(null); gluingFragments = false; // rethrow throw e; } } } }
/** * Returns the results of the parse. */ public List getResult() { return result; }
/** * Returns the corresponding scores for the results of the parse. */ public List getScores() { return scores; }
/** Returns the edge count prior to unpacking. */ public int edgeCount() { return (chart != null) ? chart.edgeCount() : 0; }
/** Returns the edge count while unpacking. */ public int unpackingEdgeCount() { return (chart != null) ? chart.unpackingEdgeCount() : 0; }
/** Returns the max cell size prior to unpacking. */ public int maxCellSize() { return (chart != null) ? chart.maxCellSize() : 0; }
/** Returns the lexical lookup time for the latest parse. */ public int getLexTime() { return lexTime; }
/** Returns the overall parse time (but excluding lex lookup) for the latest parse. */ public int getParseTime() { return parseTime; }
/** Returns the time spent constructing the chart. */ public int getChartTime() { return chartTime; }
/** Returns the time spent unpacking. */ public int getUnpackingTime() { return unpackingTime; }
/** Returns the supertagger's final beta value (or 0 if none). */ public double getSupertaggerBeta() { return (supertagger != null) ? supertagger.getCurrentBetaValue() : 0; }
// parses from lex entries
private void parseEntries(List entries) throws ParseException { startTime = System.currentTimeMillis(); // set up chart initializeChart(entries); if (signScorer != null) chart.setSignScorer(signScorer); chart.setPruneVal(pruneValToUse); chart.setTimeLimit(timeLimitToUse); chart.setStartTime(startTime); chart.setEdgeLimit(edgeLimitToUse); chart.setCellLimit(cellPruneValToUse); // do parsing parse(entries.size()); }
// initialize the chart
private void initializeChart(List entries) { chart = new Chart(entries.size(), rules); for (int i=0; i < entries.size(); i++) { SignHash wh = entries.get(i); for (Sign sign : wh.getSignsSorted()) { Category cat = sign.getCategory(); UnifyControl.reindex(cat); chart.insert(i, i, sign); } } }
// actual CKY parsing
private void parse(int size) throws ParseException {
    // fill in chart: lexical cells first, then combine spans bottom-up
    for (int i=0; i < size; i++) {
        chart.insertCell(i, i);
    }
    for (int j=1; j < size; j++) {
        for (int i=j-1; i >= 0; i--) {
            for (int k=i; k < j; k++) {
                chart.insertCell(i, k, k+1, j, i, j);
            }
            chart.insertCell(i, j);
        }
    }
    // glue fragments if apropos
    if (gluingFragments && chart.cellIsEmpty(0, size-1)) {
        for (int j=1; j < size; j++) {
            for (int i=j-1; i >= 0; i--) {
                for (int k=i; k < j; k++) {
                    chart.insertCellFrag(i, k, k+1, j, i, j);
                }
            }
        }
    }
    chartTime = (int) (System.currentTimeMillis() - startTime);
    // unpack results from the top cell
    createResult(size);
    parseTime = (int) (System.currentTimeMillis() - startTime);
    unpackingTime = parseTime - chartTime;
}
// creates the result and score lists by unpacking the top cell
private void createResult(int size) throws ParseException {
    result = new ArrayList<Sign>();
    scores = new ArrayList<Double>();
    // unpack top
    List unpacked = (lazyUnpackingToUse) ? chart.lazyUnpack(0,size - 1) : chart.unpack(0, size - 1);
    // add signs for unpacked edges
    for (Edge edge : unpacked) { result.add(edge.sign); scores.add(edge.score); }
    // check non-empty
    if (result.size() == 0) { throw new ParseException("Unable to parse"); }
}
// set parse time when giving up
private void setGiveUpTime() { chartTime = (int) (System.currentTimeMillis() - startTime); parseTime = chartTime; unpackingTime = 0; }
/** * Adds the supertagger log probs to the lexical signs of the gold standard parse.
*/ public void addSupertaggerLogProbs(Sign gold) { List words = gold.getWords(); supertagger.mapWords(words); addSupertaggerLogProbs(gold, gold); for (int i=0; i < words.size(); i++) { supertagger.setWord(i); } } // recurses through derivation, adding lex log probs to lexical signs private void addSupertaggerLogProbs(Sign gold, Sign current) { // lookup and add log prob for lex sign if (current.isLexical()) { supertagger.setWord(gold.wordIndex(current)); Map stags = supertagger.getSupertags(); Double lexprob = stags.get(current.getSupertag()); if (lexprob != null) { current.addData(new SupertaggerAdapter.LexLogProb((float) Math.log10(lexprob))); } } // otherwise recurse else { Sign[] inputs = current.getDerivationHistory().getInputs(); for (Sign s : inputs) addSupertaggerLogProbs(gold, s); } } /** * Returns the oracle best sign among those in the n-best list for the given LF, * using the f-score on all EPs, together with a flag indicating whether the gold LF * was found (as indicated by an f-score of 1.0). * NB: It would be better to return the forest oracle, but the nominal conversion would * be tricky to do correctly. */ public Pair oracleBest(LF goldLF) { Sign retval = null; double bestF = 0.0; for (Sign sign : result) { Category cat = sign.getCategory().copy(); Nominal index = cat.getIndexNominal(); LF parsedLF = cat.getLF(); if (parsedLF != null) { index = HyloHelper.convertNominals(parsedLF, sign, index); EPsScorer.Results score = EPsScorer.score(parsedLF, goldLF); if (score.fscore > bestF) { retval = sign; bestF = score.fscore; } } } return new Pair(retval, (bestF == 1.0)); } } ================================================ FILE: src/opennlp/ccg/parse/Supertagger.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse; import opennlp.ccg.lexicon.*; import java.util.*; /** * A parsing supertagger must extend the SupertaggerAdapter interface for plugging a * supertagger into the lexicon. It must additionally support methods for * calculating and caching contextual supertagging assignments, so that * supertags can be retrieved just based on the current word index. * * @author Michael White * @version $Revision: 1.3 $, $Date: 2010/12/08 15:24:26 $ */ public interface Supertagger extends SupertaggerAdapter { /** * Maps the given words to their predicted categories, * so that the beta-best categories can be returned by calls to setWord * and getSupertags. */ public void mapWords(List words); /** * Sets the current word to the one with the given index, * so that the beta-best categories for it can be returned by a call to * getSupertags. 
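 * A typical calling sequence looks roughly as follows (schematic only; how
 * the returned supertags are used to filter lexical entries is up to the
 * caller):
 * <pre>{@code
 * supertagger.mapWords(words);
 * for (int i = 0; i < words.size(); i++) {
 *     supertagger.setWord(i);
 *     Map<String,Double> stags = supertagger.getSupertags();
 *     // keep only lexical categories whose supertags appear in stags
 * }
 * }</pre>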
*/ public void setWord(int index); } ================================================ FILE: src/opennlp/ccg/parse/postagger/BasicPOSTagger.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.postagger; import opennlp.ccg.parse.postagger.ml.POSPriorModel; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.tagger.TaggedWord; import opennlp.ccg.parse.tagger.ml.MaxentModel; import opennlp.ccg.parse.supertagger.ml.FeatureExtractor; import opennlp.ccg.util.Pair; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.parse.tagger.util.ResultSink; import opennlp.ccg.parse.tagger.sequencescoring.SequenceScorer; import opennlp.ccg.parse.tagger.Constants; /** * A non-dummy POS tagger. * * @author Dennis N. Mehay */ public class BasicPOSTagger extends POSTagger { private FeatureExtractor posFex = null; private MaxentModel tagMod = null; private static final Comparator> comp = new Comparator>() { public int compare(Pair pr0, Pair pr1) { // sorts descending by prob (the double member of the pair). if(pr0.a == pr1.a) { return 0; } else if (pr0.a < pr1.a) { return 1; } else { return -1; } } }; public BasicPOSTagger(MaxentModel tagMod, FeatureExtractor posFex, String tagSequenceModel) { this.posFex = posFex; this.tagMod = tagMod; int ord = SequenceScorer.findOrder(tagSequenceModel); try { posSeqMod = new SequenceScorer(ord, tagSequenceModel); // set the search algorithm. posSeqMod.setAlgorithm(Constants.TaggingAlgorithm.FORWARDBACKWARD); // set the search beam width posSeqMod.setSearchBeam(5); } catch (IOException ex) { Logger.getLogger(BasicPOSTagger.class.getName()).log(Level.SEVERE, null, ex); } } public List tagSentence(List sentence) { List result = new ArrayList(sentence.size()); // the prob-string taggings (to be filtered, etc. before adding them to the taggings of the TaggedWord list). 
List>> taggings = new ArrayList>>(sentence.size()); Map sentMap = new HashMap(sentence.size()); int ind = 0; for(Word w : sentence) { sentMap.put(ind++, new TaggedWord(w)); } List>> ftss = posFex.getSentenceFeatures(sentMap); double[] distro = null; int wordIndex = 0; for(Collection> fts : ftss) { distro = tagMod.eval(fts); List> distroList = new ArrayList>(distro.length); ind = 0; for(double prob : distro) { distroList.add(new Pair(prob, ind++)); } Collections.sort(distroList, comp); // widen beta a little bit (we're going to do some fwd-bwd rescoring inp a minute, but we don't // want to do the fwd-bwd alg over ALL possible tags -- too inefficient). List> tagging = new ArrayList>(distro.length); double best = distroList.get(0).a; double widenedBeta = beta/8; String goldPOS = sentence.get(wordIndex).getPOS(); for(Pair outcome : distroList) { if( (outcome.a >= (widenedBeta * best)) || (includeGold && tagMod.getOutcome(outcome.b).equals(goldPOS)) ) { tagging.add(new Pair(outcome.a, tagMod.getOutcome(outcome.b))); } else { if(!includeGold) { // if not still potentially fishing for a gold POS tag, then break (they're in sorted order). break; } } } taggings.add(tagging); wordIndex++; } // rescore using forward-backward. taggings = posSeqMod.rescoreSequence(taggings); // add these rescored taggings to the list of TaggedWord's. int wInd = 0; for(List> tagging : taggings) { TaggedWord tmpWd = new TaggedWord(sentence.get(wInd++)); tmpWd.setPOSTagging(tagging); result.add(tmpWd); } // now filter down to the beta-best. return betaBestFilter(result); } public static void main(String[] args) throws IOException { String usage = "\nBasicPOSTagger -c (-i [defaults to ]) (-o [defaults to ])\n"+ " (-e [test tagger; assumes input is gold-standard corpus])\n"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } SRILMFactoredBundleCorpusIterator inp = null; BufferedWriter out = null; try { String inputCorp = "", output = "", configFile = null; boolean test = false; for (int i = 0; i < args.length; i++) { if (args[i].equals("-i")) { inputCorp = args[++i]; continue; } if (args[i].equals("-o")) { output = args[++i]; continue; } if (args[i].equals("-e")) { test = true; continue; } if (args[i].equals("-c")) { configFile = args[++i]; continue; } System.out.println("Unrecognized option: " + args[i]); } ResultSink rs = new ResultSink(ResultSink.ResultSinkType.POSTAG); try { inp = new SRILMFactoredBundleCorpusIterator( (inputCorp.equals("")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp)))); } catch (FileNotFoundException ex) { System.err.print("Input corpus " + inputCorp + " not found. Exiting..."); Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit(-1); } try { out = (output.equals("")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output))); } catch (IOException ex) { System.err.print("Output file " + output + " not found. 
Exiting..."); Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit(-1); } POSTagger post = POSTagger.posTaggerFactory(configFile); for (List inLine : inp) { List taggedSent = post.tagSentence(inLine); List>> sentTagging = new ArrayList>>(taggedSent.size()); for(TaggedWord tw : taggedSent) { sentTagging.add(tw.getPOSTagging()); } if(test) { rs.addSent(sentTagging, inLine); } out.write("" + System.getProperty("line.separator")); for(TaggedWord tw : taggedSent) { out.write(tw.getForm()); for(Pair tg : tw.getPOSTagging()) { out.write("\t" + tg.b + "\t" + tg.a); } out.write(System.getProperty("line.separator")); } out.write("" + System.getProperty("line.separator")); } out.flush(); if(test) { System.err.println(rs.report()); } } catch(Throwable t) { t.printStackTrace(); } finally { try { inp.close(); out.close(); } catch (IOException ex) { Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); } } } } ================================================ FILE: src/opennlp/ccg/parse/postagger/DummyPOSTagger.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.postagger; import java.util.ArrayList; import java.util.List; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.tagger.TaggedWord; import opennlp.ccg.util.Pair; /** * A "DummyPOSTagger" simply grabs the single POS tag in the Words themselves * and puts them into a TaggedWord with a singleton list of probability 1.0 POS * tags. * * @author Dennis N. Mehay */ public class DummyPOSTagger extends POSTagger { public List tagSentence(List sentence) { List result = new ArrayList(sentence.size()); for(Word w : sentence) { List> tmpTagging = new ArrayList>(1); tmpTagging.add(new Pair(1.0,w.getPOS())); TaggedWord tmp = new TaggedWord(w); tmp.setPOSTagging(tmpTagging); result.add(tmp); } return result; } } ================================================ FILE: src/opennlp/ccg/parse/postagger/POSTagSequenceGetter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.postagger; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.List; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; /** * @author Dennis N. Mehay */ public class POSTagSequenceGetter { public static void main(String[] args) throws FileNotFoundException, IOException { String usage = "\nPOSTagSequenceGetter -i -o \n"; String input = null, output = null; if(args == null || args.length == 0 || args[0].equals("-h")) { System.err.println(usage); System.exit(0); } for(int i = 0; i < args.length; i++) { if(args[i].equals("-i")) { input = args[++i]; continue; } if(args[i].equals("-o")) { output = args[++i]; continue; } System.err.println("unknown command-line option: " + args[i]); } BufferedReader in = new BufferedReader(new FileReader(new File(input))); SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator(in); BufferedWriter out = new BufferedWriter(new FileWriter(new File(output))); for(List sent : corp) { out.write(" "); for(Word w : sent) { out.write(w.getPOS()+" "); } out.write(""+System.getProperty("line.separator")); } out.close(); } } ================================================ FILE: src/opennlp/ccg/parse/postagger/POSTagger.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.postagger; import opennlp.ccg.parse.tagger.Constants.TaggingAlgorithm; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.parse.tagger.TaggedWord; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.postagger.ml.POSPriorModel; import opennlp.ccg.parse.postagger.ml.POSTagFex; import opennlp.ccg.parse.tagger.ml.MaxentModel; import opennlp.ccg.parse.tagger.ml.ZLMEM; import opennlp.ccg.parse.tagger.sequencescoring.SequenceScorer; import opennlp.ccg.util.Pair; import opennlp.ccg.parse.tagger.util.ConfigFileProcessor; /** * Interface for POS taggers. 
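* Concrete implementations in this package are {@code BasicPOSTagger} and {@code DummyPOSTagger};
* instances are normally obtained via the {@code posTaggerFactory(String)} method below.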
* * @author Dennis N. Mehay */ public abstract class POSTagger { protected SequenceScorer posSeqMod = null; public TaggingAlgorithm alg = TaggingAlgorithm.FORWARDBACKWARD; public double beta = 1.0; protected boolean includeGold = false; /** In goes a list of {@code Word}s; out comes a list of {@code TaggedWord}s*/ public abstract List tagSentence(List sentence); /** Set the tagging algorithm. */ public void setTaggingAlgorithm(TaggingAlgorithm newAlg) { alg = newAlg; posSeqMod.setAlgorithm(alg); } /** Set the beam width (by default, it's 1.0 -- i.e., single-best). */ public void setBeta(double beta) { this.beta = beta; } /** Say whether or not we will include gold tags (e.g., for training). */ public void setIncludeGold(boolean includeGoldOrNot) { includeGold = includeGoldOrNot; } /** * Filter the POS tags by the beta filter and return the (potentially) trimmed-down results. * It is assumed that the tags of the tagged word are sorted in descending order of * probability. */ public List betaBestFilter(List sentence) { List res = new ArrayList(sentence.size()); for (TaggedWord tw : sentence) { Word w = tw.getWord(); double best = tw.getPOSTagging().get(0).a; int endIndex = 0; for (Pair tagging : tw.getPOSTagging()) { if (tagging.a >= (beta * best)) { endIndex++; } else { break; } } List> tmpTagging = new ArrayList>(tw.getPOSTagging().size()); for (Pair tg : tw.getPOSTagging()) { tmpTagging.add(tg); } tmpTagging.subList(endIndex, tmpTagging.size()).clear(); TaggedWord twTmp = new TaggedWord(w); twTmp.setPOSTagging(tmpTagging); res.add(twTmp); } return res; } /** * Build a POS tagger from a config file. * A non-dummy config file might contain: * ... * # this is a comment * taggerType=basic * priorModel=/home/.../posprior/prior.flm * # note that CaSE dOES Not matter for the key (but does for the value, e.g. a file name). * PRIORmodelvocab=/home/.../posprior/vocab.voc * # you can also repeat opions. the last one will take effect. * priormodelvocab=/home/.../posprior/vocab2.voc * * # did you see that empty line get ignored? * # notice that spaces around the '=' get ignored, as well. * maxentModel = /home/.../maxentmodels/myposmod.mod * # this last must be an ARPA-formatted n-gram model over POS tags (7-grams work well). * sequenceModel=/home/.../pos.lm * # lastly, the tagging beam width (1.0 means "single-best" -- i.e., a unitagger). * beta=0.1 */ public static POSTagger posTaggerFactory(String configFile) { POSTagger res = null; try { String[] pathKeys = { "maxentmodel", "priormodel", "priormodelvocab", "sequencemodel"}; Map opts = ConfigFileProcessor.readInConfig(configFile, pathKeys); if (opts.get("taggertype").equalsIgnoreCase("dummy")) { return new DummyPOSTagger(); } MaxentModel mem = new ZLMEM(new File(opts.get("maxentmodel"))); POSPriorModel posPrior = null; if (opts.get("priormodel") != null) { posPrior = new POSPriorModel(opts.get("priormodel"), opts.get("priormodelvocab")); } POSTagFex fexer = new POSTagFex(posPrior); res = new BasicPOSTagger(mem, fexer, opts.get("sequencemodel")); res.setBeta(Double.parseDouble(opts.get("beta"))); TaggingAlgorithm alg = (opts.get("taggingalgorithm") == null || opts.get("taggingalgorithm").equals("forward-backward")) ? TaggingAlgorithm.FORWARDBACKWARD : TaggingAlgorithm.FORWARD; res.setTaggingAlgorithm(alg); res.setIncludeGold((opts.get("includegold") == null || opts.get("includegold").equals("false")) ? 
false : true); } catch (FileNotFoundException ex) { Logger.getLogger(POSTagger.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(POSTagger.class.getName()).log(Level.SEVERE, null, ex); } return res; } } ================================================ FILE: src/opennlp/ccg/parse/postagger/ml/POSPriorModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.postagger.ml; import opennlp.ccg.parse.supertagger.util.ProbPairComparator; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Word; import opennlp.ccg.ngrams.ConditionalProbabilityTable; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.util.Interner; import opennlp.ccg.util.Pair; /** * (c) (2009) Dennis N. Mehay * @author Dennis N. Mehay * * Model for predicting p(POS | word). Uses an ARPA-formatted * SRILM-trained "unigram" factored LM for this, where each "unigram" is * a bundle of word:pos. */ public class POSPriorModel extends ConditionalProbabilityTable { public static final String WORD = DefaultTokenizer.WORD_ATTR; public static final String POS_TAG = DefaultTokenizer.POS_ATTR; private Interner> pairs = new Interner>(); /** * Re-usable list for attr-val pairs of word-pos-supertag inputs to the prior model * (i.e., for predicting p(STag | word, POS). */ public List> attrVals = new ArrayList>(5); /** * A comparator for sorting Pair's where the Double is a probability * (effectively sorts by descending order of probability). */ private ProbPairComparator ppcomp = new ProbPairComparator(); /** All the priors. Reference them when getting beta-best, beta-worst, etc. */ List> priors = new ArrayList>(1000); /** String[] of all possible POS outcomes. */ private String[] posVocab = null; /** Construct a prior model with the FLM config file and corresponding vocab file. 
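* The vocab file is the one written by this class's {@code main} method: one escaped factor per
* line, prefixed with either the POS or the word attribute name plus a hyphen. Only the
* POS-prefixed entries are read back here to build the set of possible POS outcomes.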
*/ public POSPriorModel(String flmFile, String vocabFile) throws IOException { super(flmFile); String post = null; BufferedReader br = new BufferedReader(new FileReader(new File(vocabFile))); post = br.readLine().trim(); // get next POS tag from the vocab. while ((post != null) && !post.trim().startsWith(POS_TAG + "-")) { post = br.readLine(); } if (post != null) { post = post.trim().split("-")[1]; } Collection allSupertags = new HashSet(); // find out how many outcomes we have. int cnt = 0; while (post != null) { cnt++; allSupertags.add(post); while ((post != null) && !post.trim().startsWith(POS_TAG + "-")) { post = br.readLine(); } if (post != null) { post = post.trim().split("-")[1]; } } // initialize the arrays to this size. posVocab = new String[cnt]; cnt = 0; // fill the vocab array with all possible POS tags. for (String posTag : allSupertags) { posVocab[cnt++] = posTag.intern(); } br.close(); } /** Get the prior probability of this POS/word combo. */ public double getPriorOf(String pos, String word) { attrVals.clear(); Pair surfaceForm = pairs.intern(new Pair(WORD, word.intern())); attrVals.add(surfaceForm); Pair partOfSpeech = pairs.intern(new Pair(POS_TAG, pos.intern())); attrVals.add(partOfSpeech); return score(attrVals); } /** Get the POS-dict restricted prior distribution (sorted descending by prob.) */ public List> getPriors(Word w) { List> sortedTags = new ArrayList>(posVocab.length); for (String postag : posVocab) { sortedTags.add(new Pair(getPriorOf(postag, w.getForm()), postag)); } Collections.sort(sortedTags, ppcomp); return sortedTags; } /* added by DCE, to facilitate use in hypertagging * Identical to above method, but accepts a String (name of EP) rather than * a Word object. */ public List> getPriors(String s) { s.intern(); List> sortedTags = new ArrayList>(posVocab.length); for (String postag : posVocab) { sortedTags.add(new Pair(getPriorOf(postag, s), postag)); } Collections.sort(sortedTags, ppcomp); return sortedTags; } public static void main(String[] args) throws IOException { String usage = "\nPOSPriorModel -vocab (-c [default = ]) (-o [default = ])\n"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } SRILMFactoredBundleCorpusIterator in = null; BufferedWriter out = null; BufferedWriter voc = null; try { String inputCorp = "", output = "", vocabFile = "vocab.voc"; for (int i = 0; i < args.length; i++) { if (args[i].equals("-c")) { inputCorp = args[++i]; continue; } if (args[i].equals("-o")) { output = args[++i]; continue; } if (args[i].equals("-vocab")) { vocabFile = args[++i]; continue; } System.out.println("Unrecognized option: " + args[i]); } try { in = new SRILMFactoredBundleCorpusIterator( (inputCorp.equals("")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp)))); } catch (FileNotFoundException ex) { System.err.print("Input corpus " + inputCorp + " not found. Exiting..."); Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit( -1); } try { out = (output.equals("")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output))); } catch (IOException ex) { System.err.print("Output file " + output + " not found. 
Exiting..."); Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit( -1); } try { voc = new BufferedWriter(new FileWriter(new File(vocabFile))); } catch (IOException ex) { Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); } Map vocab = new HashMap(); for (List inLine : in) { for (Word w : inLine) { String pos = POS_TAG + "-" + DefaultTokenizer.escape(w.getPOS()), wform = WORD + "-" + DefaultTokenizer.escape(w.getForm()); vocab.put(pos, (vocab.get(pos) == null) ? 1 : vocab.get(pos) + 1); vocab.put(wform, (vocab.get(wform) == null) ? 1 : vocab.get(wform) + 1); out.write(wform + ":" + pos + " "); } out.write(System.getProperty("line.separator")); } out.flush(); for (String str : vocab.keySet()) { voc.write(str + System.getProperty("line.separator")); } voc.flush(); } finally { try { out.close(); in.close(); voc.close(); } catch (IOException ex) { Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); } } } } ================================================ FILE: src/opennlp/ccg/parse/postagger/ml/POSTagFex.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.postagger.ml; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import opennlp.ccg.parse.tagger.Constants; import opennlp.ccg.parse.supertagger.ml.FeatureExtractor; import opennlp.ccg.parse.tagger.TaggedWord; import opennlp.ccg.util.Pair; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.lexicon.Word; /** * Feature extractor for POS taggers. * * The inputs are "TaggedWord"s simply for consistency of interface. * There should be no tags assigned to the words (short, perhaps, TOBI * tags or the like). * * @author Dennis N. Mehay */ public class POSTagFex implements FeatureExtractor { private POSPriorModel posPrior = null; /** Constructor with a prior model (replaces tagging dictionary). */ public POSTagFex(POSPriorModel posPrior) { this.posPrior = posPrior; } /** Constructor without prior model. Prior features will not be used. 
*/ public POSTagFex() { this(null); } public static final String curL = "X"; public static final String prevL = "X-1"; public static final String prevPrevL = "X-2"; public static final String nextL = "X+1"; public static final String nextNextL = "X+2"; private static final String[] lxfLabs = {prevPrevL, prevL, curL, nextL, nextNextL}; private static final String prefix = "prefix", suffix = "suffix"; private static final String hyphen = "containsHyphen"; private static final String caps = "containsUC"; private static final String num = "containsNum"; private static final String neConn = "containsNEConnector"; private static final String priorF = "PPOS"; /** The string that connects elements of a fused named entity. */ private String neConnecter = "_"; /** Get a word's features for applying the tagger (i.e., not training mode). */ public Collection> getFeatures(Map sentence, Integer wordIndex) { return getFeatures(sentence, wordIndex, false); } /** Get a sentence of words' features for applying the tagger (i.e., not training mode). */ public List>> getSentenceFeatures(Map sentence) { return getSentenceFeatures(sentence, false); } /** * Get the features for a word in context. training == true iff the output class is to be collected as well. * * TODO: This and supertagger feature extractor (fex) should be merged into a more general, parameterizable * sentence-level contextual feature extractor. (VERY todo-ish, though.) */ public Collection> getFeatures(Map sentence, Integer wordIndex, boolean training) { Collection> result = new ArrayList>(30); TaggedWord current, prev, prevPrev, next, nextNext; current = sentence.get(wordIndex); // -------- The left periphery ------------ int wind = wordIndex.intValue(); if (wind > 1) { prev = sentence.get(wind - 1); prevPrev = sentence.get(wind - 2); } else if (wind > 0) { prev = sentence.get(wind - 1); prevPrev = Constants.OOB; } else { prev = prevPrev = Constants.OOB; } // -------- The right periphery ----------- int tempSize = sentence.size(); if ((tempSize - (wind + 1)) >= 2) { next = sentence.get(wind + 1); nextNext = sentence.get(wind + 2); } else if (tempSize - (wind + 1) >= 1) { next = sentence.get(wind + 1); nextNext = Constants.OOB; } else { next = nextNext = Constants.OOB; } Double activation = Constants.one; if (training) { result.add(new Pair(current.getPOS(), activation)); } // we do not use tag-sequence features in this model. // these are in a separate sequence model (n-gram model over POS sequences). // standard contextual features (word to the left, current word, word to the right, etc.). // these features are from Ratnaparkhi (1996). result.add(new Pair(curL + "=" + current.getForm(), activation)); result.add(new Pair(prevL + "=" + prev.getForm(), activation)); result.add(new Pair(prevPrevL + "=" + prevPrev.getForm(), activation)); result.add(new Pair(nextL + "=" + next.getForm(), activation)); result.add(new Pair(nextNextL + "=" + nextNext.getForm(), activation)); // features that replace the tagging dictionary. // add real-valued (activation = prior log-prob) features for each of the beta-best prior // tags, given this word. if(posPrior != null) { List> priors = posPrior.getPriors(current.getWord()); double beta = 0.1; double best = priors.get(0).a; String wform = current.getForm(); for(Pair prior : priors) { if(prior.a > (beta * best)) { // add the features PPOS=: and PPOS_word=_:. 
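// Illustrative example (scores made up): if the beta-best priors for the word "dog" are NN with
// score 0.8 and VB with score 0.2, this adds the real-valued features "PPOS=NN" and
// "PPOS_word=NN_dog" with activation 0.8, and "PPOS=VB" and "PPOS_word=VB_dog" with activation 0.2.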
result.add(new Pair(priorF + "=" + prior.b, prior.a)); result.add(new Pair(priorF + "_word" + "=" + prior.b + "_" + wform, prior.a)); } else { break; } } } // these are in addition to Ratnaparkhi's (1996) contextual features. // now for conjunctions of features: w-2w-1=..., w-1w+1=..., w+1w+2=... (same for posp). // (i.e., bigram features over words and parts of speech and bigrams of words and POSs that straddle the current token). // N.B. only use single-best POSs (maybe change later). TaggedWord[] wds = {prevPrev, prev, current, next, nextNext}; for (int j = 1; j < wds.length; j++) { result.add(new Pair(lxfLabs[j - 1] + "|" + lxfLabs[j] + "=" + wds[j - 1].getForm() + "|" + wds[j].getForm(), activation)); // also, if at the current word slot, add bigrams that straddle the current word. if (j == 2) { result.add(new Pair(lxfLabs[j - 1] + "|" + lxfLabs[j + 1] + "=" + wds[j - 1].getForm() + "|" + wds[j + 1].getForm(), activation)); } } // affix features from Ratnaparkhi (1996). // if the word's length is > 4, then extract the 1-, 2-, 3- and 4-character affixes. if(current.getForm().length() > 4) { StringBuffer prefixes = new StringBuffer(4), suffixes = new StringBuffer(4); char[] wdForm = current.getForm().toCharArray(); // prefixes. int cursor = 0; for(cursor = 0; cursor < 4; cursor++) { prefixes.append(wdForm[cursor]); result.add(new Pair(prefix+"="+prefixes.toString(), Constants.one)); } // suffixes. for(cursor = wdForm.length-1; cursor >= wdForm.length-5; cursor--) { suffixes.insert(0, wdForm[cursor]); result.add(new Pair(suffix+"="+suffixes.toString(), Constants.one)); } } // now do "contains hyphen", "contains number", "contains uppercase letter" and contains fused NE connecter (_) features. // also from Ratnaparkhi (1996). if(current.getForm().contains("-")) { result.add(new Pair(hyphen, Constants.one)); } if(current.getForm().matches(".*[0-9]+.*")) { result.add(new Pair(num, Constants.one)); } if(!current.getForm().toLowerCase().equals(current.getForm())) { result.add(new Pair(caps, Constants.one)); } // if we see a NE connector, this is likely a NNP (in English, e.g.). if(current.getForm().contains(neConnecter)) { result.add(new Pair(neConn, Constants.one)); } return result; } /** * Get the features for a sentence of words in context. * training == true iff the output classes are to be collected as well. */ public List>> getSentenceFeatures(Map sentence, boolean training) { List>> result = new ArrayList>>(30); List keys = new ArrayList(sentence.keySet().size()); for(Integer wordIndex : sentence.keySet()) { keys.add(wordIndex); } Collections.sort(keys); for(Integer wordIndex : keys) { result.add(getFeatures(sentence, wordIndex, training)); } return result; } public static void main(String[] args) throws IOException { String usage = "POSTagFex (-h [gets this message]) (-i [defaults to ]) (-o [defaults to ])\n"+ " (-p [.flm] -v )\n"; if(args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } String input = "", output = "", priorModF = null, priorVocab = null; for(int j = 0; j < args.length; j++) { if(args[j].equals("-i")) { input = args[++j]; continue; } if(args[j].equals("-o")) { output = args[++j]; continue; } if(args[j].equals("-p")) { priorModF = args[++j]; continue; } if(args[j].equals("-v")) { priorVocab = args[++j]; continue; } System.err.println("Unrecognized option: " + args[j]); } SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator( input.equals("") ? 
new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(input)))); BufferedWriter out = new BufferedWriter( output.equals("") ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)))); POSPriorModel posPriorMod = null; if(priorModF != null) { posPriorMod = new POSPriorModel(priorModF, priorVocab); } POSTagFex fexer = new POSTagFex(posPriorMod); for(List sentence : corp) { Map sent = new HashMap(sentence.size()); int index = 0; for(Word w : sentence) { sent.put(index++, new TaggedWord(w)); } List>> ftss = fexer.getSentenceFeatures(sent, true); for(Collection> fts : ftss) { index = 0; for(Pair ft : fts) { // if we're at the first item, print out the label. if (index == 0) { out.write(ft.a); } else { out.write(" " + ft.a + ":" + ft.b); } index++; } out.write(System.getProperty("line.separator")); } } out.flush(); } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/JavaSupertaggingApp.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger; import opennlp.ccg.parse.tagger.util.ResultSink; import opennlp.ccg.parse.supertagger.ml.STFex; import opennlp.ccg.parse.supertagger.ml.FeatureExtractor; import opennlp.ccg.parse.tagger.ml.ZLMEM; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.parse.tagger.io.PipeDelimitedFactoredBundleCorpusIterator; import opennlp.ccg.parse.tagger.Constants; import java.io.*; import java.util.*; import static java.util.Arrays.*; import joptsimple.*; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.supertagger.io.*; import opennlp.ccg.parse.supertagger.ml.*; import opennlp.ccg.parse.supertagger.util.*; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * @version $Revision: 1.6 $, $Date: 2010/09/21 04:12:41 $ */ public class JavaSupertaggingApp { public static void main(String[] args) throws Exception { try { // instantiate command-line option parser, setting up type-safe expectations about // what should be passed for the options. 
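// A hypothetical tagging-mode invocation (file names are illustrative only):
//   java opennlp.ccg.parse.supertagger.JavaSupertaggingApp --tag -d SRILM -i corpus.srilm \
//       -o tags.out -m st.mod -w worddict.xml -p posdict.xml -K 20 --beta 0.1 --seqModel=st.lm
// The accepted options and their argument types are declared just below.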
OptionParser parser = new OptionParser(); parser.acceptsAll(asList("train", "R"), "extract training features."); parser.acceptsAll(asList("tag", "T"), "supertag a POS-tagged file."); parser.acceptsAll(asList("test","E"), "test tagger against gold standard."); parser.acceptsAll(asList("tagdictextract", "D"), "extract tagging dictionaries."); parser.acceptsAll(asList("h", "?"), "show help."); OptionSpec tokenisation = parser.acceptsAll(asList("delimiter", "d")).withRequiredArg().ofType(String.class).describedAs("SRILM factor bundles or C&C-style" + "(pipe-delimited) factor bundles [choose one of: \"SRILM\", \"candc\"]"); OptionSpec goldstandspec = parser.acceptsAll(asList("g","gold")).withRequiredArg().ofType(File.class). describedAs("the gold standard tagged file [file must have same bundle format as input corpus, \"SRILM\" or \"candc\"]"); OptionSpec inputspec = parser.acceptsAll(asList("i", "input")).withRequiredArg().ofType(File.class).describedAs("training or tagging/testing file"); OptionSpec outputspec = parser.acceptsAll(asList("o", "output")).withRequiredArg().ofType(File.class).describedAs("output location (for training feats or tags)"); OptionSpec modspec = parser.acceptsAll(asList("m", "model")).withRequiredArg().ofType(File.class).describedAs("textual model file (ZhangLe maxent-style) [for tagging/testing only]"); OptionSpec priormodspec = parser.acceptsAll(asList("priorModelF")).withRequiredArg().ofType(String.class).describedAs("config file for ARPA-formatted FLM [for tagging/testing and feature extraction"+ "MUST also give vocab file]"); OptionSpec vocabspec = parser.acceptsAll(asList("vocabF")).withRequiredArg().ofType(String.class).describedAs("vocab file for ARPA-formatted FLM [for tagging/testing and feature extraction]"); OptionSpec kspec = parser.accepts("K").withRequiredArg().ofType(Integer.class).describedAs("K parameter of Clark and Curran [for tagging/testing only]"); OptionSpec betaspec = parser.accepts("beta").withRequiredArg().ofType(Double.class).describedAs("beam width for supertagger [for tagging only]"); OptionSpec wdictspec = parser.acceptsAll(asList("w", "worddict")).withRequiredArg().ofType(File.class).describedAs("path to the word-based tagging dictionary file"); OptionSpec pdictspec = parser.acceptsAll(asList("p", "posdict")).withRequiredArg().ofType(File.class).describedAs("path to the POS-based tagging dictionary file"); OptionSpec seqModel = parser.acceptsAll(asList("s","seqModel")).withOptionalArg().ofType(String.class).describedAs("the tag sequence model (for forward-backward tagging)"); OptionSpec fbBeam = parser.acceptsAll(asList("fbBeamWidth")).withOptionalArg().ofType(Integer.class).describedAs("maximum width of the forward-backward beam [default = 5]"); OptionSpec tagAlgorithm = parser.acceptsAll(asList("taggingAlgorithm")).withOptionalArg().ofType(String.class).describedAs("tagging algorithm. choose from {forward-backward, forward} [default = forward-backward]"); OptionSet options = parser.parse(args); if (options.has("?") || args.length == 0) { parser.printHelpOn(System.out); System.exit(0); } assert (options.valueOf(tokenisation).equalsIgnoreCase("candc") || options.valueOf(tokenisation).equalsIgnoreCase("srilm")); // Must say whether we are tagging (or testing) or training (extracting features, actually). assert (options.has("tag") || options.has("train") || options.has("test") || options.has("D")); // Can't both train and tag/test, or train and extract tagging dict, or tag/test and do the last. 
assert !(options.has("train") && (options.has("tag") || options.has("test"))); assert !(options.has("train") && options.has("D")); assert !((options.has("tag") || options.has("test")) && options.has("D")); // either we're doing forward-backward tagging, or we're not. assert (options.has("seqModel") || !(options.has("seqInterp") || options.has("fbBeamWidth"))); // Can't have a model file input when we are training.... assert !(options.has("train") && options.has("m")); // ... or when extracting a tag dict. assert !(options.has("D") && options.has("m")); // Must have tagging dict files when tagging or extracting tag dicts, // and additionally beta and K when tagging . assert (!(options.has("tag") || options.has("test") || options.has("D")) || (options.has("p") && options.has("w"))); assert (!(options.has("tag") || options.has("test")) || (options.has("K") && options.has("beta"))); // can't use prior model if no vocab file is given (so that the prior model knows which // classes to make probabilistic predictions over) or no POS dictionary is given (so // that we can restrict our priors to those supertags that have occurred with a particular // POS). assert (!(options.has("priorModelF") && (!options.has("vocabF") || !options.has("p")))); STPriorModel stPrior = null; if (options.has("priorModelF")) { stPrior = new STPriorModel(options.valueOf(priormodspec), options.valueOf(vocabspec), new XMLPOSDictionaryReader(options.valueOf(pdictspec)).read()); } if (options.has("tag") || options.has("test")) { long start = System.currentTimeMillis(); // tag (and potentially measure performance against the gold-standard). //File mod = options.valueOf(modspec); //Integer k = options.valueOf(kspec); Double beta = options.valueOf(betaspec); ZLMEM maxentModel; String seqMod = options.has("seqModel") ? options.valueOf(seqModel) : null; Integer fbWidth = options.has("fbBeamWidth") ? options.valueOf(fbBeam) : 5; String algStr = options.has("taggingAlgorithm") ? options.valueOf(tagAlgorithm) : "forward-backward"; Constants.TaggingAlgorithm alg = algStr.equalsIgnoreCase("forward") ? Constants.TaggingAlgorithm.FORWARD : Constants.TaggingAlgorithm.FORWARDBACKWARD; STTaggerWordDictionary wd = null; STTaggerPOSDictionary pd = null; if(options.has("w")) wd = new XMLWordDictionaryReader(options.valueOf(wdictspec)).read(); if(options.has("p")) pd = new XMLPOSDictionaryReader(options.valueOf(pdictspec)).read(); WordAndPOSDictionaryLabellingStrategy tagger = new WordAndPOSDictionaryLabellingStrategy( wd, pd, (options.has("K") ? 
options.valueOf(kspec).intValue() : 20), maxentModel = new ZLMEM(options.valueOf(modspec)), new STFex(stPrior), seqMod, alg); tagger.setMaxSearchBeam(fbWidth); maxentModel.verbose = true; Iterator> corpus = null; Iterator> goldCorpus = null; if(options.valueOf(tokenisation).equalsIgnoreCase("srilm")) { corpus = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(inputspec)))); } else if(options.valueOf(tokenisation).equalsIgnoreCase("candc")) { corpus = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(inputspec)))); } if(options.has("test") && options.valueOf(tokenisation).equalsIgnoreCase("srilm")) { goldCorpus = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(goldstandspec)))); } else if(options.has("test") && options.valueOf(tokenisation).equalsIgnoreCase("candc")) { goldCorpus = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(options.valueOf(goldstandspec)))); } BufferedWriter outf = new BufferedWriter(new FileWriter(options.valueOf(outputspec))); boolean test = options.has("test"); ResultSink results = new ResultSink(); int sentCnt = 0; tagger.setBetas(new double[] {beta}); while(corpus.hasNext()) { sentCnt++; List sent = corpus.next(); List>> taggings = tagger.multitag(sent, beta); if(test) { List goldsent = goldCorpus.next(); results.addSent(taggings, goldsent); } Iterator sentiter = sent.iterator(); // output file format = word goldtag tag1 ... tagK outf.write(""+System.getProperty("line.separator")); for(List> tagging : taggings) { Word nextw = sentiter.next(); outf.write(nextw.getForm() + "\t1\t" + nextw.getPOS() + "\t1.0\t" + tagging.size() + "\t");// + nextw.getSupertag() + " "); //outf.write(nextw.getForm() + "|||"+ nextw.getStem() + "|||" + nextw.getPOS() + "|||"); String tags = ""; for(Pair tg : tagging) { //tags+="^"+tg.b+":"+tg.a; tags+= "\t" + tg.b + "\t"+tg.a; } // write out the multitagging, minus the initial space (tab). outf.write(tags.substring(1) + System.getProperty("line.separator")); //// write out the multitagging, minus the initial ^. //outf.write(tags.substring(1) + " "); } outf.write(""+System.getProperty("line.separator")); if(sentCnt % 10 == 0) { outf.flush(); } } outf.flush(); outf.close(); if(test) { System.err.println(results.report()); } long end = System.currentTimeMillis(); System.err.println("Time to tag: " + ((end - start + 0.0)/1000) + " seconds."); } else if (options.has("tagdictextract")) { // extract tagging dictionaries. File wd = options.valueOf(wdictspec); File pd = options.valueOf(pdictspec); File inf = options.valueOf(inputspec); TaggingDictionaryExtractor tde = new TaggingDictionaryExtractor(inf,wd,pd,options.valueOf(tokenisation)); System.err.println("Extracting dictionaries from: "+inf.toString()+" into files: "+wd.toString()+" and: "+pd.toString()+"\n(wdict and posdict, resp.)."); tde.extract(); } else { // train (extract features). File inf = options.valueOf(inputspec); File outf = options.valueOf(outputspec); FeatureExtractor fexer = (stPrior == null) ? new STFex() : new STFex(stPrior); ZhangLeTrainingExtractor fexApp = new ZhangLeTrainingExtractor(inf, outf, options.valueOf(tokenisation), fexer); System.err.println("Extracting features from file: " + inf.toString() + ", and placing extracted features in: " + outf.toString() + "."); fexApp.writeFeats(); } } catch (OptionException e) { throw e; } catch (Exception e) { throw e; //System.err.println("Something went wrong. 
Double-check your inputs."); } } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/LabellingStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger; import java.util.Collection; import java.util.List; import opennlp.ccg.lexicon.Word; import opennlp.ccg.util.Pair; //import ml.MaxentModel; /** * @author Dennis N. Mehay * @version $Revision: 1.2 $, $Date: 2010/09/21 04:12:41 $ */ public interface LabellingStrategy { /** * A method to reset the K parameter (for word dictionaries). * @param newK An int to replace the int value of K. * @returns null. */ public void setK(int newK); /** Multitag a TaggedWord and throw away the probabilities. */ public List multitag(Word thisword, Collection> context, double beta); /** Multitag a TaggedWord and retain the probabilities. */ public List> multitagWithScores(Word thisword, Collection>contex, double beta); } ================================================ FILE: src/opennlp/ccg/parse/supertagger/WordAndPOSDictionaryLabellingStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import opennlp.ccg.parse.supertagger.io.XMLWordDictionaryReader; import opennlp.ccg.parse.supertagger.io.XMLPOSDictionaryReader; import opennlp.ccg.parse.supertagger.ml.STFex; import opennlp.ccg.parse.supertagger.ml.FeatureExtractor; import opennlp.ccg.parse.tagger.ProbIndexPair; import opennlp.ccg.parse.tagger.TaggedWord; import opennlp.ccg.parse.tagger.Constants; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.*; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.*; import opennlp.ccg.parse.Supertagger; import opennlp.ccg.parse.postagger.DummyPOSTagger; import opennlp.ccg.parse.postagger.POSTagger; import opennlp.ccg.parse.supertagger.ml.STPriorModel; import opennlp.ccg.parse.tagger.ml.MaxentModel; import opennlp.ccg.parse.supertagger.util.*; import opennlp.ccg.parse.tagger.Constants.TaggingAlgorithm; import opennlp.ccg.parse.tagger.ml.ZLMEM; import opennlp.ccg.util.Pair; import opennlp.ccg.parse.tagger.sequencescoring.SequenceScorer; import opennlp.ccg.parse.tagger.util.ConfigFileProcessor; import opennlp.ccg.parse.tagger.util.ResultSink; /** * A `labelling strategy' for a CCG supertagger that * restricts the output of the model based on word and POS `tagging * dictionaries' in the following way: * * if a word w occurs at least K times in training, the model's output * is constrained to the outcomes seen with w during training. If w * did not occur at least K times during training, the model's output is * constrained to the outcomes seen with w's POS tag during training. * In the off chance that the POS tag was not seen in training, the model's * best prediction is used. * * @author Dennis N. Mehay * @version $Revision: 1.22 $, $Date: 2011/03/22 03:20:25 $ */ public class WordAndPOSDictionaryLabellingStrategy implements LabellingStrategy, Supertagger { // print warnings? private boolean verbose = false; // use tagging dictionaries? private boolean useWordDict = false; private boolean usePOSDict = false; private SequenceScorer seqScorer = null; private STTaggerWordDictionary wd; private STTaggerPOSDictionary pd; private int K, usualK, finalK; private MaxentModel mo; // extracts features from the context of a word. private FeatureExtractor fexer = new STFex(); // postagger for non-gold-POS supertagging. private POSTagger posTagger; // the current tagging. private List tagging; // POS-specific multipliers to "tighten" or "loosen" up the tagging beam width // ("beta") as needed. E.g., the beta for period/full stop might not need to be // very permissive, while those for lexical verbs or some fancy punctuation marks // might need to be. public Map betaMultipliers = new HashMap(); public double minMultiplier = 1.0; /** Constructor without n-gram model (for scoring tag sequences). */ public WordAndPOSDictionaryLabellingStrategy(STTaggerWordDictionary wd, STTaggerPOSDictionary pd, int K, MaxentModel mo, FeatureExtractor fexer) { this(wd, pd, K, mo, fexer, null, null); } /** Constructor WITH n-gram model (for scoring tag sequences). 
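* Delegates to the full constructor with a {@code DummyPOSTagger}, i.e. the POS tags already
* present on the input words are used as-is rather than re-predicted.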
*/ public WordAndPOSDictionaryLabellingStrategy( STTaggerWordDictionary wd, STTaggerPOSDictionary pd, int K, MaxentModel mo, FeatureExtractor fexer, String tagSequenceModel, Constants.TaggingAlgorithm alg) { this(wd, pd, K, mo, fexer, tagSequenceModel, alg, new DummyPOSTagger()); } /** Constructor with n-gram model and POS tagger */ public WordAndPOSDictionaryLabellingStrategy( STTaggerWordDictionary wd, STTaggerPOSDictionary pd, int K, MaxentModel mo, FeatureExtractor fexer, String tagSequenceModel, Constants.TaggingAlgorithm alg, POSTagger posTagger) { this.wd = wd; this.pd = pd; this.mo = mo; this.posTagger = posTagger; if (K > 0) { this.K = K; } else { this.K = 0; } usualK = K; finalK = K; this.fexer = fexer; try { if (tagSequenceModel != null) { // find the n-gram order of the model. int ord = SequenceScorer.findOrder(tagSequenceModel); // load it into the SequenceScorer. seqScorer = new SequenceScorer(ord, tagSequenceModel); seqScorer.setSearchBeam(5); seqScorer.setAlgorithm(alg); } } catch (IOException ex) { Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex); } } public void useWordDict(boolean useIt) { useWordDict = useIt; } public void usePOSDict(boolean useIt) { usePOSDict = useIt; } public WordAndPOSDictionaryLabellingStrategy(STTaggerWordDictionary wd, STTaggerPOSDictionary pd, int K, MaxentModel mo) { this(wd, pd, K, mo, new STFex()); } /** * Set the maximum width of the beam in the forward-backward tagger. */ public void setMaxSearchBeam(int maxSearchBeam) { if (seqScorer != null) seqScorer.setSearchBeam(maxSearchBeam); } /** * Reset the K parameter. */ public void setK(int newK) { this.K = newK; } /** * Set the usual K parameter. */ public void setUsualK(int newK) { this.usualK = newK; } /** * Set the final K parameter. */ public void setFinalK(int newK) { this.finalK = newK; } /** * A method that returns all labels given by the model that both (1) are assigned a probability `p' s.t.: * p >= (`beta' * best), where `beta' is a factor passed in by the client of this method and * best is the probability of the most probable outcome of the model, and (2) (if the word * (obtained from the String[] `context') has occurred at least K times in training) * are in the STTaggerWordDictionary under the entry for said word. If the word did * not occur at least K times in training, the output set is constrained by a STTaggerPOSDictionary. * In the off chance that a POS did not occur in the training data, the model's predictions themselves are * submitted to the `beta constraint'. * * @param context A String[] of contextual predicates (in the maximum entropy modelling sense) * @param mo A MaxentModel. * @param beta A double specifying how close in probability all returned outcomes must be. * @return An ArrayList of labels that meet the above constraints. */ public List<String> multitag(Word w, Collection<Pair<String,Double>> context, double beta) { List<Pair<Double,String>> temp = this.multitagWithScores(w, context, beta); ArrayList<String> res = new ArrayList<String>(temp.size()); for (Pair<Double,String> t : temp) { res.add(t.b); } return res; } /** * A method to return the set of labels that are greater than or equal to * the best label multiplied by a factor `beta', given a model and a String[] * of contextual predicates. * @param thisWord an opennlp.ccg.lexicon.Word representing the current word being tagged. * @param context A Collection<Pair<String,Double>> of contextual predicates * (in the maximum entropy modelling sense) with their corresponding activations (real-valued, hence the * Double).
* @param model A model for generating the base predictions. * @param beta A positive double specifying how close to the best label * each label returned must be. * @return An ArrayList> of the outcomes * {o: score(o)>=[beta * score(bestLabel)]}. */ public List> multitagWithScores(Word thisWord, Collection> context, double beta) { // All the scores of the outcomes (the index of each double score // is the key which allows us to retrieve the outcome from the model). double[] ocs = mo.eval(context); // Sort in descending order of probability. ProbIndexPair[] sortedOutcomes = new ProbIndexPair[ocs.length]; for (int i = 0; i < ocs.length; i++) { sortedOutcomes[i] = new ProbIndexPair(new Double(ocs[i]), new Integer(i)); } Arrays.sort(sortedOutcomes); String tempOutcome = ""; String word = thisWord.getForm(); String pos = thisWord.getPOS(); ArrayList> retVal = new ArrayList>(30); // Find the best outcomes seen with the word in training that // meet the `beta' constraint. // ******************************************************************************************* double bestOutcomeProb, currentOutcomeProb; bestOutcomeProb = 0; // mww: changed to not always include front of list, as it may not meet dict constraints ProbIndexPair temp; // Now loop to see how many make the cut. // (But make sure to be sensitive to the dictionary, if necessary.) // See whether the word has a freq of this.K in the training corpus. Collection wordPermittedOutcomes = (wd != null) ? this.wd.getEntry(word, this.K) : null; if (wordPermittedOutcomes != null && useWordDict) { // The word (lemma) was seen at least K times in training. // Get all beta-OK outcomes that are in the dictionary entry. for (int ocInd = 0; ocInd < sortedOutcomes.length; ocInd++) { temp = sortedOutcomes[ocInd]; tempOutcome = mo.getOutcome(temp.b.intValue()); currentOutcomeProb = temp.a.doubleValue(); if (wordPermittedOutcomes.contains(tempOutcome)) { if (bestOutcomeProb == 0) { bestOutcomeProb = currentOutcomeProb; } if (currentOutcomeProb >= (bestOutcomeProb * beta)) { // Beta constraint. // The cut-off was met, add the outcome. retVal.add(new Pair(temp.a, tempOutcome)); // update max, for first selected outcome if (currentOutcomeProb > bestOutcomeProb) { bestOutcomeProb = currentOutcomeProb; } } else { // Else, since our ProbIndexPair[] is sorted by probablity, there will be no more // outcomes that make the (beta) cut. break; } } // If the word is not in the dictionary specified outcomes, move along. } } else { // Revert to the POS dictionary. Collection posPermittedOutcomes = null; if (pos != null) { posPermittedOutcomes = (pd != null) ? this.pd.getEntry(pos) : null; } else { if(verbose) { System.err.println("warning: null POS for: " + word);} // mww: check for null pos } if (posPermittedOutcomes != null && usePOSDict) { // Get all beta-OK outcomes that are in the POS dictionary entry. for (int ocInd2 = 0; ocInd2 < sortedOutcomes.length; ocInd2++) { temp = sortedOutcomes[ocInd2]; tempOutcome = mo.getOutcome(temp.b.intValue()); currentOutcomeProb = temp.a.doubleValue(); if (posPermittedOutcomes.contains(tempOutcome.trim())) { if (bestOutcomeProb == 0) { bestOutcomeProb = currentOutcomeProb; } if (currentOutcomeProb >= (bestOutcomeProb * beta)) { // Beta constraint. // Made the cut-off, add the outcome. 
retVal.add(new Pair(temp.a, tempOutcome)); // update max, for first selected outcome if (currentOutcomeProb > bestOutcomeProb) { bestOutcomeProb = currentOutcomeProb; } } else { // Else, since our ProbIndexPair[] is sorted by probablity, there will be no more // outcomes that make the (beta) cut. break; } } // If the word is not in the dictionary specified outcomes, move along. } } else { // Otherwise, just get all model predictions that meet the beta constraint, // ignoring the word and POS dictionaries. for (int ocInd3 = 0; ocInd3 < sortedOutcomes.length; ocInd3++) { temp = sortedOutcomes[ocInd3]; currentOutcomeProb = temp.a.doubleValue(); if (bestOutcomeProb == 0) { bestOutcomeProb = currentOutcomeProb; } if (currentOutcomeProb >= (bestOutcomeProb * beta)) { // Made the cut-off, add the outcome. retVal.add(new Pair(temp.a, mo.getOutcome(temp.b.intValue()))); // update max, for first selected outcome if (currentOutcomeProb > bestOutcomeProb) { bestOutcomeProb = currentOutcomeProb; } } else { // Else, since our ProbIndexPair[] is sorted by probability, there will be no more // outcomes that make the cut. break; } } } } // include the gold standard tag, if not in there. if(includeGold) { // assume input word has the gold tag in it. String gold = thisWord.getSupertag(); // check whether gold is in the output. boolean containsGold = false; for(Pair tg : retVal) { if(tg.b.equals(gold)) { containsGold = true; break; } } if(!containsGold) { // insert it containsGold = false; for(ProbIndexPair oc : sortedOutcomes) { if(mo.getOutcome(oc.b).equals(gold)) { retVal.add(new Pair(oc.a, mo.getOutcome(oc.b))); containsGold = true; break; } } } if(!containsGold) { // if the gold-standard still isn't in there, it must not be part of the tag set, add it with epsilon probability. // we're assuming that gold tags are needed for a training routine that doesn't care about supertag probabilities // (as in Clark and Curran (2007)). // check to see whether we are in the log domain (by checking for negative scores -- kind of a hack). retVal.add(new Pair((sortedOutcomes[0].a < 0) ? -99 : 1.0112214926104486e-43, thisWord.getSupertag())); } } // ******************************************************************************************* return retVal; } // get the current tagging (now only used to grab the POS tagging). public List getCurrentTagging() { return tagging; } // set the current tagging (now only used to set the current POS tagging). public void setCurrentTagging(List tgging) { tagging = tgging; } public List>> multitag(List sentence, double beta) { List>> results = new ArrayList>>(sentence.size()); Map sent = new TreeMap(); int cnt = 0; List taggedSent = posTagger.tagSentence(sentence); setCurrentTagging(taggedSent); for (TaggedWord werd : taggedSent) { sent.put(new Integer(cnt++), werd); } List>> contexts = fexer.getSentenceFeatures(sent); // Iterate simultaneously through both the words and the contextual features. Iterator wds = sentence.iterator(); Word w = null; Iterator>> ctxts = contexts.iterator(); Collection> context = null; int cursor = 0; while (wds.hasNext() && ctxts.hasNext()) { // get the next word. w = wds.next(); if(w.getPOS() == null) { w = Word.createFullWord(w, w.getForm(), tagging.get(cursor).getPOSTagging().get(0).b, w.getSupertag(), w.getSemClass()); } context = ctxts.next(); if (seqScorer != null) { // increase the tag ambiguity (for re-scoring using forward-backward). 
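// For example, with beta = 0.08 and no POS-specific multipliers (minMultiplier = 1.0),
// newBeta = min(0.08 * 1.0, 0.08 / 8) = 0.01, so the beam feeding the rescorer is eight times
// more permissive than the final one; very small betas (< 1e-5) are only halved instead.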
double newBeta = Math.min(beta * minMultiplier, beta / 8); if(beta < 0.00001) { newBeta = Math.min(beta * minMultiplier, beta / 2); } results.add(multitagWithScores(w, context, newBeta)); } else { results.add(multitagWithScores(w, context, beta)); } cursor++; } List>> finalResults = null; if (seqScorer != null) { // rescore and filter. pass in input sentence (in case, e.g., we have set the includeGold flag). finalResults = betaBestFilter(seqScorer.rescoreSequence(results), beta, sentence); } else { finalResults = results; } return finalResults; } /** * Return a beta-best filtered subset of the tags in each multitagging list (each multitagging list is assumed to be non-empty). */ private List>> betaBestFilter(List>> multitaggings, double beta, List inputSentence) { List>> res = new ArrayList>>(multitaggings.size()); int wordIndex = 0; for (List> mtagging : multitaggings) { List> tempTagging = new ArrayList>(mtagging.size()); Word thisWord = inputSentence.get(wordIndex); // set to a (possibly different, possibly less restrictive?) beta if this POS has a beta multiplier set. Double bmult = betaMultipliers.get(thisWord.getPOS()); double possiblyNewBeta = Math.min(1.0, (bmult != null) ? (bmult * beta) : beta); double best = mtagging.get(0).a; for (Pair tg : mtagging) { if (tg.a >= (possiblyNewBeta * best) || (includeGold && tg.b.equals(thisWord.getSupertag()))) { tempTagging.add(tg); } else { if(!includeGold) { // if we're not still fishing for gold... // ...stop, since they're in sorted order. break; } } } res.add(tempTagging); wordIndex++; } return res; } //------------------------------------------------------------------------- // Supertagger interface methods (added by Michael White) /** * The sequence of beta values to use in tagging. */ protected double[] betas = null; /** * The current betaIndex. */ protected int betaIndex = 0; /** * The current tagging. */ protected List>> currentTagging = null; /** * The current word. */ protected int currentWord = 0; /** * Flag for whether to include gold tags. */ protected boolean includeGold = false; /** Sets the beta values. */ public void setBetas(double[] betas) { this.betas = betas; } /** Returns all the beta values. */ public double[] getBetas() { return betas; } /** Returns the current beta value. */ public double getCurrentBetaValue() { return betas[betaIndex]; } /** * Advances beta to the next most restrictive setting. */ public void nextBeta() { betaIndex++; } /** * Advances beta to the next less restrictive setting. */ public void previousBeta() { betaIndex--; } /** * Returns whether there are any less restrictive beta settings * remaining in the sequence. */ public boolean hasMoreBetas() { return betaIndex < betas.length - 1; } /** * Returns whether there are any more restrictive beta settings * remaining in the sequence. */ public boolean hasLessBetas() { return betaIndex > 0; } /** * Resets beta to the most restrictive value. */ public void resetBeta() { betaIndex = 0; } /** * Resets beta to the least restrictive value. */ public void resetBetaToMax() { betaIndex = betas.length - 1; } /** * Sets the flag for whether to include gold tags. */ public void setIncludeGold(boolean includeGold) { this.includeGold = includeGold; } /** * Maps the given words to their predicted categories, * so that the beta-best categories can be returned by calls to setWord * and getSupertags. 
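* While less restrictive beta settings remain in the sequence, the usual K is used for the word
* dictionary cutoff; once the last (least restrictive) beta is reached, the final K takes over.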
*/ public void mapWords(List words) { if(hasMoreBetas()) { K = usualK; } else { K = finalK; } currentTagging = multitag(words, getCurrentBetaValue()); } /** * Sets the current word to the one with the given index, * so that the beta-best categories for it can be returned by a call to * getSupertags. */ public void setWord(int index) { currentWord = index; } /** * Returns the supertags of the desired categories for the current lexical lookup * as a map from supertags to contextual probabilities (or null to accept all). */ public Map getSupertags() { Map retval = new HashMap(); List> tags = currentTagging.get(currentWord); for (Pair tag : tags) { retval.put(tag.b, tag.a); } return retval; } /** * A factory method to make a supertagger from a config file (see the sample config file: * * $OPENCCG_HOME/ccgbank/models/supertagger/st.config * * for more information). */ @SuppressWarnings("unused") public static WordAndPOSDictionaryLabellingStrategy supertaggerFactory(String configFile) { WordAndPOSDictionaryLabellingStrategy res = null; String[] pathKeys = { "priormodel", "priormodelvocab", "sequencemodel", "wdict", "posdict", "maxentmodel", "posconfig" }; Map opts = ConfigFileProcessor.readInConfig(configFile, pathKeys); boolean verbose = (opts.get("verbose").equals("true")) ? true : false; // 'S' is for string repr. String priorModS = opts.get("priormodel"), priorVocabS = opts.get("priormodelvocab"), seqModS = opts.get("sequencemodel"), wDictS = opts.get("wdict"), pDictS = opts.get("posdict"), firstKS = opts.get("firstk"), lastKS = opts.get("lastk"), maxentModS = opts.get("maxentmodel"), posConfigS = opts.get("posconfig"), betasS = opts.get("betas"), betaMults = opts.get("betamultipliers"), // POS-specific multipliers to "tighten" or "loosen" up the tagging beam width. includeGold = opts.get("includegold"); assert (maxentModS != null) : "Empty maxent model."; // either use prior model (and have prior vocab specified) or not. assert (priorModS != null && priorVocabS != null) || (priorModS == null && priorVocabS == null) : "using prior model with no vocab file."; // ensure that there are word- and pos-keyed tagging dicts if there // is no st prior model. assert (wDictS == null || pDictS == null) && priorModS == null : "need tagging dicts if no supertagging prior model and prior vocab are specified."; // need the POS-keyed tagging dict, no matter what. assert (priorModS != null && pDictS == null) : "need POS-keyed tagging dict for prior model."; // need 'K' values if not using tagging dicts. assert (priorModS == null || (firstKS != null & lastKS != null)) : "need to specify first and last 'K' value when not using prior model."; // seqMod probably shouldn't be null. warn if in verbose mode. if(seqModS == null && verbose) { System.err.println("Warning: empty sequence model. Performance will suffer."); } STPriorModel priorM = null; if(priorModS != null && priorVocabS != null) { try { priorM = new STPriorModel(priorModS, priorVocabS); } catch (IOException ex) { Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex); } } STFex fex = new STFex(priorM); STTaggerWordDictionary wD = (wDictS != null) ? new XMLWordDictionaryReader(new File(wDictS)).read() : null; STTaggerPOSDictionary pD = (pDictS != null) ? new XMLPOSDictionaryReader(new File(pDictS)).read() : null; int kay = (opts.get("firstk") == null) ? 20 : Integer.parseInt(opts.get("firstk")), firstK, lastK; firstK = (opts.get("firstk") == null) ? 
20 : Integer.parseInt(opts.get("firstk")); lastK = (opts.get("lastk") == null) ? 100 : Integer.parseInt(opts.get("lastk")); double[] betaz = new double[betasS.split("\\s+").length]; int cursor = 0; for(String beta : betasS.split("\\s+")) { betaz[cursor++] = Double.parseDouble(beta); } // should we use the tagging dictionaries (yes if there is no prior model). boolean useWordDictionary = (wDictS != null); boolean usePOSDictionary = (pDictS != null); POSTagger pTagger = (posConfigS == null) ? null : POSTagger.posTaggerFactory(posConfigS); TaggingAlgorithm alg = (opts.get("taggingalgorithm") == null || opts.get("taggingalgorithm").equals("forward-backward")) ? TaggingAlgorithm.FORWARDBACKWARD : TaggingAlgorithm.FORWARD; MaxentModel mem = new ZLMEM(new File(maxentModS)); //STTaggerWordDictionary wd,STTaggerPOSDictionary pd, int K, MaxentModel mo, FeatureExtractor fexer, //String tagSequenceModel, Constants.TaggingAlgorithm alg, POSTagger posTagger res = (pTagger != null) ? new WordAndPOSDictionaryLabellingStrategy(wD, pD, kay, mem, fex, seqModS, alg, pTagger) : new WordAndPOSDictionaryLabellingStrategy(wD, pD, kay, mem, fex, seqModS, alg); res.setK(kay); res.setUsualK(firstK); res.setFinalK(lastK); res.setBetas(betaz); res.useWordDict(useWordDictionary); res.usePOSDict(usePOSDictionary); res.setIncludeGold((opts.get("includegold") == null || opts.get("includegold").equals("false")) ? false : true); // get POS-specific beta multipliers (as a string of pairs -- all space delimited). if(betaMults != null) { String[] bmts = betaMults.split("\\s+"); for(int a=0, b=1; b < bmts.length; a = a + 2, b = b + 2) { double mul = Double.parseDouble(bmts[b]); res.betaMultipliers.put(bmts[a], mul); if(mul < res.minMultiplier) { res.minMultiplier = mul; } } } return res; } public static void main(String[] args) { String usage = "\nWordAndPOSDictLabellingStrategy (-h [gets this message]) -e [defaults to not testing] -c > -beta [0.0,1.0]\n"+ " (-i [default=]) (-o [default=])\n"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } SRILMFactoredBundleCorpusIterator in = null; BufferedWriter out = null; try { String inputCorp = "", output = "", configFile = null; double beta = 1.0; boolean test = false; for (int i = 0; i < args.length; i++) { if (args[i].equals("-i")) { inputCorp = args[++i]; continue; } if (args[i].equals("-o")) { output = args[++i]; continue; } if (args[i].equals("-e")) { test = true; continue; } if (args[i].equals("-c")) { configFile = args[++i]; continue; } if (args[i].equals("-beta")) { beta = Double.parseDouble(args[++i]); continue; } System.out.println("Unrecognized option: " + args[i]); } ResultSink rs = new ResultSink(ResultSink.ResultSinkType.SUPERTAG); try { in = new SRILMFactoredBundleCorpusIterator( (inputCorp.equals("")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp)))); } catch (FileNotFoundException ex) { System.err.print("Input corpus " + inputCorp + " not found. Exiting..."); Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex); System.exit(-1); } try { out = (output.equals("")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output))); } catch (IOException ex) { System.err.print("Output file " + output + " not found. 
Exiting..."); Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex); System.exit(-1); } WordAndPOSDictionaryLabellingStrategy stgger = WordAndPOSDictionaryLabellingStrategy.supertaggerFactory(configFile); // for each sentence, print out: // // w1 ... ... // ... // wN ... ... // for (List inLine : in) { List>> taggedSent = stgger.multitag(inLine, beta); if(test) { rs.addSent(taggedSent, inLine); } // beginning of sentence... out.write("" + System.getProperty("line.separator")); List posTagging = stgger.getCurrentTagging(); int cursor = -1; while(++cursor < taggedSent.size()) { Word wdIn = inLine.get(cursor); // word form... out.write(wdIn.getForm()); TaggedWord posT = posTagging.get(cursor); // print out number of POS tags, followed by tab-separated probabilized POS tagging. out.write("\t" + posT.getPOSTagging().size()); for(Pair pt : posT.getPOSTagging()) { out.write("\t" + pt.b + "\t" + pt.a); } // now print out number of and list of tab-separated, probabilized supertags. out.write("\t" + taggedSent.get(cursor).size()); for(Pair stg : taggedSent.get(cursor)) { out.write("\t" + stg.b + "\t" + stg.a); } out.write(System.getProperty("line.separator")); } out.write("" + System.getProperty("line.separator")); } out.flush(); if(test) { System.err.println(rs.report()); } } catch (IOException ex) { Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex); } finally { try { out.close(); in.close(); } catch (IOException ex) { Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex); } } } } // End class WordPOSDictLabellingStrategy ================================================ FILE: src/opennlp/ccg/parse/supertagger/io/XMLPOSDictionaryReader.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.io; import java.io.File; import java.util.HashSet; import java.util.Collection; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import javax.xml.parsers.*; import opennlp.ccg.parse.supertagger.util.STTaggerPOSDictionary; import opennlp.ccg.util.Pair; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; /** * @author Dennis N. 
Mehay * @version $Revision: 1.2 $, $Date: 2009/12/21 02:10:57 $ */ public class XMLPOSDictionaryReader { private File dictFile; private XMLReader reader; private Map> dict; /** Creates a new instance of XMLDictionaryReader * @param dictFile A String pointing to the location of * the XML file specifying the word dictionary. */ public XMLPOSDictionaryReader(File df) { if(!df.exists()) { throw new RuntimeException("File "+df.getAbsolutePath().toString()+" does not exist."); } this.dictFile = df; } /** * Read in the dictionary file and create a new STTaggerPOSDictionary. * @return A new STTaggerPOSDictionary. */ public STTaggerPOSDictionary read() { SAXParserFactory factory = SAXParserFactory.newInstance(); try { SAXParser parser = factory.newSAXParser(); reader = parser.getXMLReader(); reader.setContentHandler(new wdContentHandler()); reader.parse(this.dictFile.toURI().toString()); } catch(Exception e) { e.printStackTrace(); } return new STTaggerPOSDictionary(this.dict); } public static void main(String[] args) { // This is just to verify that the XML doc read in is the one // spit out. String fname = args[0]; XMLPOSDictionaryReader rdr = new XMLPOSDictionaryReader(new File(fname)); STTaggerPOSDictionary dct = rdr.read(); Iterator>> it = dct.getMappings(); Pair> tempP = null; System.out.println(""); while(it.hasNext()) { tempP = it.next(); System.out.println(" "); for(Iterator stgs = tempP.b.iterator(); stgs.hasNext(); ) { System.out.println(" "+stgs.next().trim()+" "); } System.out.println(" "); } System.out.print(""); } /* * A ContentHandler to properly interpret the "semantics" of the XML (semantics * in the CS sense of formal semantics of a structured document). */ class wdContentHandler extends DefaultHandler { private boolean inEntry = false, inSupertag = false; private String curPOS = null, currSTFrag = null; @Override public void startDocument() { dict = new TreeMap>(); } @Override public void startElement(String namespaceURI, String lname, String qname, Attributes attrs) throws SAXException { if(qname.equalsIgnoreCase("entry")) { if(this.inEntry) { throw new SAXException("Something is wrong.\nThis is not a well-formed dictionary."); } else { this.inEntry = true; String pos = attrs.getValue(0).trim(); dict. put(pos, new HashSet()); this.curPOS = pos; } } else if(qname.equalsIgnoreCase("supertag")) { if(!this.inEntry) { throw new SAXException("Something is wrong.\nThis is not a well-formed dictionary."); } else { this.inSupertag = true; this.currSTFrag = ""; } } } @Override public void endElement(String uri, String name, String qName) { if(qName.equalsIgnoreCase("entry")) { this.inEntry = false; this.curPOS = null; } else if(qName.equalsIgnoreCase("supertag")) { this.inSupertag = false; Collection tempL = dict.get(this.curPOS); tempL.add(this.currSTFrag.trim()); dict.put(this.curPOS, tempL); this.currSTFrag = null; } } @Override public void characters(char[] ch, int start, int length) { if(this.inSupertag && this.curPOS!=null) { // Get this supertag and add it to the list mapped to by this POS (i.e., the list // of supertags seen with this POS in training). 
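// (SAX may deliver the text of a single supertag element in several chunks, which is why
// characters() appends to currSTFrag rather than assigning; endElement() trims and stores
// the accumulated string.)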
String temp = new String(ch); temp = temp.substring(start, start+length); this.currSTFrag += temp; } else if(this.inSupertag) { System.err.println("Something is wrong.\nThis is not a well-formed dictionary."); } } } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/io/XMLWordDictionaryReader.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.io; import java.io.File; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Collection; import java.util.TreeMap; import javax.xml.parsers.*; import opennlp.ccg.parse.supertagger.util.STTaggerWordDictionary; import opennlp.ccg.util.Pair; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; /** * @author Dennis N. Mehay * @version $Revision: 1.3 $, $Date: 2010/09/21 04:12:41 $ */ public class XMLWordDictionaryReader { private File dictFile; private XMLReader reader; private Map>> dict; /** Creates a new instance of XMLDictionaryReader * @param dictFile A String pointing to the location of * the XML file specifying the word dictionary. */ public XMLWordDictionaryReader(File df) { if(!df.exists()) { throw new RuntimeException("File "+df.getAbsolutePath().toString()+" does not exist."); } this.dictFile = df; } /** * Read in the dictionary file and create a new STTaggerWordDictionary. * @return A new STTaggerWordDictionary. */ public STTaggerWordDictionary read() { SAXParserFactory factory = SAXParserFactory.newInstance(); try { SAXParser parser = factory.newSAXParser(); reader = parser.getXMLReader(); reader.setContentHandler(new wdContentHandler()); reader.parse(this.dictFile.toURI().toString()); } catch(Exception e) { e.printStackTrace(); } return new STTaggerWordDictionary(this.dict); } public static void main(String[] args) { // This is just to verify that the XML doc read in is the one // spit out. String fname = args[0]; XMLWordDictionaryReader rdr = new XMLWordDictionaryReader(new File(fname)); STTaggerWordDictionary dct = rdr.read(); Iterator>>> it = dct.getMappings(); Pair>> tempP = null; System.out.println(""); while(it.hasNext()) { tempP = it.next(); System.out.println(" "); for(Iterator stgs = tempP.b.b.iterator(); stgs.hasNext(); ) { System.out.println(" "+stgs.next().trim()+" "); } System.out.println(" "); } System.out.print(""); } /* * A ContentHandler to properly interpret the "semantics" of the XML (semantics * in the CS sense of formal semantics of a structured document). 
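 *
 * The reader expects entry elements whose first attribute is the word form and whose second
 * is its training-data frequency, each containing one or more supertag child elements.
 * An illustrative entry (attribute names are a sketch only; the handler reads attributes
 * positionally):
 *
 *   <entry word="the" freq="12345">
 *     <supertag>np/n</supertag>
 *   </entry>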
*/ class wdContentHandler extends DefaultHandler { private boolean inEntry = false, inSupertag = false; private String curWord = null, currSTFrag = null; @Override public void startDocument() { dict = new TreeMap>>(); } @Override public void startElement(String namespaceURI, String lname, String qname, Attributes attrs) throws SAXException { if(qname.equalsIgnoreCase("entry")) { if(this.inEntry) { throw new SAXException("Something is wrong.\nThis is not a well-formed dictionary."); } else { this.inEntry = true; String word = attrs.getValue(0).trim(), freq = attrs.getValue(1).trim(); word = word.replace("&","&"); word = word.replace(">",">"); word = word.replace("<","<"); word = word.replace("'","\'"); word = word.replace(""","\""); dict. put(word, new Pair>( new Integer(Integer.parseInt(freq)), new HashSet())); this.curWord = word; } } else if(qname.equalsIgnoreCase("supertag")) { if(!this.inEntry) { throw new SAXException("Something is wrong.\nThis is not a well-formed dictionary."); } else { this.inSupertag = true; this.currSTFrag = ""; } } } @Override public void endElement(String uri, String lName, String qName) { if(qName.equalsIgnoreCase("entry")) { this.inEntry = false; this.curWord = null; } else if(qName.equalsIgnoreCase("supertag")) { this.inSupertag = false; Pair> tempL = dict.get(this.curWord); tempL.b.add(this.currSTFrag.trim()); dict.put(this.curWord, tempL); this.currSTFrag = null; } } @Override public void characters(char[] ch, int start, int length) { if(this.inSupertag && this.curWord!=null) { // Get this supertag and add it to the list mapped to by this word (i.e., the list // of supertags seen with this word in training). String temp = new String(ch); temp = temp.substring(start, start+length); this.currSTFrag += temp; } else if(this.inSupertag) { System.err.println("Something is wrong.\nThis is not a well-formed dictionary."); } } } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/ml/FeatureExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.ml; import opennlp.ccg.parse.tagger.TaggedWord; import java.util.Collection; import java.util.List; import java.util.Map; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * @version $Revision: 1.1 $, $Date: 2010/09/21 04:12:41 $ */ public interface FeatureExtractor { /** * @param sentence A {@code Map} giving the (string-indexed) sentence of * {@code Word}s to be tagged. * @param wordIndex An {@code Integer}, giving the string index of the current word. 
* @return A {@code Collection>} representing the * real-valued activations of features (predicates) in the context of a word to be labelled. */ public Collection> getFeatures(Map sentence, Integer wordIndex); /** * Same as getFeatures, but for the whole sentence, returning a List of contextual features, in order, one * per Word in sentence */ public List>> getSentenceFeatures(Map sentence); /** * @param sentence A Map giving the (string-indexed) sentence of * Words to be tagged. * @param wordIndex An Integer, giving the string index of the current word. * @param training A boolean indicating whether we are extracting features for training (in which case * we need the label too). * @return A Collection> representing the * real-valued activations of features (predicates) in the context of a word to be labelled. */ public Collection> getFeatures(Map sentence, Integer wordIndex, boolean training); /** * Same as getFeatures, but for the whole sentence, returning a List of contextual features, in order, one * per Word in sentence */ public List>> getSentenceFeatures(Map sentence, boolean training); } ================================================ FILE: src/opennlp/ccg/parse/supertagger/ml/STFex.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.ml; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.tagger.TaggedWord; import opennlp.ccg.parse.tagger.Constants; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import opennlp.ccg.parse.postagger.POSTagger; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.util.Pair; /** * @author Dennis N. 
Mehay * @version $Revision: 1.7 $, $Date: 2010/12/15 07:16:03 $ */ public class STFex implements FeatureExtractor { private boolean useMultiPOS = false; public static final String LEX = "X"; public static final String POS = "P"; public static final String prevP = POS + "-1="; public static final String prevPrevP = POS + "-2="; public static final String nextP = POS + "+1="; public static final String nextNextP = POS + "+2="; public static final String curP = POS + "="; public static final String prevL = LEX + "-1="; public static final String prevPrevL = LEX + "-2="; public static final String nextL = LEX + "+1="; public static final String nextNextL = LEX + "+2="; public static final String curL = LEX + "="; public static final String priorST = "PST" + "="; public static final String wordPOSPriorST = "WPosPST" + "="; public static final String wordPriorST = "WPST" + "="; public static final String POSPriorST = "PosPST" + "="; public static final String antiPriorST = "APST" + "="; public static final String wordPOSAntiPriorST = "WPosAPST" + "="; public static final String wordAntiPriorST = "WAPST" + "="; public static final String POSAntiPriorST = "PosAPST" + "="; public static final TaggedWord outOfBounds = Constants.OOB; private static final String[] lxfLabs = {LEX + "-2", LEX + "-1", LEX, LEX + "+1", LEX + "+2"}; private static final String[] posfLabs = {POS + "-2", POS + "-1", POS, POS + "+1", POS + "+2"}; /** * An object containing a ConditionalProbabilityTable that can give a prior distribution over all * known supertags given a POS-tagged word. * * Trained as a factored LM (presumably using SRILM). */ protected STPriorModel priorMod; /** Constructor with no prior model. */ public STFex( ) { this.priorMod = null; } /** Constructor with a prior model over supertags (to be used as a feature). */ public STFex(STPriorModel priorMod) { this.priorMod = priorMod; } /** Pass in true to use multi-POS features, pass in false not to. */ public void useMultiPOS(boolean trueOrFalse) { useMultiPOS = trueOrFalse; } /** * Extracts an ArrayList representing the contextual * predicates (features) of a line of (tokenised) text (each String * represents the predicates that fire for a word in the line). * Each resulting String will have the form: * cp1 cp2 ... cpK * @param sentence A List of feature bundles. * @param wordIndex An int giving the location of the word to be tagged. 
* @return A Collection> of real-valued feature activations * for the word at index wordIndex */ public Collection> getFeatures(Map sentence, Integer wordIndex, boolean training) { Collection> result = new ArrayList>(30); TaggedWord current, prev, prevPrev, next, nextNext; current = sentence.get(wordIndex); // -------- The left periphery ------------ int wind = wordIndex.intValue(); if (wind > 1) { prev = sentence.get(wind - 1); prevPrev = sentence.get(wind - 2); } else if (wind > 0) { prev = sentence.get(wind - 1); prevPrev = outOfBounds; } else { prev = prevPrev = outOfBounds; } // -------- The right periphery ----------- int tempSize = sentence.size(); if ((tempSize - (wind + 1)) >= 2) { next = sentence.get(wind + 1); nextNext = sentence.get(wind + 2); } else if (tempSize - (wind + 1) >= 1) { next = sentence.get(wind + 1); nextNext = outOfBounds; } else { next = nextNext = outOfBounds; } Double activation = new Double(1.0); if (training) result.add(new Pair(current.getSupertag(), activation)); result.add(new Pair(curL + current.getForm(), activation)); if(useMultiPOS) { for(Pair tg : current.getPOSTagging()) result.add(new Pair(curP + tg.b, tg.a)); } else { result.add(new Pair(curP + current.getPOS(), activation)); } result.add(new Pair(prevL + prev.getForm(), activation)); if(useMultiPOS && prev != Constants.OOB) { for(Pair tg : prev.getPOSTagging()) result.add(new Pair(prevP + tg.b, tg.a)); } else { result.add(new Pair(prevP + prev.getPOS(), activation)); } result.add(new Pair(prevPrevL + prevPrev.getForm(), activation)); if(useMultiPOS && prevPrev != Constants.OOB) { for(Pair tg : prevPrev.getPOSTagging()) result.add(new Pair(prevPrevP + tg.b, tg.a)); } else { result.add(new Pair(prevPrevP + prevPrev.getPOS(), activation)); } result.add(new Pair(nextL + next.getForm(), activation)); if(useMultiPOS && next != Constants.OOB) { for(Pair tg : next.getPOSTagging()) result.add(new Pair(nextP + tg.b, tg.a)); } else { result.add(new Pair(nextP + next.getPOS(), activation)); } result.add(new Pair(nextNextL + nextNext.getForm(), activation)); if(useMultiPOS && nextNext != Constants.OOB) { for(Pair tg : nextNext.getPOSTagging()) result.add(new Pair(nextNextP + tg.b, tg.a)); } else { result.add(new Pair(nextNextP + nextNext.getPOS(), activation)); } // now for conjunctions of features: w-2w-1=..., w-1w+1=..., w+1w+2=... (same for posp). // (i.e., bigram features over words and parts of speech and bigrams of words and POSs that straddle the current token). // N.B. only use single-best POSs (maybe change later). TaggedWord[] wds = {prevPrev, prev, current, next, nextNext}; for (int j = 1; j < wds.length; j++) { // add bigram features (only for single-best POS). result.add(new Pair(lxfLabs[j - 1] + "|" + lxfLabs[j] + "=" + wds[j - 1].getForm() + "|" + wds[j].getForm(), activation)); result.add(new Pair(posfLabs[j - 1] + "|" + posfLabs[j] + "=" + wds[j - 1].getPOS() + "|" + wds[j].getPOS(), activation)); // also, if at the current word slot, add bigrams that straddle the current word. if (j == 2) { result.add(new Pair(lxfLabs[j - 1] + "|" + lxfLabs[j + 1] + "=" + wds[j - 1].getForm() + "|" + wds[j + 1].getForm(), activation)); result.add(new Pair(posfLabs[j - 1] + "|" + posfLabs[j + 1] + "=" + wds[j - 1].getPOS() + "|" + wds[j + 1].getPOS(), activation)); } } // If the prior model is not null, extract a feature for the beta-best (beta = 0.1) classes // predicted by the prior model (for all output classes -- supertags -- seen with this word's // POS). // Extract prior features from these. 
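// Unlike the binary window features above (activation 1.0), these prior features carry the
// log-probability of each beta-best supertag as their activation, conjoined variously with the
// current word form and POS; an "anti-prior" feature with activation log(1 - p) is also added
// for each such supertag.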
if(priorMod != null) { priorMod.computePriors(current.getWord()); for(Pair priorClassActivationPair : priorMod.getRestrictedBetaBestPriors(current.getWord(), 0.1)) { // TODO: make beta parameterizable. double act = Math.log(priorClassActivationPair.b); String wd = current.getForm().intern(), pos = current.getPOS().intern(); result.add(new Pair(priorST + priorClassActivationPair.a.intern(), act)); // log(prob) result.add(new Pair(wordPriorST + priorClassActivationPair.a.intern()+"_"+wd, act)); // log(prob) result.add(new Pair(wordPOSPriorST + priorClassActivationPair.a.intern()+"_"+wd+"_"+pos, act)); // log(prob) result.add(new Pair(POSPriorST + priorClassActivationPair.a.intern()+"_"+pos, act)); // log(prob) result.add(new Pair(antiPriorST + priorClassActivationPair.a.intern(), Math.log(1-Math.exp(act)))); // log(1-prob) // TODO: come up with sensible "anti-prior" features that simulate the filtering effect of the tagging dict. } } return result; } public List>> getSentenceFeatures(Map sentence, boolean training) { List>> res = new ArrayList>>(sentence.size()); List keys = new ArrayList(sentence.keySet().size()); for(Integer i : sentence.keySet()) { keys.add(i); } Collections.sort(keys); for(Integer wordIndex : keys) { res.add(getFeatures(sentence, wordIndex, training)); } return res; } public Collection> getFeatures(Map sentence, Integer wordIndex) { return getFeatures(sentence, wordIndex, false); } public List>> getSentenceFeatures(Map sentence) { return getSentenceFeatures(sentence, false); } // main method for extracting features from a file (for training). // pass in a supertag prior model and prior model vocab file, if desired. // (these replace tagging dictionaries). // pass in a POS tagger config file, if we aren't only using gold POS tags only. // input corpus is from stdin, output goes to stdout. public static void main(String[] args) { // we assume that the training data is being streamed in from stdin (no parse IDs, just SRILM factor bundle lines), // and that output will stream to stdout. String usage = "\n | STFex (-h [gets this message]) (-r -v ) (-p ) | "; if(args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } String priorModF = null, priorVocab = null, posConfig = null; for(int j = 0; j < args.length; j++) { if(args[j].equals("-r")) { priorModF = args[++j]; continue; } if(args[j].equals("-v")) { priorVocab = args[++j]; continue; } if(args[j].equals("-p")) { posConfig = args[++j]; continue; } System.err.println("Unrecognized option: " + args[j]); } SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new InputStreamReader(System.in))); STPriorModel stPriorMod = null; if(priorModF != null) { try { stPriorMod = new STPriorModel(priorModF, priorVocab); } catch (IOException ex) { Logger.getLogger(STFex.class.getName()).log(Level.SEVERE, null, ex); } } STFex fexer = new STFex(stPriorMod); POSTagger posT = (posConfig == null) ? null : POSTagger.posTaggerFactory(posConfig); if(posT != null) { fexer.useMultiPOS(true); } else { fexer.useMultiPOS(false); } for(List sentence : corp) { Map sent = new HashMap(sentence.size()); int index = 0; if(posT == null) { for(Word w : sentence) { sent.put(index++, new TaggedWord(w)); } } else { List posTagging = posT.tagSentence(sentence); for(TaggedWord tw : posTagging) { sent.put(index++, tw); } } List>> ftss = fexer.getSentenceFeatures(sent, true); for(Collection> fts : ftss) { index = 0; for(Pair ft : fts) { // if we're at the first item, print out the label. 
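// Each word becomes one training event on its own line: the gold supertag label first, then
// space-separated feature:activation pairs (the same format that
// ZhangLeTrainingExtractor.writeFeats() produces).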
if (index == 0) { System.out.print(ft.a); } else { System.out.print(" " + ft.a + ":" + ft.b); } index++; } System.out.println(); } } } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/ml/STPriorModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.ml; import opennlp.ccg.parse.supertagger.util.ProbPairComparator; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Word; import opennlp.ccg.ngrams.ConditionalProbabilityTable; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.parse.tagger.ProbIndexPair; import opennlp.ccg.parse.supertagger.util.STTaggerPOSDictionary; import opennlp.ccg.util.Interner; import opennlp.ccg.util.Pair; /** * (c) (2009) Dennis N. Mehay * @author Dennis N. Mehay * * Model for predicting p(supertag | word, pos). Uses an ARPA-formatted * SRILM-trained "unigram" factored LM for this, where each "unigram" is * a bundle of word:pos:supertag. */ public class STPriorModel extends ConditionalProbabilityTable { public static final String WORD = DefaultTokenizer.WORD_ATTR; public static final String POS_TAG = DefaultTokenizer.POS_ATTR; public static final String SUPERTAG = DefaultTokenizer.SUPERTAG_ATTR; private Interner> pairs = new Interner>(); /** * Re-usable list for attr-val pairs of word-pos-supertag inputs to the prior model * (i.e., for predicting p(STag | word, POS). */ public List> attrVals = new ArrayList>(5); /** * A comparator for sorting Pair's where the Double is a probability * (effectively sorts by descending order of probability). */ private ProbPairComparator ppcomp = new ProbPairComparator(); /** All the priors. Reference them when getting beta-best, beta-worst, etc. */ List> priors = new ArrayList>(1000); /** String[] of all possible supertag outcomes. */ private String[] stagVocab = null; /** double[] containing the probability distro over all supertags. 
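 * (Filled in and re-used across calls to getNBestPriors(), like stagPointers below, so a single
 * STPriorModel instance should not be shared across threads.)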
*/ private double[] stagDistro = null; /** * POS-keyed tagging dictionary (to provide restrictions on what the prior model may consider. * No restrictions if null. */ private STTaggerPOSDictionary posDict = null; /** * Re-usable way of containing the probabilities and a pointer back into where they came from * in the probability distro over all supertags. */ private ProbIndexPair[] stagPointers = null; /** Construct a prior model with the FLM config file and corresponding vocab file. */ public STPriorModel(String flmFile, String vocabFile) throws IOException { // create with a null POS dictionary (i.e., no restrictions on taggings). this(flmFile, vocabFile, null); } /** Construct a prior model with the FLM config file and corresponding vocab file. */ public STPriorModel(String flmFile, String vocabFile, STTaggerPOSDictionary posDict) throws IOException { super(flmFile); this.posDict = posDict; String st = null; BufferedReader br = new BufferedReader(new FileReader(new File(vocabFile))); st = br.readLine().trim(); // get next supertag from the vocab. while ((st != null) && !st.trim().startsWith(SUPERTAG + "-")) { st = br.readLine(); } if (st != null) { st = st.trim().split("-")[1]; } Collection allSupertags = new HashSet(); // find out how many outcomes we have. int cnt = 0; while (st != null) { cnt++; allSupertags.add(st); while ((st != null) && !st.trim().startsWith(SUPERTAG + "-")) { st = br.readLine(); } if (st != null) { st = st.trim().split("-")[1]; } } br.close(); // initialize the arrays to this size. stagVocab = new String[cnt]; stagPointers = new ProbIndexPair[cnt]; stagDistro = new double[cnt]; cnt = 0; // fill the vocab array with all possible supertags. for (String stag : allSupertags) { stagVocab[cnt++] = stag.intern(); } } /** Set the POS-keyed tagging dictionary. */ public void setPOSDict(STTaggerPOSDictionary posDict) { this.posDict = posDict; } /** Get the prior probability of this supertag/POS/word combo. */ public double getPriorOf(String supertag, String word, String pos) { attrVals.clear(); Pair surfaceForm = pairs.intern(new Pair(WORD, DefaultTokenizer.escape(word).intern())); attrVals.add(surfaceForm); Pair partOfSpeech = pairs.intern(new Pair(POS_TAG, DefaultTokenizer.escape(pos).intern())); attrVals.add(partOfSpeech); attrVals.add(pairs.intern(new Pair(SUPERTAG, DefaultTokenizer.escape(supertag).intern()))); return score(attrVals); } /** Get the beta-best tags for this word, under the prior model. */ public List> getBetaBestPriors(Word w, double beta) { List> allPriors = getAllPriors(w); List> betaBestPriors = new ArrayList>(100); double best = allPriors.get(0).b; for (Pair prior : allPriors) { if (best * beta <= prior.b) { betaBestPriors.add(prior); } else { break; } } return betaBestPriors; } /** Compute all priors, subject to the POS dict constraints. */ public void computePriors(Word w) { if (posDict != null) { priors = getPOSRestrictedPriors(w); } } /** Get the POS-dict restricted prior distribution (sorted descending by prob.) */ protected List> getPOSRestrictedPriors(Word w) { Collection tagsAllowed = posDict.getEntry(w.getPOS()); if (tagsAllowed == null || tagsAllowed.size() == 0) { return priors; } else { List> sortedTags = new ArrayList>(tagsAllowed.size()); for (String tag : tagsAllowed) { sortedTags.add(new Pair(getPriorOf(tag, w.getForm(), w.getPOS()), tag)); } Collections.sort(sortedTags, ppcomp); return sortedTags; } } /** * Get the beta-best tags (using the prior model) only from among the POS-dictionary-allowed possibilities. 
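 * When a POS-keyed tagging dictionary has been set, computePriors(Word) must be called first
 * so that the cached, POS-restricted prior distribution for this word is up to date.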
* beta-best (def'n): {t | p(t) >= beta * p(best-tag) } */ public List> getRestrictedBetaBestPriors(Word w, double beta) { if (posDict == null) { return getBetaBestPriors(w, beta); } else { List> rez = new ArrayList>(50); double best = priors.get(0).a; for(Pair tg : priors) { if(tg.a >= (beta * best)) { rez.add(new Pair(tg.b,tg.a)); } else { break; } } return rez; } } /** * Get the beta-WORST tags (using the prior model) only from among the POS-dictionary-allowed possibilities. * beta-best (def'n): {t | p(t) >= beta * p(best-tag) } * beta-worst (def'n): {t | p(t) * beta <= p(worst-tag)} */ public List> getRestrictedBetaWorstPriors(Word w, double beta) { if (posDict == null) { throw new UnsupportedOperationException("Cannot get beta-worst without a pos-keyed tagging dict.\nNot yet implemented."); } else { List> rez = new ArrayList>(50); List> cpy = new ArrayList>(priors); Collections.reverse(cpy); double worst = cpy.get(0).a; for(Pair tg : cpy) { if((tg.a * beta) <= worst) { rez.add(new Pair(tg.b,tg.a)); } else { break; } } return rez; } } public List> getAllPriors(Word w) { return getNBestPriors(w, stagVocab.length); } /** Get the n-best supertags on the prior model, given this word (with POS). */ public List> getNBestPriors(Word w, int n) { attrVals.clear(); Pair surfaceForm = pairs.intern(new Pair(WORD, DefaultTokenizer.escape(w.getForm()).intern())); attrVals.add(surfaceForm); Pair pos = pairs.intern(new Pair(POS_TAG, DefaultTokenizer.escape(w.getPOS()).intern())); attrVals.add(pos); int cnt = 0; for (String st : stagVocab) { // remove the last stag factor, if there. if (attrVals.size() == 3) { attrVals.remove(attrVals.size() - 1); } attrVals.add(pairs.intern(new Pair(SUPERTAG, st))); // add the probability of this tag under the prior model to the distro array. double sc = score(attrVals); stagDistro[cnt] = sc; // add this probability with a pointer back to where it came from in the vocab. // (so that we can sort by probability, but then retrieve the supertag string). stagPointers[cnt] = new ProbIndexPair(sc, cnt); cnt++; } // sort descending by probability (achieved by the comparator implementation of ProbIndexPair). Arrays.sort(stagPointers); List> result = new ArrayList>(n); for (int i = 0; i < n; i++) { result.add(new Pair(stagVocab[stagPointers[i].b], stagPointers[i].a)); } return result; } public static void main(String[] args) throws IOException { String usage = "\nSTPriorModel -vocab (-c ) (-o ) (-u ) (-v [ or '-verbose'])\n"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } SRILMFactoredBundleCorpusIterator in = null; BufferedWriter out = null; BufferedWriter voc = null; try { String inputCorp = "", output = "", vocabFile = "vocab.voc"; int catCutoff = 10; for (int i = 0; i < args.length; i++) { if (args[i].equals("-c")) { inputCorp = args[++i]; continue; } if (args[i].equals("-o")) { output = args[++i]; continue; } if (args[i].equals("-vocab")) {vocabFile = args[++i]; continue; } if (args[i].equals("-u")) { catCutoff = Integer.parseInt(args[++i]); continue; } System.out.println("Unrecognized option: " + args[i]); } try { in = new SRILMFactoredBundleCorpusIterator( (inputCorp.equals("")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp)))); } catch (FileNotFoundException ex) { System.err.print("Input corpus " + inputCorp + " not found. Exiting..."); Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit( -1); } try { out = (output.equals("")) ? 
new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output))); } catch (IOException ex) { System.err.print("Output file " + output + " not found. Exiting..."); Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit( -1); } try { voc = new BufferedWriter(new FileWriter(new File(vocabFile))); } catch (IOException ex) { Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex); } Map vocab = new HashMap(); for (List inLine : in) { for (Word w : inLine) { String st = SUPERTAG + "-" + DefaultTokenizer.escape(w.getSupertag()), pos = POS_TAG + "-" + DefaultTokenizer.escape(w.getPOS()), wform = WORD + "-" + DefaultTokenizer.escape(w.getForm()); vocab.put(st, (vocab.get(st) == null) ? 1 : vocab.get(st) + 1); vocab.put(pos, (vocab.get(pos) == null) ? 1 : vocab.get(pos) + 1); vocab.put(wform, (vocab.get(wform) == null) ? 1 : vocab.get(wform) + 1); } } // reopen file try { in = new SRILMFactoredBundleCorpusIterator( (inputCorp.equals("")) ? new BufferedReader(new InputStreamReader(System.in)) : new BufferedReader(new FileReader(new File(inputCorp)))); } catch (FileNotFoundException ex) { System.err.print("Input corpus " + inputCorp + " not found. Exiting..."); Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit( -1); } for (List inLine : in) { for (Word w : inLine) { String st = SUPERTAG + "-" + DefaultTokenizer.escape(w.getSupertag()), pos = POS_TAG + "-" + DefaultTokenizer.escape(w.getPOS()), wform = WORD + "-" + DefaultTokenizer.escape(w.getForm()); if (vocab.get(st) > catCutoff) { out.write(wform + ":" + pos + ":" + st + " "); } } out.write(System.getProperty("line.separator")); } out.flush(); for (String str : vocab.keySet()) { if (vocab.get(str) > catCutoff) { voc.write(str + System.getProperty("line.separator")); } } voc.flush(); } finally { try { out.close(); in.close(); voc.close(); } catch (IOException ex) { Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex); } } } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/ml/ZhangLeTrainingExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.ml; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.Collection; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.TreeMap; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.tagger.io.PipeDelimitedFactoredBundleCorpusIterator; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.parse.supertagger.ml.FeatureExtractor; import opennlp.ccg.parse.supertagger.ml.STFex; import opennlp.ccg.parse.tagger.TaggedWord; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * @version $Revision: 1.3 $, $Date: 2010/09/21 04:12:41 $ */ public class ZhangLeTrainingExtractor { private File outputF; private Iterator> incorp; private FeatureExtractor fexer = new STFex(); /** * Create a training feature extractor that will extract features (with results) * for every instance in the input (training) corpus corpusName. * * @param corpusName A String giving the complete * path to the input file of SRILM-compliant factored bundles. * @param outputFileName A String giving the complete * path to the output file where the features will be written. */ public ZhangLeTrainingExtractor(File corpus, File outputF, String tokenisation) { this(corpus, outputF, tokenisation, new STFex()); } public ZhangLeTrainingExtractor(File corpus, File outputF, String tokenisation, FeatureExtractor fexer) { this.fexer = fexer; this.outputF = outputF; try { if (tokenisation.equalsIgnoreCase("srilm")) { incorp = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus))); } else { incorp = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus))); } } catch (FileNotFoundException ex) { Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex); } } /** * Writes training feats to file. */ public void writeFeats() { BufferedWriter bw = null; try { try { bw = new BufferedWriter(new FileWriter(this.outputF)); } catch (IOException ex) { Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex); } if (bw != null || this.incorp != null) { List sent = null; Map snt = null; Iterator> sents = this.incorp; while (sents.hasNext()) { //for (Iterator> sents = this.incorp; sents.hasNext();) { sent = sents.next(); // turn the sent into a map from integer string indices to Words. 
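// Each Word is wrapped as a TaggedWord so the STFex feature extractor can be applied to
// gold-annotated training data, with the word's gold POS serving as its POS tagging.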
int index = 0; snt = new TreeMap(); for (Word w : sent) { snt.put(index++, new TaggedWord(w)); } // 'true' says "we're getting training feats" for (Collection> sentFeatsWithActivation : fexer.getSentenceFeatures(snt, true)) { try { boolean isLabel = true; for (Pair ftWAct : sentFeatsWithActivation) { if (isLabel) { bw.write(ftWAct.a + " "); isLabel = false; } else { bw.write(ftWAct.a + ":" + ftWAct.b.doubleValue() + " "); } } bw.newLine(); } catch (IOException ex) { Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex); } } } } } finally { try { bw.flush(); bw.close(); } catch (IOException ex) { Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex); } catch (Exception e) { System.out.println(e); } } } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/util/PipedTokenizer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.util; import java.util.List; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Word; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * @version $Revision: 1.1 $, $Date: 2009/08/21 17:20:20 $ */ public class PipedTokenizer extends DefaultTokenizer { public PipedTokenizer() { super(); } @Override public Word parseToken(String token, boolean strictFactors) { // init String form = token; String stem = null; String POS = null; String pitchAccent = null; String supertag = null; String semClass = null; List> attrValPairs = null; // handle pipe-separated attr-val pairs int pipePos = token.indexOf('|'); String suffix = null; if (pipePos > 0) { // get word form form = token.substring(0, pipePos); // shave off word form suffix = token.substring(pipePos + 1); // get next | position pipePos = suffix.indexOf('|'); // get stem [or lemma]. could be null. stem = suffix.substring(0,pipePos); if (stem.equals("")) { stem = null;} // shave off stem/lemma suffix = suffix.substring(pipePos + 1); // get next | position pipePos = suffix.indexOf('|'); // get POS POS = suffix.substring(0,pipePos); // shave off POS suffix = suffix.substring(pipePos + 1); // see whether there is a supertag if (suffix != null && !suffix.equals("")) { // get supertag supertag = suffix.trim(); } } else { throw new RuntimeException("This file is not in the right format: \n"+ "form|lemma|POS|(Supertag) ... 
form|lemma|POS(Supertag)."); } // done return Word.createWord(form, pitchAccent, attrValPairs, stem, POS, supertag, semClass); } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/util/ProbPairComparator.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.util; import java.util.Comparator; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * * A little Comparator instance for comparing Pair * instances of pair by descending order of the Double value (assuming * that they are probabilities of string tags). */ public class ProbPairComparator implements Comparator> { /** * Implements the Comparator interface's work-horse method. * * Compares two Pair objects. Crucially, it does NOT * ensure that both objects are Pair before casting them. * The caller is responsible for ensuring this, and failure to do so may * result in a RuntimeException. */ public int compare(Pair pr1, Pair pr2) { if (pr1 == pr2) { return 0; } return -1 * Double.compare(pr1.a, pr2.a); } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/util/STTaggerDictionary.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.util; //import java.util.ArrayList; import java.util.Collection; /** * An interface for supertagger "dictionaries" as described * in, e.g., Clark (2002) _Supertagging for CCG_. * * @author Dennis N. Mehay * @version $Revision: 1.1 $, $Date: 2009/08/21 17:20:20 $ */ public interface STTaggerDictionary { /** * A method for getting the dictionary entry for a particular * String key. 
* The key will usually be a word, lemma or a part * of speech, but you may have other interesting grammatical things * to associate with supertags. * * @param key A String representing a particular * grammatical type. * @return A Collection containing supertags (CCG lexical * categories) seen with the particular grammatical type `key'. * Returns null if that word was not seen in the * corpus or (if the implementing class has a frequency cut-off) * if that type's token frequency was not high enough. */ public Collection getEntry(String key); /** * A method to test whether this STTaggerDictionary contains * an entry for a particular String key. * The key will usually be a word, lemma or a part of speech, but * you may have other interesting grammatical things to associate with * supertags. * * @param key A String representing a particular * grammatical type. * @return A boolean value of true or * false answering the question of whether this * dictionary contains an entry for the specified key. */ public boolean containsEntry(String key); } ================================================ FILE: src/opennlp/ccg/parse/supertagger/util/STTaggerPOSDictionary.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.util; //import util.Pair; import java.io.*; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.Map; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * @version $Revision: 1.2 $, $Date: 2010/09/21 04:12:41 $ */ public class STTaggerPOSDictionary implements STTaggerDictionary, Serializable { private static final long serialVersionUID = -4814356608876054823L; /** * This object represents our dictionary. The String is the * POS we want to look up, and the Collection holds all of * of the CCG lex. cat's seen with that POS. */ private Map> dict = null; /** * This constructor does not create the dictionary; that must * be done using a DictionaryMaker. */ public STTaggerPOSDictionary(Map> dict) { this.dict = dict; } /** * This method implements the interface STTaggerDictionary * by delegating to getEntry(String, int) (see below). */ public Collection getEntry(String POS) { return this.dict.get(POS); } /** * A method that returns the contents of the mapping embodied in this dictionary. * @return An Iterator of supertagger.util.Pairs * that represent the pos -> { ... supertags ...} mappings in the dictionary. 
*/ public Iterator>> getMappings() { Iterator keyset = this.dict.keySet().iterator(); ArrayList>> preRes = new ArrayList>>(); String tempS = null; while(keyset.hasNext()) { tempS = keyset.next(); preRes.add(new Pair>(tempS, this.dict.get(tempS))); } return preRes.iterator(); } /** * A method to test whether this STTaggerDictionary contains * an entry for a particular String representing a POS tag. * * @param key A String representing a particular * POS tag. * @return A boolean value of true or * false answering the question of whether this * dictionary contains an entry for the specified POS tag. */ public boolean containsEntry(String POS) { return this.dict.containsKey(POS); } } // End class STTaggerPOSDictionary ================================================ FILE: src/opennlp/ccg/parse/supertagger/util/STTaggerWordDictionary.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.util; import java.io.*; import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; import java.util.Map; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * @version $Revision: 1.2 $, $Date: 2009/11/15 04:52:26 $ */ public class STTaggerWordDictionary implements STTaggerDictionary, Serializable { private static final long serialVersionUID = -2474606825228545547L; /* This object represents our dictionary. The String is the * word we want to look up, and the Pair holds * the word frequency count (= `a') and the Collection of the CCG lex. cat's * seen with that word (= `b'). */ private Map>> dict = null; /* This constructor does not create the dictionary; that must * be done using a DictionaryMaker. */ public STTaggerWordDictionary(Map>> dict) { this.dict = dict; } /** * This method implements the interface STTaggerDictionary * by delegating to getEntry(String, int) (see below). */ public Collection getEntry(String word) { return getEntry(word, 1); } /** * A method for getting the dictionary entry for a particular * String word, only if that word appears at least * `freq' times in the corpus. * * @param key A String representing a particular * word. * @param freq An int specifying the number of times a * word should have occured before it is returned. * @return A String[] containing all supertags (CCG lexical * categories) seen with the specified word. * Returns null if the word does not appear at least * `freq' times in the corpus from which the dictionary was * created or if the word does not appear at all. * (N.B. Passing in an int `freq' value of zero will elicit the * same behavior as passing in a `freq' value of 1.) 
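     * (For illustration: given a hypothetical entry mapping "bank" to the pair
     * (3, { n, n/n }), getEntry("bank", 2) returns { n, n/n }, while
     * getEntry("bank", 5) returns null, since the word was only seen 3 times.)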
*/ @SuppressWarnings("unchecked") public Collection getEntry(String word, int freq) { if (word == null) return null; // mww: extra null check Object o = this.dict.get(word); if(o==null) { return null; } Pair> p = (Pair>)o; int wfreq = p.a.intValue(); if(wfreq>=freq) { return p.b; } else { return null; } } // End method getEntry(String, int) /** * A method to test whether this STTaggerDictionary contains * an entry for a particular String representing a word. * * @param key A String representing a particular * word. * @return A boolean value of true or * false answering the question of whether this * dictionary contains an entry for the specified word. */ public boolean containsEntry(String word) { return this.dict.containsKey(word); } /** * A method that returns the contents of the mapping embodied in this dictionary. * @return An Iterator of supertagger.util.Pairs * that represent the word -> (freq, { ... supertags ...}) mappings in the dictionary. */ public Iterator>>> getMappings() { Iterator keyset = this.dict.keySet().iterator(); ArrayList>>> preRes = new ArrayList>>>(); String tempS = null; while(keyset.hasNext()) { tempS = keyset.next(); preRes.add( new Pair>>(tempS, this.dict.get(tempS))); } return preRes.iterator(); } /** * A method for getting the number of times a word was seen * in the training data with which this dictionary was created. * * @param word A String representing the word in question. * @return An int count of this word's frequency in the * corpus with which this dictionary was created. */ @SuppressWarnings("unchecked") public int getCount(String word) { Object o = dict.get(word); if(o==null) { return 0; } else { Pair> p = (Pair>)o; return p.a.intValue(); } } // End method getCount(word) } // End class STTaggerWordDictionary ================================================ FILE: src/opennlp/ccg/parse/supertagger/util/SupertagSequenceGetter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.List; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; /** * @author Dennis N. 
Mehay */ public class SupertagSequenceGetter { public static void main(String[] args) throws FileNotFoundException, IOException { String usage = "\nSupertagSequenceGetter -i -o \n"; String input = null, output = null; if(args == null || args.length == 0 || args[0].equals("-h")) { System.err.println(usage); System.exit(0); } for(int i = 0; i < args.length; i++) { if(args[i].equals("-i")) { input = args[++i]; continue; } if(args[i].equals("-o")) { output = args[++i]; continue; } System.err.println("unknown command-line option: " + args[i]); } BufferedReader in = new BufferedReader(new FileReader(new File(input))); SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator(in); BufferedWriter out = new BufferedWriter(new FileWriter(new File(output))); for(List sent : corp) { out.write(" "); for(Word w : sent) { out.write(w.getSupertag()+" "); } out.write(""+System.getProperty("line.separator")); } out.close(); } } ================================================ FILE: src/opennlp/ccg/parse/supertagger/util/TaggingDictionaryExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.supertagger.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.tagger.io.PipeDelimitedFactoredBundleCorpusIterator; import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator; import opennlp.ccg.util.Pair; /** * Extracts and writes out XML files containing tagging * dictionary stats. * * @author Dennis N. Mehay * @version $Revision: 1.5 $, $Date: 2010/09/21 04:12:41 $ */ public class TaggingDictionaryExtractor { /** * Create a new dictionary extractor, specifying the word and POS * dictionary files, as well as the tokenisation type (SRILM Factored bundle or * C&C/Moses-style pipe-delimited factored bundles. * @param corpus A File of plain-text, one sentence per line and no additional mark-up beyond * the ... for SRILM factored bundle style. * @param wd A File where the word-based tagging dictionary will be written. * @param posd A File where the POS-based tagging dictionary will be written. 
* @param tokenisationType A String telling us how to tokenise factors in the * corpus file. */ // mapping from words to a pairing of their frequencies and the lexical categories they were seen with. private Map>> wdmap = new HashMap>>(); // mapping from POS tags to the lexical categories they were seen with. private Map> posmap = new HashMap>(); private Iterator> incorp = null; // writers for dict files. private BufferedWriter wbr = null, pbr = null; // how frequently a cat must occur to make it into the dictionaries. private int minCatFreq = 1; /** * Escape characters for text appearing as XML data, between tags. * *

 * The following characters are replaced with corresponding character entities:
 *
 *     Character   Encoding
 *     <           &lt;
 *     >           &gt;
 *     &           &amp;
 *     "           &quot;
 *     '           &#039;
 *
    Note that JSTL's {@code } escapes the exact same set of * characters as this method. That is, {@code } * is good for escaping to produce valid XML, but not for producing safe * HTML. */ public static String forXML(String aText) { if (aText == null) return null; final StringBuilder result = new StringBuilder(); final StringCharacterIterator iterator = new StringCharacterIterator(aText); char character = iterator.current(); while (character != CharacterIterator.DONE) { if (character == '<') { result.append("<"); } else if (character == '>') { result.append(">"); } else if (character == '\"') { result.append("""); } else if (character == '\'') { result.append("'"); } else if (character == '&') { result.append("&"); } else { //the char is not a special one //add it to the result as is result.append(character); } character = iterator.next(); } return result.toString(); } public TaggingDictionaryExtractor(File corpus, File wd, File posd, String tokenisationType) { this(corpus, wd, posd, tokenisationType, 10); } public TaggingDictionaryExtractor(File corpus, File wd, File posd, String tokenisationType, int catFreq) { try { wbr = new BufferedWriter(new FileWriter(wd)); pbr = new BufferedWriter(new FileWriter(posd)); minCatFreq = catFreq; if (tokenisationType.equalsIgnoreCase("srilm")) { incorp = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus))); } else { incorp = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus))); } } catch (IOException ex) { Logger.getLogger(TaggingDictionaryExtractor.class.getName()).log(Level.SEVERE, null, ex); } } /** * Extract the dictionaries. */ @SuppressWarnings("unchecked") public void extract() { try { List currsent = null; String currForm = null, currPOS = null; //Set currFormSTs = null, curPOSSTs = null; //Integer currWdCnt = null; Pair> currFormFetch = null; Set currFormSet = null, currPOSSet = null; Map catCount = new HashMap(); while (incorp.hasNext()) { // for every word in every sentence, update the counts, and add to the word- and POS-based // allowable tags. currsent = incorp.next(); for (Word w : currsent) { currForm = w.getForm(); currPOS = w.getPOS(); Object wfetch = wdmap.get(currForm); String stag = w.getSupertag(); catCount.put(stag, catCount.get(stag)==null ? 1 : catCount.get(stag) + 1); if (wfetch == null) { currFormSet = new HashSet(); currFormSet.add(w.getSupertag()); wdmap.put(currForm, new Pair>(new Integer(1), currFormSet)); } else { currFormFetch = ((Pair>) wfetch); currFormSet = currFormFetch.b; currFormSet.add(w.getSupertag()); wdmap.put(currForm, new Pair>(new Integer(currFormFetch.a.intValue() + 1), currFormSet)); } Object pfetch = posmap.get(currPOS); if (pfetch == null) { currPOSSet = new HashSet(); currPOSSet.add(w.getSupertag()); } else { currPOSSet = (Set) pfetch; currPOSSet.add(w.getSupertag()); } posmap.put(currPOS, currPOSSet); } } // now write out the dictionaries. 
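            // Each word entry lists the supertags observed with that word, keeping only
            // categories whose corpus-wide count reaches minCatFreq; the POS-based
            // dictionary below is written the same way.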
String wrd = null; Pair> lkup = null; wbr.write("\n"); wbr.write("\n"); for (Object wdobj : wdmap.keySet()) { wrd = (String) wdobj; lkup = (Pair>) (wdmap.get(wdobj)); wbr.write("\t\n"); for (String st : lkup.b) { if(catCount.get(st) >= minCatFreq) { wbr.write("\t\t " + forXML(st) + " \n"); } } wbr.write("\t\n"); } wbr.write(""); String pos = null; Set plkup = null; pbr.write("\n"); pbr.write("\n"); for (Object pobj : posmap.keySet()) { pos = (String) pobj; plkup = (Set) posmap.get(pobj); pbr.write("\t\n"); for (String st : plkup) { if(catCount.get(st) >= minCatFreq) { pbr.write("\t\t " + forXML(st) + " \n"); } } pbr.write("\t\n"); } pbr.write(""); // clean up. wbr.flush(); wbr.close(); pbr.flush(); pbr.close(); // done } catch (FileNotFoundException ex) { Logger.getLogger(TaggingDictionaryExtractor.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException e) { Logger.getLogger(TaggingDictionaryExtractor.class.getName()).log(Level.SEVERE, null, e); } } public static void main(String[] args) throws Exception { String usage ="\nTaggingDictionaryExtractor -i -f -p -w \n\n"; if(args.length > 0 && args[0].equals("-h")) { System.out.print(usage); System.exit(0); } String inputCorp = null, wOutput = null, pOutput = null; // how frequently must a supertag category have been seen to be included in the dictionary? int catFreq = 10; for(int i = 0; i < args.length; i++) { if(args[i].equals("-i")) {inputCorp = args[++i]; continue;} if(args[i].equals("-w")) {wOutput = args[++i]; continue;} if(args[i].equals("-p")) {pOutput = args[++i]; continue;} if(args[i].equals("-f")) {catFreq = Integer.parseInt(args[++i]); continue; } System.err.println("Unknown command-line option: "+args[i]); } File in = new File(inputCorp); File wout = new File(wOutput); File pout = new File(pOutput); TaggingDictionaryExtractor tde = new TaggingDictionaryExtractor(in, wout, pout, "SRILM", catFreq); tde.extract(); } } ================================================ FILE: src/opennlp/ccg/parse/tagger/Constants.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger; import opennlp.ccg.lexicon.Word; /** * @author Dennis N. 
Mehay */ public final class Constants { public static final Double one = new Double(1.0); public static final Double zero = new Double(0.0); public static final TaggedWord OOB = new TaggedWord(Word.createWord("OOS", null, null, "OOS", "OOS", "OOS", null)); public static enum Domain {PROB, LOGPROB}; public static enum TaggingAlgorithm {FORWARDBACKWARD, FORWARD}; } ================================================ FILE: src/opennlp/ccg/parse/tagger/ProbIndexPair.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger; /** * @author Dennis N. Mehay * @version $Revision: 1.1 $, $Date: 2010/09/21 04:12:41 $ */ public class ProbIndexPair implements Comparable { public Double a; public Integer b; public ProbIndexPair(Double a, Integer b) { this.a=a; this.b=b; } public int compareTo(ProbIndexPair p) { return (-1 * (this.a).compareTo(p.a)); } public static void main(String[] args) { ProbIndexPair p1 = new ProbIndexPair(new Double(4.0), new Integer(5)); ProbIndexPair p2 = new ProbIndexPair(new Double(3.0), new Integer(5)); ProbIndexPair p3 = new ProbIndexPair(new Double(2.0), new Integer(5)); ProbIndexPair p4 = new ProbIndexPair(new Double(4.0), new Integer(5)); System.out.println("p1 < p2? "+(p1.compareTo(p2)<0)); System.out.println("p2 < p3? "+(p2.compareTo(p3)<0)); System.out.println("p1 < p3? "+(p1.compareTo(p3)<0)); System.out.println("p1 == p4? "+(p1.compareTo(p4)==0)); System.out.println("p2 > p1? "+(p2.compareTo(p1)>0)); System.out.println("p3 > p2? "+(p3.compareTo(p2)>0)); System.out.println("p3 > p1? "+(p3.compareTo(p1)>0)); } } ================================================ FILE: src/opennlp/ccg/parse/tagger/TaggedWord.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// /** * A wrapper around {@code Word}s that can hold multitaggings (for POSs and for * supertags). */ package opennlp.ccg.parse.tagger; import java.util.List; import opennlp.ccg.lexicon.Word; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay */ public class TaggedWord { // multitaggings for POSs and supertags (resp). private List> postagging; private List> stagging; // old-timey Word that holds the word form (and potentially gold POS and supertag). private Word oldWord; /** Decorators for the core functionality of the underlying word. */ public String getSupertag() { return oldWord.getSupertag(); } public String getForm() { return oldWord.getForm(); } public String getPOS() { return oldWord.getPOS(); } /** Accessor for the underlying vanilla Word. */ public Word getWord() { return oldWord; } /** Constructor with a Word. */ public TaggedWord(Word wd) { oldWord = Word.createFullWord(wd, wd.getForm(), wd.getPOS(), wd.getSupertag(), wd.getSemClass()); } /** This does the obvious thing. */ public void setSupertagging(List> stagging) { this.stagging = stagging; } /** * Set the multi-POS tagging. * Also replace the underlying single-best tagging with the * first tag of the multitag list. */ public void setPOSTagging(List> postagging) { this.postagging = postagging; oldWord = Word.createFullWord(oldWord, oldWord.getForm(), this.postagging.get(0).b, oldWord.getSupertag(), oldWord.getSemClass()); } /** This does the obvious thing. */ public List> getSupertagging() { return stagging; } /** This does the obvious thing. */ public List> getPOSTagging() { return postagging; } /** Gets the gold-standard supertag. */ public String getGoldSuper() { return oldWord.getSupertag(); } /** Gets the gold-standard POS tag. */ public String getGoldPOS() { return oldWord.getPOS(); } } ================================================ FILE: src/opennlp/ccg/parse/tagger/io/CorpusIterator.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.io; import java.io.IOException; import java.util.Iterator; import java.util.List; import opennlp.ccg.lexicon.Word; /** * An interface that all file iterators must (should?) implement. * * @author Dennis N. 
Mehay * @version $Revision: 1.1 $, $Date: 2010/09/21 04:12:41 $ */ public interface CorpusIterator { public List next() throws IOException; public boolean hasNext(); public void close(); public Iterator> iterator(); } ================================================ FILE: src/opennlp/ccg/parse/tagger/io/PipeDelimitedFactoredBundleCorpusIterator.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.io; import java.io.BufferedReader; import java.io.IOException; import java.util.Iterator; import java.util.List; import opennlp.ccg.lexicon.Tokenizer; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.supertagger.util.PipedTokenizer; /** * Assuming an input file of n different sentences of the form: * wordbundle1 wordbundle2 ... wordbundleM * ... [n-2 lines] * wordbundle1 ... wordbundleQ * * where the 'wordbundle's are SRILM factored LM-compliant * bundles of factors (wordform, lemma, POS, supertag, semantic class, etc.). * * We assume one sentence per line, so the ... bracketing is just * a formality (what SRILM expects). * * @author Dennis N. Mehay * @version $Revision: 1.2 $, $Date: 2010/09/26 05:50:15 $ */ public class PipeDelimitedFactoredBundleCorpusIterator implements CorpusIterator, Iterator>, Iterable> { private BufferedReader reader; private String nextLine, nextID; public static final String SENT_START = "", SENT_END = ""; private Tokenizer toker = new PipedTokenizer(); /** Creates a new instance of SRILMFactoredBundleCorpusIterator */ public PipeDelimitedFactoredBundleCorpusIterator(BufferedReader file) { try { this.reader = file; String line = this.reader.readLine(); //while (line.length() == 0 && line != null) { // line = this.reader.readLine(); //} if (line != null && line.length() > 0) { line = line.trim(); this.nextLine = line; } else { this.nextLine = this.nextID = null; } } catch (Exception e) { e.printStackTrace(); } } /** * The client of this method is responsible for checking that there * is in fact a next line (by calling hasNext before * calling this method. * * @return A String representing the next line in the * file. * @throws java.io.IOException. 
*/ public List next() { List currentSent = null; try { if (this.hasNext()) { String line = this.reader.readLine(); //while(line != null && (line.length()==0 || line.trim().equals(""))) { // line = this.reader.readLine(); //} if (line != null) { line = line.trim(); } if (this.nextLine.endsWith(SENT_END)) { currentSent = toker.tokenize(this.nextLine.substring(this.nextLine.indexOf(">") + 1, this.nextLine.lastIndexOf("<")).trim()); } else { currentSent = toker.tokenize(this.nextLine.substring(this.nextLine.indexOf(">") + 1).trim()); } if (line != null && !(line.trim().equals(""))) { this.nextLine = line; } else { this.nextLine = this.nextID = null; } } else { throw new IOException("There is no next line."); } } catch (IOException ex) { java.util.logging.Logger.getLogger("global").log(java.util.logging.Level.SEVERE, ex.getMessage(), ex); } return currentSent; } /** * @return A String representing the current parse ID * (a la CCGbank). */ public String getCurrentID() { return this.nextID; } /** * @return A boolean as to whether there is a next line * in the file. */ public boolean hasNext() { return this.nextLine != null; } /** * Closes the underlying BufferedReader. * */ public void close() { try { this.reader.close(); } catch (IOException ex) { java.util.logging.Logger.getLogger("global").log(java.util.logging.Level.SEVERE, ex.getMessage(), ex); } } public void remove() { throw new UnsupportedOperationException("Not supported yet."); } public Iterator> iterator() { return this; } } ================================================ FILE: src/opennlp/ccg/parse/tagger/io/SRILMFactoredBundleCorpusIterator.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.io; import java.io.BufferedReader; import java.io.IOException; import java.util.Iterator; import java.util.List; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Tokenizer; import opennlp.ccg.lexicon.Word; /** * Assuming an input file of n different sentences of the form: * wordbundle1 wordbundle2 ... wordbundleM * ... [n-2 lines] * wordbundle1 ... wordbundleQ * * where the 'wordbundle's are SRILM factored LM-compliant * bundles of factors (wordform, lemma, POS, supertag, semantic class, etc.). * * We assume one sentence per line, so the ... bracketing is just * a formality (what SRILM expects). * * @author Dennis N. 
Mehay * @version $Revision: 1.2 $, $Date: 2010/09/26 05:50:15 $ */ public class SRILMFactoredBundleCorpusIterator implements CorpusIterator, Iterator>, Iterable> { private BufferedReader reader; private String nextLine, nextID; public static final String SENT_START = "", SENT_END = ""; private Tokenizer toker = new DefaultTokenizer(); /** Creates a new instance of SRILMFactoredBundleCorpusIterator */ public SRILMFactoredBundleCorpusIterator(BufferedReader file) { try { this.reader = file; String line = this.reader.readLine(); if (line != null && line.length() > 0) { line = line.trim(); this.nextLine = line; } else { this.nextLine = this.nextID = null; } } catch (Exception e) { e.printStackTrace(); } } /** * The client of this method is responsible for checking that there * is in fact a next line (by calling hasNext before * calling this method. * * @return A String representing the next line in the * file. * @throws java.io.IOException. */ public List next() { List currentSent = null; try { if (this.hasNext()) { String line = this.reader.readLine(); //while(line != null && (line.length()==0 || line.trim().equals(""))) { // line = this.reader.readLine(); //} if (line != null) { line = line.trim(); } if (this.nextLine.endsWith(SENT_END)) { currentSent = toker.tokenize(this.nextLine.substring(this.nextLine.indexOf(">") + 1, this.nextLine.lastIndexOf("<")).trim()); } else { currentSent = toker.tokenize(this.nextLine.substring(this.nextLine.indexOf(">") + 1).trim()); } if (line != null && !(line.trim().equals(""))) { this.nextLine = line; } else { this.nextLine = this.nextID = null; } } else { throw new IOException("There is no next line."); } } catch (IOException ex) { java.util.logging.Logger.getLogger("global").log(java.util.logging.Level.SEVERE, ex.getMessage(), ex); } return currentSent; } /** * @return A String representing the current parse ID * (a la CCGbank). */ public String getCurrentID() { return this.nextID; } /** * @return A boolean as to whether there is a next line * in the file. */ public boolean hasNext() { return this.nextLine != null; } /** * Closes the underlying BufferedReader. * */ public void close() { try { this.reader.close(); } catch (IOException ex) { java.util.logging.Logger.getLogger("global").log(java.util.logging.Level.SEVERE, ex.getMessage(), ex); } } public void remove() { throw new UnsupportedOperationException("Not supported yet."); } public Iterator> iterator() { return this; } } ================================================ FILE: src/opennlp/ccg/parse/tagger/ml/MaxentModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.ml; import java.util.Collection; import opennlp.ccg.util.Pair; /** * @author Dennis N. Mehay * @version $Revision: 1.1 $, $Date: 2010/09/21 04:12:41 $ */ public interface MaxentModel { public double[] eval(Collection> context); public String getOutcome(int indexOfOutcome); } ================================================ FILE: src/opennlp/ccg/parse/tagger/ml/TaggerFeature.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.ml; /** * @author Dennis N. Mehay */ public class TaggerFeature { public String name; public Double activation; public TaggerFeature(String name, Double activation) { this.name = name.intern(); this.activation = activation; } } ================================================ FILE: src/opennlp/ccg/parse/tagger/ml/ZLMEM.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.ml; import java.io.File; import java.util.Collection; import opennlp.ccg.util.Pair; import opennlp.ccg.parse.tagger.Constants; /** * Decorates ZLMaxentModel, making it a MaxentModel. * * @author Dennis N. Mehay * @version $Revision: 1.2 $, $Date: 2010/09/26 05:50:15 $ */ public class ZLMEM extends ZLMaxentModel implements MaxentModel { public ZLMEM(File model) { super(model); } /** * @param context: A collection of String,Double pairs, representing the contextual input * features and their activations. 
* @return a double[] which represents a probability distribution over output classes, each * retrievable by its index with getOutcome(index); */ public double[] eval(Collection> context) { // Have to turn a collection of pairs into a String[] of feature:activation Strings. // Sloppy and inefficient. // TODO: A better solution would be to refactor ZLMaxentModel (DNM) String[] inpt = new String[context.size()]; int index = -1; for(Pair inp : context) { inpt[++index] = inp.a + ":" + inp.b; } return super.eval(context, true, Constants.Domain.PROB); } } ================================================ FILE: src/opennlp/ccg/parse/tagger/ml/ZLMaxentModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.ml; /* A nearly literal translation of Zhang Le's pymaxent.py file * into Java (D.N. Mehay). */ import opennlp.ccg.parse.tagger.Constants.Domain; import opennlp.ccg.parse.tagger.Constants; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Map; import opennlp.ccg.util.Pair; public class ZLMaxentModel { public boolean verbose = false; private ItemMap predMap = null; private ItemMap outComeMap = null; ArrayList>> paramsMap; private boolean loaded = false; private double[] probs; private int n_outcome; // the parameters. private double[] theta; private Double one = Constants.one; public ZLMaxentModel() { } public ZLMaxentModel(File model) { load(model); } public void load(File modelFile) { if (!loaded) { loaded = true; BufferedReader br = null; try { br = new BufferedReader(new FileReader(modelFile)); String line = br.readLine(); if (line.contains("#")) { line = br.readLine(); } if (verbose) System.err.println("\nReading predicates..."); // Read in contextual predicates. int numPreds = Integer.parseInt(line); predMap = new ItemMap(); // read in predicates... for (int i = 0; i < numPreds; i++) { line = br.readLine(); predMap.add(line); } if (verbose) System.err.println("Reading outcomes..."); outComeMap = new ItemMap(); // Read in outcomes (labels). line = br.readLine(); int numOutcomes = Integer.parseInt(line); for (int j = 0; j < numOutcomes; j++) { line = br.readLine(); outComeMap.add(line); } if (verbose) System.err.println("Reading parameters..."); // Read parameters. 
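                    // Format of this section (one line per contextual predicate): the first token
                    // is the number of (outcome, parameter) associations for the predicate, and the
                    // remaining tokens are the outcome ids. Parameter ids are assigned sequentially
                    // in reading order and index into the theta array loaded next.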
int numParameters = predMap.size(); paramsMap = new ArrayList>>(numParameters); ArrayList> prms; int fid = 0; String ln = ""; for (int q = 0; q < numParameters; q++) { ln = br.readLine(); String[] lineParts = ln.split(" "); prms = new ArrayList>(Integer.parseInt(lineParts[0])); Integer oid; for (int p = 1; p < lineParts.length; p++) { oid = Integer.valueOf(lineParts[p]); prms.add(new Pair(oid, Integer.valueOf(fid))); fid++; } paramsMap.add(prms); } // Load theta. int nTheta = Integer.valueOf(br.readLine()); if (verbose) System.err.println("Number of parameters: " + nTheta); theta = new double[nTheta]; for (int z = 0; z < theta.length; z++) { theta[z] = Double.parseDouble(br.readLine()); } n_outcome = outComeMap.size(); // Initialise the array for computing distribution over all labels. probs = new double[n_outcome]; if (verbose) System.err.println("Number of outcomes: " + n_outcome); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { br.close(); } catch (IOException ioe) { ioe.printStackTrace(); } } } } public String getBestOutcome(double[] probs) { double maxprob = 0.0; int maxidx = -1; for (int i = 0; i < probs.length; i++) { if (probs[i] > maxprob) { maxidx = i; maxprob = probs[i]; } } return getOutcome(maxidx); } public double[] eval(Collection> context, boolean realValued) { return eval(context, realValued, Domain.PROB); } public double[] eval(Collection> context, boolean realValued, Domain domain) { // Zero out prob distribution over labels. for (int i = 0; i < probs.length; i++) { probs[i] = 0.0; // build up exponentiated scores. } for (Pair pv : context) { Integer predID = predMap.id(pv.a); if (predID != null) { ArrayList> featClassAssocs = paramsMap.get(predID.intValue()); for (Pair classAndAssoc : featClassAssocs) { if (pv.b == one) { // ln(exp(lambda * 1)) = ln(exp(lambda)^1) = ln(exp(lambda)) = lambda probs[classAndAssoc.a.intValue()] += theta[classAndAssoc.b.intValue()]; } else { // ln(exp(lambda * )) = ln(exp(lambda)^) probs[classAndAssoc.a.intValue()] += Math.log(Math.exp(theta[classAndAssoc.b.intValue()] * pv.b.doubleValue())); } } } } double sum = 0.0; // exponentiate the numerators for the denomenator sum. for (int p = 0; p < probs.length; p++) { sum += Math.exp(probs[p]); } sum = Math.log(sum); for (int q = 0; q < probs.length; q++) { probs[q] -= sum; } if (domain == Domain.PROB) { // translate back from the log domain. 
for (int q = 0; q < probs.length; q++) { probs[q] = Math.exp(probs[q]); } } return probs; } public String getOutcome(int index) { return outComeMap.getItem(index); } } class ItemMap { private Integer index; private Map dict = new HashMap(); private Map reverseDict = new HashMap(); public ItemMap() { index = Integer.valueOf(0); } public int add(String item) { if (dict.containsKey(item)) { return dict.get(item); } else { dict.put(item, index); reverseDict.put(index, item); index = Integer.valueOf(index.intValue() + 1); return index.intValue() - 1; } } public Integer id(String item) { if (dict.containsKey(item)) { return dict.get(item); } else { return null; } } public int size() { return dict.size(); } public String getItem(int i) { return reverseDict.get(Integer.valueOf(i)); } } class IntegerPool { private Integer[] _table; public IntegerPool(int size) { _table = new Integer[size]; for (int i = 0; i < size; i++) { _table[i] = new Integer(i); } } public Integer getInt(int i) { if (i < _table.length && i >= 0) { return _table[i]; } else { return new Integer(i); } } } ================================================ FILE: src/opennlp/ccg/parse/tagger/sequencescoring/Backpointer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.sequencescoring; import java.util.List; /** * An abstraction for lists of backpointers in trellises, lattices, etc. * Each backpointer is a List of Integer's that give backpointers to the j-th highest * scoring paths (where 1 <= j <= N==len(Backpointer) are the indices of the * internal list of the backpointer). * * @author Dennis N. Mehay */ public class Backpointer { private List bkpts; public Backpointer(List bkpts) { this.bkpts = bkpts; } public List getBkpts() { return bkpts; } public Integer get(int i) { return bkpts.get(i); } public int size() { return bkpts.size(); } } ================================================ FILE: src/opennlp/ccg/parse/tagger/sequencescoring/FBNode.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.sequencescoring; import java.util.List; /** * A node in a forward-backward lattice. * * Holds the current label, the forward score, backward score, a list of Doubles representing * this node's contribution to the scores of the following nodes in the next * step of the lattice, a list of Strings representing the Markov history of the * optimal sequence leading up to this node, and, finally, holds a ranked array of * backpointers to the n-best optimal predecessor nodes. * * @author Dennis N. Mehay */ public class FBNode { /** How far back of a Markov history window do we have? */ public int markovHistSize; /** The label at this node. */ public String label; /** The (normalised) sum of the log-probabilites of all paths leading to this node. */ public double forwardScore = 0.0; /** The (normalised) sum of the log-probabilites of all paths starting at this node. */ public double backwardScore = 0.0; /** * How does this node contribute to each of the nodes in the next time step in * the lattice? */ public List forwardContributions; /** The list of the optimal Markov history. */ public List markovHist; /** * A list of backpointers to the nodes in the previous time step (ranked in order * of how likely the sequence including them leading to this node is). */ public List backpointers; /** * Empty constructor. Default Markov history of length 2. * All other values are set directly in the fields as they are calculated. */ public FBNode() { this(2); } /** Constructor that only specifies Markov history size. */ public FBNode(int markovHistSize) { this.markovHistSize = markovHistSize; } } ================================================ FILE: src/opennlp/ccg/parse/tagger/sequencescoring/SequenceScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.sequencescoring; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.logging.Level; import java.util.logging.Logger; import opennlp.ccg.lexicon.Word; import opennlp.ccg.ngrams.StandardNgramModel; import opennlp.ccg.util.Interner; import opennlp.ccg.util.Pair; import opennlp.ccg.parse.tagger.ProbIndexPair; import opennlp.ccg.parse.tagger.Constants; /** * Initialise with a language model over the output sequences and, * given a List of List>'s initially tagged with "observation" * probabilities (output probabilities only based on local features), * return the forward-pass re-estimated probabilites of the output * classes. * * @author Dennis N. Mehay */ public class SequenceScorer extends StandardNgramModel { /** * A Trellis to hold sequence labels (wrapped in Word classes) * functionality. */ private Trellis seqLabs; /** Trellis for initial observation model scores. */ private Trellis initScores; /** Trellis for forward-backward re-estimated scores. */ private Trellis fbScores; /** Trellis of back-pointers (for retrieving n-best sequences). */ private Trellis backPointers; /** How many of the previous (following) best predictions make it into the forward (or backward) search? */ private int searchBeam = 200; /** Re-usable private data structures. */ private List> tmpInitScores = new ArrayList>(500); private List> tmpFwdScores = new ArrayList>(500); private List> tmpSeqLabs = new ArrayList>(500); private List> tmpBkpointers = new ArrayList>(500); /** For interning Word's */ private Interner words = new Interner(); private Constants.TaggingAlgorithm alg = Constants.TaggingAlgorithm.FORWARDBACKWARD; /** Create a ForwardScorer with a sequence model (over supertags, POSs tags, words, etc.) */ public SequenceScorer(int order, String lmFile) throws IOException { super(order, lmFile); } /** * A utility method for finding the order of n-gram models (by reading in the ARPA-formatted file. * (A bit messy, I know.) */ public static int findOrder(String tagSequenceModel) { // find n-gram order of sequence model. BufferedReader reader = null; String ln = null; int ord = 0; try { reader = new BufferedReader(new FileReader(new File(tagSequenceModel))); ln = reader.readLine(); reader = new BufferedReader(new FileReader(new File(tagSequenceModel))); while (ln != null && !ln.startsWith("\\data\\")) { ln = reader.readLine(); } ln = reader.readLine(); while (ln != null & ln.startsWith("ngram ")) { ord = Integer.parseInt(ln.split(" ")[1].split("=")[0]); ln = reader.readLine(); } reader.close(); } catch (FileNotFoundException fnfe) { Logger.getLogger(SequenceScorer.class.getName()).log(Level.SEVERE, null, fnfe); } catch (IOException ioe) { Logger.getLogger(SequenceScorer.class.getName()).log(Level.SEVERE, null, ioe); } return ord; } /** Set the tagging algorithm (with one of {forward-backward, forward}). */ public void setAlgorithm(Constants.TaggingAlgorithm newAlg) { alg = newAlg; } /** * Set the maximum width of the number of previous hypothesized tags to consider * in the forward probabilities. */ public void setSearchBeam(int newBeam) { searchBeam = newBeam; } /** Rescore an observation sequence of (initially) supertagged Word's using the sequence model. 
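     * Each inner list holds one word's candidate tags paired with their observation
     * (log-)probabilities; the same structure is returned with the probabilities
     * re-estimated by the forward(-backward) pass and re-sorted, best tag first.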
*/ public List>> rescoreSequence(List>> observationSequence) { // build up initial trellises. tmpInitScores.clear(); tmpFwdScores.clear(); tmpSeqLabs.clear(); tmpBkpointers.clear(); for (List> tw : observationSequence) { ArrayList scrs = new ArrayList(tw.size()); ArrayList fscs = new ArrayList(tw.size()); ArrayList sLabs = new ArrayList(tw.size()); ArrayList bpts = new ArrayList(tw.size()); for (Pair tagging : tw) { // add observation score, and convert to log-prob domain, if needed. scrs.add((tagging.a > 0) ? Math.log(tagging.a) : tagging.a); fscs.add(null); sLabs.add(words.intern(Word.createWord(tagging.b, null, null, null, null, null, null))); bpts.add(null); } tmpInitScores.add(scrs); tmpSeqLabs.add(sLabs); tmpFwdScores.add(fscs); tmpBkpointers.add(bpts); } initScores = new Trellis(tmpInitScores); // these are initially null. fbScores = new Trellis(tmpFwdScores); // these are too. backPointers = new Trellis(tmpBkpointers); seqLabs = new Trellis(tmpSeqLabs); // forward loop. // for each word... for (int u = 0; u < observationSequence.size(); u++) { List> tw = observationSequence.get(u); double normTot = 0.0; // for each of its tags within the search beam. for (int v = 0; v < tw.size(); v++) { Word currTag = seqLabs.getCoord(u, v); List bestHist = null; Double seqScore = null; Double obsScore = initScores.getCoord(u, v); if (u == 0) { // beginning of sequence. bestHist = getBestHist(u, v, order); bestHist.add(currTag); seqScore = lmScore(bestHist); double fs = seqScore + obsScore; normTot += Math.exp(fs); fbScores.setCoord(u, v, fs); } else { // use dynamic programming-computed scores to progress. List> prevTaggedWord = observationSequence.get(u - 1); ProbIndexPair[] bestPrevScores = new ProbIndexPair[Math.min(prevTaggedWord.size(), searchBeam)]; for (int z = 0; z < Math.min(prevTaggedWord.size(), searchBeam); z++) { bestHist = getBestHist(u - 1, z, order - 1); bestHist.add(currTag); seqScore = lmScore(bestHist); double fs = fbScores.getCoord(u - 1, z) + seqScore; fs += obsScore; bestPrevScores[z] = new ProbIndexPair( Double.valueOf(fs), Integer.valueOf(z)); } // sort descending based on score. Arrays.sort(bestPrevScores); // add up the prob's of all sequences leading to this node. double fsum = 0.0; for (int q = 0; q < bestPrevScores.length; q++) { fsum += Math.exp(bestPrevScores[q].a); } normTot += fsum; //fbScores.setCoord(u, v, bestPrevScores[0].a.doubleValue()); fbScores.setCoord(u, v, Math.log(fsum)); // add n-best backpointers. List bks = new ArrayList(bestPrevScores.length); for (int q = 0; q < bestPrevScores.length; q++) { bks.add(bestPrevScores[q].b); } backPointers.setCoord(u, v, new Backpointer(bks)); } } // normalise. for (int v = 0; v < tw.size(); v++) { fbScores.setCoord(u, v, Math.log(Math.exp(fbScores.getCoord(u, v)) / normTot)); } } // backward loop. int size = observationSequence.size(); if (alg == Constants.TaggingAlgorithm.FORWARDBACKWARD) { // for each word... for (int u = size - 1; u >= 0; u--) { List> tw = observationSequence.get(u); double normTot = 0.0; // for each of its tags... for (int v = 0; v < tw.size(); v++) { List bestHist = null; Double obsScore = initScores.getCoord(u, v); if (u == (size - 1)) { // right-hand end of sequence. bestHist = getBestHist(u, v, order - 1); bestHist.add(words.intern(Word.createWord("", null, null, null, null, null, null))); double bsc = fbScores.getCoord(u, v) + obsScore; normTot += Math.exp(bsc); fbScores.setCoord(u, v, bsc); } else { // use dynamic programming-computed scores to progress backwards. 
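                        // Backward update in the log domain: the new score for tag v of word u is
                        //   obsScore(u,v) + log( sum_z exp( lmScore(hist(u,v) . tag(u+1,z)) + score(u+1,z) ) ),
                        // summing over the candidate tags z of the following word, where score(u+1,z)
                        // holds the already-normalised forward score from the forward pass.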
bestHist = getBestHist(u, v, order - 1); List> followingTaggedWd = observationSequence.get(u + 1); double backwardSum = 0.0; for (int z = 0; z < followingTaggedWd.size(); z++) { Word followingTag = words.intern(Word.createWord(followingTaggedWd.get(z).b.intern(), null, null, null, null, null, null)); if (z > 0) { bestHist.remove(bestHist.size() - 1); } bestHist.add(followingTag); backwardSum += Math.exp(lmScore(bestHist) + fbScores.getCoord(u + 1, z)); } double newSc = Math.log(backwardSum) + obsScore; normTot += Math.exp(newSc); fbScores.setCoord(u, v, newSc); } } // normalise. for (int v = 0; v < tw.size(); v++) { fbScores.setCoord(u, v, Math.log(Math.exp(fbScores.getCoord(u, v)) / normTot)); } } } // re-sort based on re-estimated scores. for (int i = 0; i < observationSequence.size(); i++) { ProbIndexPair[] fwdScrs = new ProbIndexPair[observationSequence.get(i).size()]; List> tagging = observationSequence.get(i); for (int j = 0; j < tagging.size(); j++) { double probP = Math.exp(fbScores.getCoord(i, j).doubleValue()); fwdScrs[j] = new ProbIndexPair(probP, new Integer(j)); } Arrays.sort(fwdScrs); List> newTagging = new ArrayList>(fwdScrs.length); for (int z = 0; z < fwdScrs.length; z++) { Double renorm = new Double(fwdScrs[z].a.doubleValue()); if (renorm.equals(Constants.one)) { renorm = Constants.one; } newTagging.add(new Pair(renorm, tagging.get(fwdScrs[z].b.intValue()).b)); } observationSequence.set(i, newTagging); } return observationSequence; } /** * Use the LM to score a sequence of words. */ private double lmScore(List seq) { setWordsToScore(seq, false); prepareToScoreWords(); return logprob(); } /** Follow the back-pointers to get the best sequence of up to length 'order' leading up to cell (i,j). */ private List getBestHist(int i, int j, int order) { int size = Math.max(order, 0); List retVal = null; Backpointer bp = backPointers.getCoord(i, j); if (i == -1) { // base case (off of the end of the sequence). retVal = new ArrayList(size); retVal.add(words.intern(Word.createWord("", null, null, null, null, null, null))); return retVal; } else if (i == 0) { // base case (at beginning of sequence) retVal = getBestHist(i - 1, 0, order - 1); retVal.add(seqLabs.getCoord(i, j)); return retVal; } else if (order == 0) { // base case (reached back as far as the n-gram model will need to see). retVal = new ArrayList(size); return retVal; } else { // recursive case. retVal = getBestHist(i - 1, bp.get(0).intValue(), order - 1); retVal.add(seqLabs.getCoord(i, j)); return retVal; } } } ================================================ FILE: src/opennlp/ccg/parse/tagger/sequencescoring/Trellis.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.sequencescoring; import java.util.ArrayList; import java.util.List; /** * A Trellis for sequence coding (of supertags, e.g.). * * @author Dennis N. Mehay */ public class Trellis { /** The dimensions of the Trellis. */ private int cols, rows; /** The actual nuts and bolts of the Trellis. */ private ArrayList> trellis; /** Constructor with passed-in list of lists. */ public Trellis(List> inpt) { this.reshape(inpt.size(), inpt.get(0).size(), inpt); } /** Constructor with dimensions. */ public Trellis(int cols, int rows, A dummy) { List> tr = new ArrayList>(cols); for(int i = 0; i < cols; i++) { ArrayList tmp = new ArrayList(rows); for(int j = 0; j < rows; j++) { tmp.add(dummy); } tr.add(tmp); } reshape(cols, rows, tr); } /** * Reshape the dimensions (e.g., to accomodate a new sequence with a * particular max beam width */ public void reshape(int cols, int rows, List> inpt) { this.cols = cols; this.rows = rows; this.trellis = new ArrayList>(cols); for(List la : inpt) { ArrayList row = new ArrayList(rows); for(A a : la) { row.add(a); } this.trellis.add(row); } } /** What is the max beam width? */ public int getWidth() { return rows; } /** What is the length of the sequence? */ public int getLength() { return cols; } /** Clear out values in the trellis. */ public void clear() { for(int i = 0; i < cols; i++) { this.trellis.add(new ArrayList(rows)); } } /** Get the sequence options at index i. */ public ArrayList getOptions(int i) { return this.trellis.get(i); } /** Get a node in the Trellis (referenced by 2D coordinate). */ public A getCoord(int i, int j) { try {return this.trellis.get(i).get(j); } catch(IndexOutOfBoundsException iobe) { return null; } } /** Set the value at a node in the Trellis (referenced by 2D coordinate) */ public void setCoord(int i, int j, A val) { this.trellis.get(i).set(j, val); } } ================================================ FILE: src/opennlp/ccg/parse/tagger/util/CCGBankToSRILMFLM.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * Read in the CCGbank (auto format), line by line, and transform each lexical * item () into an SRILM factored LM bundle format: * W-word:S-word:P-pos1:T:cat1, where every thing has been escaped (e.g., colons), * "W" stands for word form, "S" for stem, "P" for POS and "T" for super_t_ag. 
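 * For illustration (the word and category here are hypothetical; exact escaping is done by
 * DefaultTokenizer.escape below): a leaf for the word "buy" with POS "VB" and supertag
 * "(S[b]\NP)/NP" would come out roughly as
 *   W-buy:S-buy:P-VB:T-(S[b]\NP)/NP
 * noting that the code uses the word form for both the W and S factors.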
*/ package opennlp.ccg.parse.tagger.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.regex.Matcher; import java.util.regex.Pattern; import opennlp.ccg.lexicon.DefaultTokenizer; /** * @author Dennis N. Mehay */ public class CCGBankToSRILMFLM { public static void main(String[] args) throws FileNotFoundException, IOException { String usage = "\nCCGBankToSRILMFLM -input -o \n"; if (args.length > 0 && args[0].equals("-h") || args.length == 0) { System.out.println(usage); System.exit(0); } BufferedReader reader = null; BufferedWriter writer = null; String inputCorp = "train.auto", output = "train.srilm"; for (int i = 0; i < args.length; i++) { if (args[i].equals("-i")) {inputCorp = args[++i]; continue;} if (args[i].equals("-o")) {output = args[++i]; continue;} System.out.println("Unrecognized option: " + args[i]); } reader = new BufferedReader(new FileReader(new File(inputCorp))); writer = new BufferedWriter(new FileWriter(new File(output))); String parseIDHeader = "ID="; Pattern p = Pattern.compile("()+?"); String line = reader.readLine(); while(line != null) { if(line.startsWith(parseIDHeader)) {line = reader.readLine(); continue;} line = line.trim(); Matcher m = p.matcher(line); String word = null, pos = null, cat = null; int cnt = 0; while(m.find()) { String toks = m.group(); // {} String[] parts = toks.split(" "); word = parts[4]; pos = parts[2]; cat = parts[1]; if(cnt++ > 0) { writer.write(" "); } writer.write("W-"+DefaultTokenizer.escape(word)+":" +"S-"+DefaultTokenizer.escape(word)+":" +"P-"+DefaultTokenizer.escape(pos)+":" +"T-"+DefaultTokenizer.escape(cat)); } writer.write(System.getProperty("line.separator")); line = reader.readLine(); } writer.close(); reader.close(); } } ================================================ FILE: src/opennlp/ccg/parse/tagger/util/ConfigFileProcessor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2010 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed inp the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.util; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.*; //import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; /** * @author Dennis N. Mehay */ public class ConfigFileProcessor { /** Read in config file as a {@code Map}. 
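 * For illustration (the keys and values are hypothetical), a config file is a sequence of
 * key=value lines; blank lines and lines starting with '#' are skipped and keys are lower-cased:
 *   # tagger settings
 *   beta=0.1
 *   maxtags=4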
*/ public static Map readInConfig(String configFile) { return readInConfig(configFile, null); } /** * Read in config file as a {@code Map}, resolving the given path keys * relative to the config file if not absolute. */ public static Map readInConfig(String configFile, String pathKeys[]) { Set paths = Collections.emptySet(); if (pathKeys != null) paths = new HashSet(Arrays.asList(pathKeys)); BufferedReader cf = null; Map opts = new HashMap(); try { File infile = new File(configFile); File parentDir = infile.getParentFile(); cf = new BufferedReader(new FileReader(infile)); String ln = cf.readLine(); // map options to values. while (ln != null) { if (ln.trim().equals("") || ln.trim().startsWith("#")) { ln = cf.readLine(); continue; } String[] parts = ln.trim().split("="); String key = parts[0].trim().toLowerCase(); String val = parts[1].trim(); // resolve path keys if (paths.contains(key)) { File f = new File(parentDir, val); if (!f.exists()) { f = new File(val); if (!f.exists()) throw new FileNotFoundException("Can't resolve filename: " + val); } val = f.getPath(); } opts.put(key, val); ln = cf.readLine(); } } catch (FileNotFoundException ex) { Logger.getLogger(ConfigFileProcessor.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(ConfigFileProcessor.class.getName()).log(Level.SEVERE, null, ex); } finally { try { cf.close(); } catch (IOException ex) { Logger.getLogger(ConfigFileProcessor.class.getName()).log(Level.SEVERE, null, ex); } } return opts; } } ================================================ FILE: src/opennlp/ccg/parse/tagger/util/ResultSink.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Dennis N. Mehay // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.parse.tagger.util; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import opennlp.ccg.lexicon.Word; import opennlp.ccg.util.Pair; /** * Inspired (loosely, based on my recollection) by Jason Baldridge's * similar class for tracking classifier performance. * Here we simply track the Word-by-Word * tagging performance of a CCG supertagger by passing in a multitagging * and a gold-standard answer and tabulating the results. The results * are reported by a custom report method, which returns * a String representation of the results. * * @author Dennis N. Mehay * @version $Revision: 1.1 $, $Date: 2010/09/21 04:12:42 $ */ public class ResultSink { public static enum ResultSinkType { SUPERTAG, POSTAG }; private int totalTags = 0, totalWords = 0, totalRight = 0; // for keeping pos-specific stats. 
private Map posToRight = new HashMap(), posTot = new HashMap(); // for tracking the total number of sentences, number totally tagged // correctly, etc. private int sentNum = 0, sentsCorrect = 0; private boolean allCorrect = true; // what type of tag are we tracking the results over? private ResultSinkType whatType; // for general pos-specific stats (e.g., N... -> , not NNP -> and NNPS -> , etc.) private Map genPOSToRight = new HashMap(), genPOSTot = new HashMap(); /** * Nullary constructor. Defaults to supertag result sink. * (TODO: add log file logging for more detailed error reporting.) */ public ResultSink() { this(ResultSinkType.SUPERTAG); } public ResultSink(ResultSinkType whatType) { this.whatType = whatType; } /** * Add and store a sentence of tagged words (List>>) * wrt a gold-standard tagged word. */ public void addSent(List>> sent, List goldTagging) { sentNum++; allCorrect = true; Iterator gold = goldTagging.iterator(); for (List> tgging : sent) { addResult(tgging, gold.next()); } if (allCorrect) { sentsCorrect++; } } /** * Add a single-word tagging result alongside its gold-standard tagging. * Compare and log whether the gold-standard tag is in the beta-best (also * log pos-specific error stats). */ public void addResult(List> tagging, Word goldTagging) { String goldTag = (whatType == ResultSinkType.SUPERTAG) ? goldTagging.getSupertag() : goldTagging.getPOS(); totalTags += tagging.size(); totalWords++; // mww: check for missing gold POS (grrr) if (goldTagging.getPOS() == null) { System.err.println("Warning: found null gold POS, skipping word: " + goldTagging); this.allCorrect = false; return; } String thisPOS = goldTagging.getPOS(), thisGenPOS = goldTagging.getPOS().substring(0, 1); Integer posT = this.posTot.get(thisPOS), gPOST = this.genPOSTot.get(thisGenPOS); if (posT == null) { this.posTot.put(thisPOS, new Integer(1)); } else { this.posTot.put(thisPOS, new Integer(posT.intValue() + 1)); } if (gPOST == null) { this.genPOSTot.put(thisGenPOS, new Integer(1)); } else { this.genPOSTot.put(thisGenPOS, new Integer(gPOST.intValue() + 1)); } // assume this tagging is incorrect, until proven otherwise. boolean gotIt = false; for (Pair tag : tagging) { if (tag.b.equals(goldTag)) { gotIt = true; totalRight++; // add one both to the pos right and total for that pos type. Integer posLkup = this.posToRight.get(thisPOS), genPOSLkup = this.genPOSToRight.get(thisGenPOS); if (posLkup == null) { this.posToRight.put(thisPOS, new Integer(1)); } else { this.posToRight.put(thisPOS, new Integer(posLkup.intValue() + 1)); } if (genPOSLkup == null) { this.genPOSToRight.put(thisGenPOS, new Integer(1)); } else { this.genPOSToRight.put(thisGenPOS, new Integer(genPOSLkup.intValue() + 1)); } break; } } // mistagged this one word, so tagging the whole sentence correctly -- // allCorrect==true -- is not possible. if (!gotIt) { this.allCorrect = false; } } public String report() { // make sure 0 counts are inserted for POS types that were never got right. 
for (String post : this.posTot.keySet()) { if (this.posToRight.get(post) == null) { this.posToRight.put(post, new Integer(0)); } } for (String post : this.genPOSTot.keySet()) { if (this.genPOSToRight.get(post) == null) { this.genPOSToRight.put(post, new Integer(0)); } } String rep = ""; rep += "\n\nAccuracy by POS type:\n\n"; for (String post : this.posTot.keySet()) { rep += post + ": " + ((this.posToRight.get(post).intValue() + 0.0) / (this.posTot.get(post))) + " <==> " + this.posToRight.get(post).intValue() + "/" + (this.posTot.get(post)) + " = " + (100 * ((this.posTot.get(post) - this.posToRight.get(post) + 0.0) / (totalWords - totalRight))) + " (% of total errors) \n"; } rep += "\nAccuracy by general (truncated) POS type:\n\n"; for (String post : this.genPOSTot.keySet()) { rep += post + ": " + (this.genPOSToRight.get(post).intValue() + 0.0) / (this.genPOSTot.get(post)) + " <==> " + this.genPOSToRight.get(post).intValue() + "/" + (this.genPOSTot.get(post)) + " = " + (100 * ((this.genPOSTot.get(post) - this.genPOSToRight.get(post) + 0.0) / (totalWords - totalRight))) + " (% of total errors) \n"; } rep += "\nTotal words: " + totalWords + "\nTotal sents: " + this.sentNum + "\nAggregate total tags: " + totalTags + "\nAve. tags/word: " + ((totalTags + 0.0) / (totalWords + 0.0)) + "\nWord accuracy: " + ((totalRight + 0.0) / totalWords) + "\n" + "\nSent accuracy: " + ((this.sentsCorrect + 0.0) / (this.sentNum)) + "\n\n"; return rep; } } ================================================ FILE: src/opennlp/ccg/perceptron/Alphabet.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import java.util.*; import java.io.*; import opennlp.ccg.util.*; import opennlp.ccg.lexicon.DefaultTokenizer; /** * A bidirectional mapping between feature names and indices. * * An alphabet can be read from either an alphabet file or a model file. * An alphabet file starts with the number of features on one line, * followed by one line per feature pairing the feature name with its * frequency (which is ignored). * * An alphabet can be open or closed. An closed alphabet does not * accept new features, and thus can be used to filter out features * not already in the alphabet. * * The main routine filters the input event file to an output alphabet file, * optionally with a table size and pruning threshold. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2009/11/02 03:44:15 $ */ public class Alphabet { /** Feature as a node in a trie, with each node storing the feature index. */ public static class Feature extends TrieMap { /** Constructor with index. 
*/ public Feature(Integer index) { super(index); } /** Factory method, for adding empty child nodes. */ protected Feature createNode() { return new Feature(null); } /** Returns a string name by concatenating escaped keys using colons. */ public String name() { StringBuffer namebuf = new StringBuffer(); List keys = traceKeys(); for (int i=0; i < keys.size(); i++) { namebuf.append(DefaultTokenizer.escape(keys.get(i))); if (i+1 < keys.size()) namebuf.append(':'); } return namebuf.toString(); } /** Returns the index. */ public Integer getIndex() { return data; } } // mappings private Feature dict; private List dictInv; // size private int size = 0; // closed flag private boolean closed = false; /** Constructor with initial size. The alphabet is left open. */ public Alphabet(int size) { init(size); } /** Constructor to load an alphabet from a file. The alphabet is set to closed. */ public Alphabet(String filename) throws IOException { Reader reader = EventFile.openReader(new File(filename)); StreamTokenizer tokenizer = EventFile.initTokenizer(reader); tokenizer.nextToken(); int size = Integer.parseInt(tokenizer.sval); init(size); for (int i=0; i < size; i++) { tokenizer.nextToken(); add(tokenizer.sval); tokenizer.nextToken(); // skip freq or weight } reader.close(); closed = true; } // initializes dict, dictInv private void init(int size) { dict = new Feature(null); dictInv = new ArrayList(size); } /** Size. */ public int size() { return size; } /** Closed. */ public boolean closed() { return closed; } /** Set closed. */ public void setClosed(boolean closed) { this.closed = closed; } /** Add feature with given name, if not already present, returning added feature. */ public Feature add(String feat) { return add(parseKeys(feat)); } /** Add equivalent feature, if not already present, returning added feature. */ public Feature add(Feature f) { return add(f.traceKeys()); } /** Add feature with given keys, if not already present, returning added feature. */ public Feature add(List keys) { if (closed) throw new RuntimeException("Can't add to a closed alphabet!"); Feature node = (Feature) dict.findChildFromList(keys); return addNode(node); } /** Add feature with given keys, if not already present, returning added feature. */ public Feature addLazy(List> keyExtractors) { if (closed) throw new RuntimeException("Can't add to a closed alphabet!"); Feature node = (Feature) dict.findChildFromLazyList(keyExtractors); return addNode(node); } // adds a feature node private Feature addNode(Feature node) { if (node.data != null) return node; node.data = size++; dictInv.add(node); return node; } /** * Parses a feature name into a list of unescaped interned string keys, * breaking on colons. */ public static List parseKeys(String feat) { List retval = new ArrayList(); int current = 0; while (current < feat.length()) { int breakpos = feat.indexOf(":", current); String key; if (breakpos >= 0) { key = feat.substring(current, breakpos); current = breakpos + 1; } else { key = feat.substring(current); current = feat.length(); } retval.add(DefaultTokenizer.unescape(key).intern()); } return retval; } /** Get or add index of feature with given name (null if none when closed). */ public Feature index(String feat) { return index(parseKeys(feat)); } /** Get or add index of equivalent feature (null if none when closed). */ public Feature index(Feature f) { return index(f.traceKeys()); } /** Get or add index of feature with given keys (null if none when closed). 
*/ public Feature index(List keys) { if (!closed) return add(keys); Feature node = (Feature) dict.getChildFromList(keys); return node; } /** Get or add index of feature with given key extractors (null if none when closed). */ public Feature indexLazy(List> keyExtractors) { if (!closed) return addLazy(keyExtractors); Feature node = (Feature) dict.getChildFromLazyList(keyExtractors); return node; } /** Get indexed feature. */ public Feature feature(int index) { return dictInv.get(index); } /** * Creates an alphabet file from an event file, * saving it to a file with features sorted by frequency, * using the given table size and pruning threshold. **/ public static void createAlphabet(String eventfile, String alphabetfile, int tablesize, int pruningthreshold) throws IOException { // open files EventFile eventFile = new EventFile(eventfile); PrintWriter out = EventFile.openWriter(new File(alphabetfile)); // init freq tally Map freqTally = new HashMap(tablesize*2); // read event file, incrementing tallies EventFile.Block block; FeatureMap goldMap = new FeatureMap(); Set seenFeats = new HashSet(); while ( (block = eventFile.nextBlock()) != null ) { // set gold map (nb: assumes single gold event) goldMap.clear(); seenFeats.clear(); for (EventFile.Event event : block.events) { if (event.count > 0) goldMap.add(event.features); } // tally distinct feats not in gold map for (EventFile.Event event : block.events) { if (event.count > 0) continue; for (FeatureVector.Iterator it = event.features.iterator(); it.hasNext(); ) { Feature feat = it.nextFeature(); float val = it.nextValue(); seenFeats.add(feat); float goldVal = goldMap.get(feat); if (val != goldVal) { Integer tally = freqTally.get(feat); if (tally != null) freqTally.put(feat, ++tally); else freqTally.put(feat, 1); } } } // tally unseen feats from gold event for (EventFile.Event event : block.events) { if (event.count == 0) continue; for (FeatureVector.Iterator it = event.features.iterator(); it.hasNext(); ) { Feature feat = it.nextFeature(); if (!seenFeats.contains(feat)) { Integer tally = freqTally.get(feat); if (tally != null) freqTally.put(feat, ++tally); else freqTally.put(feat, 1); } } } } // get tallies passing frequency threshold List> tallies = new ArrayList>(freqTally.size()); if (pruningthreshold > 0) { for (Map.Entry entry : freqTally.entrySet()) { if (entry.getValue() >= pruningthreshold) tallies.add(entry); } } else tallies.addAll(freqTally.entrySet()); // sort tallies by descending frequency // (further sorting alphabetically seems to take too long) Collections.sort( tallies, new Comparator>() { public int compare(Map.Entry entry1, Map.Entry entry2) { int val1 = entry1.getValue(); int val2 = entry2.getValue(); if (val1 > val2) return -1; if (val1 < val2) return 1; return 0; //return entry1.getKey().name().compareTo(entry2.getKey().name()); } } ); // write tallied features to file int size = tallies.size(); out.println(Integer.toString(size)); for (int i=0; i < size; i++) { Map.Entry entry = tallies.get(i); out.println(entry.getKey().name() + " " + entry.getValue()); } // close files eventFile.close(); out.close(); } /** Main routine for filtering event file to an alphabet file. 
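 * For example (file names illustrative), an invocation along the lines of
 *   java opennlp.ccg.perceptron.Alphabet events.gz alphabet.gz -p 5
 * writes an alphabet keeping only features whose tallied frequency is at least 5.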
*/ public static void main(String[] args) throws IOException { if (args.length < 2) { System.out.println("Usage: java perceptron.Alphabet (-s ) (-p "); System.exit(0); } String eventfile = args[0]; String alphabetfile = args[1]; int tablesize = 1000000; int pruningthreshold = 0; for (int i=2; i < args.length; i++) { if (args[i].equals("-s")) tablesize = Integer.valueOf(args[++i]); if (args[i].equals("-p")) pruningthreshold = Integer.valueOf(args[++i]); } System.out.println("Writing alphabet to " + alphabetfile + " from event file " + eventfile); if (pruningthreshold > 0) System.out.println("with pruning threshold " + pruningthreshold); createAlphabet(eventfile, alphabetfile, tablesize, pruningthreshold); } } ================================================ FILE: src/opennlp/ccg/perceptron/ComposedFeatureExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import java.util.ArrayList; import opennlp.ccg.synsem.*; /** * Class for composing feature extractors. * Features from the component feature extractors are assumed to be independent. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2011/01/15 17:52:59 $ */ public class ComposedFeatureExtractor implements FeatureExtractor { /** The feature extractors. */ public final FeatureExtractor[] featureExtractors; /** Constructor. */ public ComposedFeatureExtractor(FeatureExtractor[] featureExtractors) { this.featureExtractors = featureExtractors; } /** Binary constructor. */ public ComposedFeatureExtractor(FeatureExtractor featureExtractor1, FeatureExtractor featureExtractor2) { this.featureExtractors = new FeatureExtractor[]{ featureExtractor1, featureExtractor2 }; } /** Constructor for sign scorers, some of which may be feature extractors. */ public ComposedFeatureExtractor(SignScorer[] models) { ArrayList feList = new ArrayList(models.length); for (int i = 0; i < models.length; i++) { if (models[i] instanceof FeatureExtractor) feList.add((FeatureExtractor)models[i]); } this.featureExtractors = feList.toArray(new FeatureExtractor[feList.size()]); } /** Returns the features for the given sign and completeness flag. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { FeatureVector[] featureVectors = new FeatureVector[featureExtractors.length]; for (int i=0; i < featureExtractors.length; i++) featureVectors[i] = featureExtractors[i].extractFeatures(sign, complete); return new ComposedFeatureVector(featureVectors); } /** Sets the alphabet. 
*/ public void setAlphabet(Alphabet alphabet) { for (FeatureExtractor fe : featureExtractors) { fe.setAlphabet(alphabet); } } } ================================================ FILE: src/opennlp/ccg/perceptron/ComposedFeatureVector.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; /** * Class for composing feature vectors. * Features from the component feature vectors are assumed to be independent. * * @author Michael White * @version $Revision: 1.2 $, $Date: 2011/01/15 17:52:59 $ */ public class ComposedFeatureVector implements FeatureVector { /** The feature vectors. */ public final FeatureVector[] featureVectors; /** Constructor. */ public ComposedFeatureVector(FeatureVector[] featureVectors) { this.featureVectors = featureVectors; } /** Binary constructor. */ public ComposedFeatureVector(FeatureVector featureVector1, FeatureVector featureVector2) { this.featureVectors = new FeatureVector[]{ featureVector1, featureVector2 }; } /** Size. */ public int size() { int retval = 0; for (FeatureVector fv : featureVectors) retval += fv.size(); return retval; } /** Returns an iterator over the entries. */ public Iterator iterator() { if (featureVectors.length == 0) return EMPTY_ITERATOR; return new Iterator() { int i = 0; Iterator it = featureVectors[0].iterator(); public boolean hasNext() { if (it.hasNext()) return true; if (i == featureVectors.length-1) return false; it = featureVectors[++i].iterator(); return hasNext(); } public Alphabet.Feature nextFeature() { return it.nextFeature(); } public Float nextValue() { return it.nextValue(); } }; } } ================================================ FILE: src/opennlp/ccg/perceptron/EventFile.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008-2013 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import java.util.*; import java.util.zip.*; import java.io.*; import opennlp.ccg.synsem.Sign; /** * An abstract representation of an event file, whose syntax is a more readable version * of what's used in the TADM toolkit. A constructor flag controls whether to keep the * events in memory (defaults to false). * * An event file may be given an alphabet, which allows features to be filtered to * just those present in the alphabet, when it's closed; otherwise, the * alphabet is constructed dynamically. * * The concrete syntax of an event file is as follows. * An event file consists of a sequence of blocks. * A block starts with the number of events on a line by itself. * It is followed by each event, one per line. * Each event line has a frequency, followed by the number of feature-value pairs, * then the sequence of feature-value pairs, * where the feature name is a string with no white space. * Each feature can appear only once in an event, and must have a value greater than zero. * You can have events with a zero frequency -- these are used for dispreferred analyses * in ranking tasks such as parse selection or realization ranking. * * An example file appears below. There are two blocks, corresponding to the * parses of two different senses. The first block has two possible parses, * the first of which is correct, while the second block has three possible * parses, where the second one is the correct one. * *
     * <pre>
     * 2
     * 1 2 feat1 1.22 feat2 3
     * 0 3 feat1 1.55 feat3 1 feat4 2.7
     * 3
     * 0 2 feat1 1.44 feat4 2.2
     * 1 1 feat1 1.33
     * 0 2 feat1 1.32 feat4 3.21
     * </pre>
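     * Reading the example above: the first block holds two events, the gold parse (frequency 1)
     * with features feat1=1.22 and feat2=3, and a dispreferred parse (frequency 0) with
     * features feat1=1.55, feat3=1 and feat4=2.7.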
    * * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/11/01 22:26:29 $ */ public class EventFile { /** A block is a list of events. */ public static class Block { /** The list. */ public List events; /** Constructor. */ public Block(List events) { this.events = events; } /** The event with the highest count (first tied if ties). */ public Event best() { Event retval = null; int max = -1; for (Event event : events) { if (event.count > max) { retval = event; max = event.count; } } return retval; } } /** An event is a feature vector with a count. */ public static class Event { /** The count. */ public int count; /** The feature vector. */ public FeatureVector features; /** Constructor. */ public Event(FeatureVector features, int count) { this.features = features; this.count = count; } /** toString. */ public String toString() { return "event: count: " + count + " " + features; } } // the alphabet private Alphabet alphabet; // the file, which can be reopened private File file; // the current reader private Reader reader; // the current tokenizer private StreamTokenizer tokenizer; // the saved blocks, if kept in memory private List blocks = null; // the iterator over saved blocks private Iterator blockIt = null; // whether the end-of-file has been reached private boolean eofReached = false; /** Constructor with filename. */ public EventFile(String filename) throws IOException { this(filename, false); } /** Constructor with filename and in-memory flag. */ public EventFile(String filename, boolean inMemory) throws IOException { this(filename, new Alphabet(10000), inMemory); } /** Constructor with filename and alphabet. */ public EventFile(String filename, Alphabet alphabet) throws IOException { this(filename, alphabet, false); } /** Constructor with filename, alphabet and in-memory flag. */ public EventFile(String filename, Alphabet alphabet, boolean inMemory) throws IOException { file = new File(filename); init(); this.alphabet = alphabet; if (inMemory) this.blocks = new ArrayList(10000); } /** Returns the alphabet. */ public Alphabet getAlphabet() { return alphabet; } /** Closes the reader. */ public void close() throws IOException { reader.close(); } /** Resets the event file for reading again. */ public void reset() throws IOException { close(); init(); } // inits the reader and tokenizer, or // if keeping blocks in memory, resets the iterator private void init() throws IOException { // in-memory case if (blocks != null && eofReached) { blockIt = blocks.iterator(); return; } // degenerate case: keeping blocks in memory but eof not reached if (blocks != null) { // dump saved blocks blocks.clear(); } // regular init reader = openReader(file); tokenizer = initTokenizer(reader); } /** Initializes the given tokenizer to recognize most chars as word chars. */ public static StreamTokenizer initTokenizer(Reader reader) throws IOException { StreamTokenizer tokenizer = new StreamTokenizer(reader); tokenizer.resetSyntax(); tokenizer.wordChars(33, 255); tokenizer.whitespaceChars(0, 32); return tokenizer; } /** Returns whether EOF has been reached. */ public boolean endOfFile() throws IOException { tokenizer.nextToken(); boolean eof = (tokenizer.ttype == StreamTokenizer.TT_EOF); tokenizer.pushBack(); return eof; } /** Reads the next event. Feature are filtered if apropos. 
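 * For instance, the event line "1 2 feat1 1.22 feat2 3" from the class comment yields an
 * Event with count 1 and the two pairs feat1=1.22 and feat2=3; a feature missing from a
 * closed alphabet is silently dropped from the vector.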
*/ private Event nextEvent() throws IOException { tokenizer.nextToken(); int count = Integer.parseInt(tokenizer.sval); tokenizer.nextToken(); int numFeats = Integer.parseInt(tokenizer.sval); FeatureList fv = new FeatureList(numFeats); for (int i=0; i < numFeats; i++) { tokenizer.nextToken(); String feat = tokenizer.sval; tokenizer.nextToken(); float val = Float.parseFloat(tokenizer.sval); Alphabet.Feature f = alphabet.index(feat); if (f != null) fv.add(f, val); } return new Event(fv, count); } /** Reads the next block, or null if none. */ public Block nextBlock() throws IOException { // first check block iterator for in-mem case if (blockIt != null) { return (blockIt.hasNext()) ? blockIt.next() : null; } // otherwise check for eof, noting completion for in-mem case if (endOfFile()) { eofReached = true; return null; } // otherwise parse next block tokenizer.nextToken(); int numEvents = Integer.parseInt(tokenizer.sval); List events = new ArrayList(numEvents); for (int i=0; i < numEvents; i++) { events.add(nextEvent()); } Block retval = new Block(events); // save block with in-mem case if (blocks != null) blocks.add(retval); // done return retval; } /** Returns a reader for the given file, using gzip inflation if the file's name ends with .gz. */ public static Reader openReader(File file) throws IOException { if (file.getName().endsWith(".gz")) return new InputStreamReader(new GZIPInputStream(new FileInputStream(file))); else return new BufferedReader(new FileReader(file)); } /** Returns a printwriter for the given file, using gzip deflation if the file's name ends with .gz. */ public static PrintWriter openWriter(File file) throws IOException { if (file.getName().endsWith(".gz")) return new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(file)))); else return new PrintWriter(new BufferedWriter(new FileWriter(file))); } /** Writes the events for a given list of signs according to the feature extractor and best sign. */ public static void writeEvents(PrintWriter pw, List signs, Sign best, FeatureExtractor fe) throws IOException { Collections.shuffle(signs); pw.println(Integer.toString(signs.size())); for (Sign s : signs) { int count = 0; if (s == best) count = 1; pw.print(count + " "); FeatureVector fvect = fe.extractFeatures(s, true); int numfeats = fvect.size(); pw.print(numfeats + " "); for (FeatureVector.Iterator it = fvect.iterator(); it.hasNext(); ) { pw.print(it.nextFeature().name() + " " + it.nextValue() + " "); } pw.println(); } } } ================================================ FILE: src/opennlp/ccg/perceptron/FeatureExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import opennlp.ccg.synsem.*; /** * Interface for mappings signs to features. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/06/22 04:32:50 $ */ public interface FeatureExtractor { /** Returns the features for the given sign and completeness flag. */ public FeatureVector extractFeatures(Sign sign, boolean complete); /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet); } ================================================ FILE: src/opennlp/ccg/perceptron/FeatureList.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import java.util.*; /** * A feature vector represented by a list of features and a list of values. * Features are assumed to be unique. * * @author Michael White * @version $Revision: 1.2 $, $Date: 2011/10/11 03:20:05 $ */ public class FeatureList implements FeatureVector { @SuppressWarnings("unused") private static final long serialVersionUID = 325935806787566283L; // features private ArrayList features; // values private ArrayList values; /** Default constructor. */ public FeatureList() { features = new ArrayList(); values = new ArrayList(); } /** Constructor with size. */ public FeatureList(int size) { features = new ArrayList(size); values = new ArrayList(size); } /** Constructor from feature vector. */ public FeatureList(FeatureVector fv) { for (Iterator it = fv.iterator(); it.hasNext(); ) add(it.nextFeature(), it.nextValue()); } /** Add feature-value pair. */ public void add(Alphabet.Feature feature, Float value) { features.add(feature); values.add(value); } /** Add feature vector (features assumed distinct). */ public void add(FeatureVector fv) { features.ensureCapacity(size() + fv.size()); values.ensureCapacity(size() + fv.size()); for (Iterator it = fv.iterator(); it.hasNext(); ) add(it.nextFeature(), it.nextValue()); } /** Get feature at index. */ public Alphabet.Feature getFeature(int index) { return features.get(index); } /** Get value at index. */ public Float getValue(int index) { return values.get(index); } /** Size. */ public int size() { return features.size(); } /** Iterator. */ public Iterator iterator() { return new Iterator() { java.util.Iterator itF = features.iterator(); java.util.Iterator itV = values.iterator(); public boolean hasNext() { return itF.hasNext(); } public Alphabet.Feature nextFeature() { return itF.next(); } public Float nextValue() { return itV.next(); } }; } /** toString. 
*/ public String toString() { String retval = "features: "; for (int i=0; i < features.size(); i++) retval += "<" + getFeature(i).name() + "," + getValue(i) + "> "; return retval; } } ================================================ FILE: src/opennlp/ccg/perceptron/FeatureMap.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import java.util.*; /** * A feature vector represented using a map. * * @author Michael White * @version $Revision: 1.9 $, $Date: 2011/10/11 03:20:06 $ */ public class FeatureMap implements FeatureVector { @SuppressWarnings("unused") private static final long serialVersionUID = -5054109887068464041L; // the map private HashMap map; /** Default constructor. */ public FeatureMap() { map = new LinkedHashMap(); } /** Constructor with size. */ public FeatureMap(int size) { map = new LinkedHashMap(size*2); } /** Constructor from feature vector. */ public FeatureMap(FeatureVector fv) { this(fv.size()*4); for (Iterator it = fv.iterator(); it.hasNext(); ) map.put(it.nextFeature(), it.nextValue()); } /** Constructor from two feature vectors. */ public FeatureMap(FeatureVector fv1, FeatureVector fv2) { this((fv1.size()+fv2.size())*3); for (Iterator it = fv1.iterator(); it.hasNext(); ) map.put(it.nextFeature(), it.nextValue()); add(fv2); } /** Increments a feature count. */ public void inc(Alphabet.Feature feature) { float count = 1; Float val = map.get(feature); if (val != null) count = Math.round(val) + 1; map.put(feature, count); } /** Adds to a feature's value (starting with zero). */ public void add(Alphabet.Feature feature, Float value) { Float val = map.get(feature); if (val != null) map.put(feature, val + value); else map.put(feature, value); } /** Adds a feature vector. */ public void add(FeatureVector fv) { for (Iterator it = fv.iterator(); it.hasNext(); ) add(it.nextFeature(), it.nextValue()); } /** Returns the feature's value (zero if not present). */ public float get(Alphabet.Feature feature) { Float retval = map.get(feature); return (retval != null) ? retval : 0; } /** Clears the map. */ public void clear() { map.clear(); } /** Size. */ public int size() { return map.size(); } /** Returns an iterator over the entries. 
*/ public Iterator iterator() { return new Iterator() { java.util.Iterator> it = map.entrySet().iterator(); Map.Entry entry = null; public boolean hasNext() { return it.hasNext(); } public Alphabet.Feature nextFeature() { entry = it.next(); return entry.getKey(); } public Float nextValue() { return entry.getValue(); } }; } } ================================================ FILE: src/opennlp/ccg/perceptron/FeatureVector.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; /** * A sparse feature vector, with size and an iterator over feature-value pairs. * * @author Michael White * @version $Revision: 1.6 $, $Date: 2011/01/15 17:52:59 $ */ public interface FeatureVector { /** * Iterator over feature-value pairs. * Features and values must be retrieved using a pair of calls * to nextFeature and nextValue, otherwise the behavior is not defined. */ public interface Iterator { /** Whether any items remain. */ public boolean hasNext(); /** Returns the next feature. */ public Alphabet.Feature nextFeature(); /** Returns the next value. */ public Float nextValue(); } /** Size. */ public int size(); /** Iterator over feature-value pairs. */ public Iterator iterator(); /** Empty iterator. */ public static Iterator EMPTY_ITERATOR = new Iterator() { public boolean hasNext() { return false; } public Alphabet.Feature nextFeature() { return null; } public Float nextValue() { return null; } }; } ================================================ FILE: src/opennlp/ccg/perceptron/Model.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import java.util.*; import java.io.*; import opennlp.ccg.perceptron.Alphabet.Feature; import opennlp.ccg.util.Pair; /** * A model is a vector of weights for an alphabet. * * A model can be read from a file, which starts with the number * of features on one line, followed by one line per feature pairing * the feature name with its weight. * * A new model with all zero weights can also be created from an alphabet. * * The main routine tests the model on an event file. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2009/11/09 18:54:30 $ */ public class Model { /** Filter interface for adjusting feature weights when loading a model. */ public interface FeatureFilter { /** Returns the modified feature weight for the given feature. */ public double adjustedWeight(String name, double weight); } /** Flag for whether to print debugging info to System.err. */ public boolean debug = false; // weight vector private double[] weights; // alphabet private Alphabet alphabet; /** Constructor with alphabet, for a new model with all zero weights. */ public Model(Alphabet alphabet) { this.alphabet = alphabet; weights = new double[alphabet.size()]; zero(); } /** Constructor to load a model from a file. The alphabet is set to closed. */ public Model(String filename) throws IOException { this(filename, null); } /** * Constructor to load a model from a file using a feature filter. The * alphabet is set to closed. */ public Model(String filename, FeatureFilter filter) throws IOException { Reader reader = EventFile.openReader(new File(filename)); StreamTokenizer tokenizer = EventFile.initTokenizer(reader); tokenizer.nextToken(); int size = Integer.parseInt(tokenizer.sval); alphabet = new Alphabet(size); weights = new double[size]; for (int i = 0; i < size; i++) { tokenizer.nextToken(); String name = tokenizer.sval; tokenizer.nextToken(); double weight = Double.parseDouble(tokenizer.sval); if (filter != null) weight = filter.adjustedWeight(name, weight); alphabet.add(name); weights[i] = weight; } reader.close(); alphabet.setClosed(true); } /** Returns the size of the model. */ public int size() { return weights.length; } /** Returns the alphabet. */ public Alphabet getAlphabet() { return alphabet; } /** Returns the weight for the given index. */ public double getWeight(int index) { return weights[index]; } /** Returns the weight for the given feature. */ public double getWeight(String feat) { return weights[alphabet.index(feat).getIndex()]; } /** Returns the weight for the given feature. */ public double getWeight(Alphabet.Feature f) { return weights[f.getIndex()]; } /** Sets the weight for the given index. */ public void setWeight(int index, double weight) { weights[index] = weight; } /** Sets the weight for the given feature. */ public void setWeight(String feat, double weight) { weights[alphabet.index(feat).getIndex()] = weight; } /** Sets the weight for the given feature. */ public void setWeight(Alphabet.Feature f, double weight) { weights[f.getIndex()] = weight; } /** Returns the dot product of the weights and features. 
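 * That is, score(fv) is the sum over the vector's feature-value pairs (f, v) of weight(f) * v,
 * where pairs whose feature has no index in the alphabet contribute nothing. For an
 * illustrative model with weights featA=0.5 and featB=-2.0, the vector {featA=2, featB=1}
 * scores 0.5*2 + (-2.0)*1 = -1.0.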
*/ public double score(FeatureVector fv) { double retval = 0.0; for (FeatureVector.Iterator it = fv.iterator(); it.hasNext(); ) { Feature feat = it.nextFeature(); Float value = it.nextValue(); Integer index = feat.getIndex(); if (index == null) continue; retval += weights[index] * value; } if (debug) System.err.println("score: " + retval + " " + fv); return retval; } /** Adds the feature vector values to the weights. */ public void add(FeatureVector fv) { for (FeatureVector.Iterator it = fv.iterator(); it.hasNext(); ) { Feature feat = it.nextFeature(); Float value = it.nextValue(); Integer index = feat.getIndex(); if (index == null) continue; weights[index] += value; } } /** Subtracts the feature vector values from the weights. */ public void subtract(FeatureVector fv) { for (FeatureVector.Iterator it = fv.iterator(); it.hasNext(); ) { Feature feat = it.nextFeature(); Float value = it.nextValue(); Integer index = feat.getIndex(); if (index == null) continue; weights[index] -= value; } } /** Adds the given model's weights to this model. The models are assumed to share the same alphabet. */ public void add(Model model) { for (int i=0; i < weights.length; i++) { weights[i] += model.weights[i]; } } /** Multiplies the weights by the given number. */ public void multiply(double num) { for (int i=0; i < weights.length; i++) { weights[i] *= num; } } /** Resets the weights to zero. */ public void zero() { for (int i=0; i < weights.length; i++) { weights[i] = 0.0; } } /** Sets this model's weights to the given model's ones, where the alphabets intersect. */ public void set(Model model) { zero(); for (int i=0; i < model.weights.length; i++) { Alphabet.Feature f = model.alphabet.feature(i); Alphabet.Feature f0 = alphabet.index(f); if (f0 == null || f0.getIndex() == null) continue; weights[f0.getIndex()] = model.weights[i]; } } /** Returns the best event (first tied if ties). */ public FeatureVector best(List fvs) { FeatureVector retval = null; double max = Double.NEGATIVE_INFINITY; for (FeatureVector fv : fvs) { double score = score(fv); if (score > max) { retval = fv; max = score; } } return retval; } /** Returns the best event (first tied if ties). */ public EventFile.Event best(EventFile.Block block) { EventFile.Event retval = null; double max = Double.NEGATIVE_INFINITY; for (EventFile.Event event : block.events) { double score = score(event.features); if (score > max) { retval = event; max = score; } } return retval; } /** Returns the accuracy on the event file. */ public double accuracy(EventFile eventFile) throws IOException { if (alphabet != eventFile.getAlphabet()) { throw new RuntimeException("Model and EventFile must share the same alphabet!"); } int correct = 0; int total = 0; EventFile.Block block; while ( (block = eventFile.nextBlock()) != null ) { total++; if (best(block) == block.best()) { correct++; if (debug) System.err.println("CORRECT"); } else { if (debug) System.err.println("WRONG; best: " + block.best()); } } if (debug) System.err.println("correct: " + correct + " total: " + total); return 1.0 * correct / total; } /** Saves the model to a file, filtering out zero weights. */ public void save(String filename) throws IOException { save(filename, 0.0); } /** Saves the model to a file, filtering out weights whose absolute value does not exceed the pruning value. 
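 * The output uses the same layout that the Model(String) constructor reads back in: the number
 * of surviving features on the first line, then one "name weight" line per feature, sorted by
 * descending absolute weight.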
*/ public void save(String filename, double minPrune) throws IOException { File file = new File(filename); PrintWriter out = EventFile.openWriter(file); // calc pruned size int size = size(); int pruned = 0; for (int i=0; i < size; i++) if (Math.abs(weights[i]) <= minPrune) pruned++; int prunedSize = size - pruned; // write pruned size out.println(Integer.toString(prunedSize)); // collect unpruned weights List> featWeights = new ArrayList>(prunedSize); for (int i=0; i < size; i++) { if (Math.abs(weights[i]) <= minPrune) continue; featWeights.add(new Pair(alphabet.feature(i), weights[i])); } // sort weights by descending absolute value // (further sorting alphabetically may take too long) Collections.sort( featWeights, new Comparator>() { public int compare(Pair entry1, Pair entry2) { double val1 = Math.abs(entry1.b); double val2 = Math.abs(entry2.b); if (val1 > val2) return -1; if (val1 < val2) return 1; return 0; //return entry1.a.name().compareTo(entry2.a.name()); } } ); // write sorted weights for (Pair fw : featWeights) { out.println(fw.a.name() + " " + fw.b); } out.close(); } /** * Loads a model from a file and tests it on the given event file. */ public static void main(String[] args) throws IOException { // help if (args.length < 2) { System.out.println("Usage: java perceptron.Model (-debug)"); System.exit(0); } // args String modelfile = args[0]; String eventfile = args[1]; boolean debug = Arrays.asList(args).contains("-debug"); // load model System.out.println("Loading model from: " + modelfile); Model model = new Model(modelfile); model.debug = debug; System.out.println("model size: " + model.size()); System.out.println("debug: " + debug); // compute accuracy System.out.println("Scoring events in: " + eventfile); EventFile eventFile = new EventFile(eventfile, model.alphabet); double accuracy = model.accuracy(eventFile); System.out.println("accuracy: " + accuracy); } } ================================================ FILE: src/opennlp/ccg/perceptron/PerceptronScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import opennlp.ccg.synsem.*; /** * A sign scorer for a perceptron model. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2011/03/21 20:46:43 $ */ public class PerceptronScorer implements SignScorer { /** The feature extractor. */ public final FeatureExtractor featureExtractor; /** The model. */ public final Model model; /** Constructor. 
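 * A typical (purely illustrative) setup pairs a feature extractor with a trained model file,
 * e.g. new PerceptronScorer(myFeatureExtractor, new Model("parser.model.gz")); the constructor
 * then hands the model's alphabet to the extractor via setAlphabet.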
*/ public PerceptronScorer(FeatureExtractor featureExtractor, Model model) { this.featureExtractor = featureExtractor; this.model = model; featureExtractor.setAlphabet(model.getAlphabet()); } /** * Returns a score for the given sign and completeness flag, where higher * numbers are better than lower numbers. * In particular, returns the score assigned by the model to the features * extracted from the given sign with the given completeness flag. */ public double score(Sign sign, boolean complete) { return model.score(featureExtractor.extractFeatures(sign, complete)); } } ================================================ FILE: src/opennlp/ccg/perceptron/ReRankingPerceptronScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; import opennlp.ccg.synsem.*; /** * A re-ranking sign scorer for a perceptron model. * Note that at present, n-best re-ranking has been found to work better * for parsing, but not for realization, where forest re-ranking (ie using * the perceptron scorer throughout) seems to work better. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2011/03/21 20:46:32 $ */ public abstract class ReRankingPerceptronScorer extends PerceptronScorer implements ReRankingScorer { /** Flag for whether to use the full model. */ protected boolean useFullModel = false; /** Sets the full model flag. */ public void setFullModel(boolean on) { useFullModel = on; } /** The base scorer, for use when the full model is turned off. */ protected SignScorer baseScorer; /** Returns the base scorer, using the given feature extractor if desired. */ abstract protected SignScorer getBaseScorer(FeatureExtractor featureExtractor); /** Constructor that configures the base scorer using getBaseScorer. */ public ReRankingPerceptronScorer(FeatureExtractor featureExtractor, Model model) { super(featureExtractor, model); baseScorer = getBaseScorer(featureExtractor); } /** Scores the sign with the full or base model, according to the full model flag. 
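Since getBaseScorer is called from the constructor above, a concrete subclass only needs to supply a base scorer. A minimal sketch, assuming the ReRankingScorer interface requires nothing beyond what the abstract class already implements; the do-nothing base scorer here is purely illustrative, whereas a real plugin would typically return an n-gram scorer built over the same feature extractor:

    public class MyReRankingScorer extends ReRankingPerceptronScorer {
        public MyReRankingScorer(FeatureExtractor extractor, Model model) {
            super(extractor, model);
        }
        protected SignScorer getBaseScorer(FeatureExtractor extractor) {
            // trivial base model: scores every sign the same
            return new SignScorer() {
                public double score(Sign sign, boolean complete) { return 0.0; }
            };
        }
    }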
*/ public double score(Sign sign, boolean complete) { if (useFullModel) return super.score(sign, complete); else return baseScorer.score(sign, complete); } } ================================================ FILE: src/opennlp/ccg/perceptron/Trainer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008-2013 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.perceptron; //import java.util.*; import java.io.*; /** * Trains an averaged perceptron model by iterating through an event file. * * The main routine trains and saves a model. Options include saving * the final, non-averaged model, and saving the averaged model after * every N iterations. There's also an option to keep the events in * memory for subsequent iterations. * * An alphabet must be supplied as input. The main routine of the * Alphabet class can be used to derive an alphabet from a training * file. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/06/18 23:38:57 $ */ public class Trainer { /** The training file. */ public final String trainfile; /** The alphabet. */ public final Alphabet alphabet; /** The number of training iterations. */ public final int iterations; /** The model file. */ public final String modelfile; /** The in-memory flag. */ public final boolean inMemory; /** The current model. */ public final Model currentModel; /** The averaged model. */ public final Model averagedModel; /** The model for averaging over the current iteration. */ private Model iterationModel; /** The number of iterations to use when saving every N iterations. */ private int saveEveryNth = 0; /** Constructor. */ public Trainer(String trainfile, String alphabetfile, int iterations, String modelfile, boolean inMemory) throws IOException { this.trainfile = trainfile; this.alphabet = new Alphabet(alphabetfile); this.iterations = iterations; this.modelfile = modelfile; this.inMemory = inMemory; this.currentModel = new Model(alphabet); this.averagedModel = new Model(alphabet); this.iterationModel = new Model(alphabet); } /** Initializes the model. */ public void initModel(String initmodelfile) throws IOException { Model model = new Model(initmodelfile); currentModel.set(model); averagedModel.set(model); } /** Train the model, reporting accuracy of the averaged model for each iteration and the final model. 
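Concretely, the averaging in train() below maintains avgModel_i = (i/(i+1)) * avgModel_(i-1) + iterationSum_i / (total * (i+1)), where iterationSum_i is the sum of the per-event snapshots of the current model taken during iteration i, so the averaged model stays equal to the mean of all total * (i+1) snapshots seen so far. For example, with total = 100 events, after the second iteration (i = 1) the previous average (over 100 snapshots) is scaled by 1/2 and the new sum of 100 snapshots by 1/200, giving the mean over all 200 snapshots.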
*/ public void train() throws IOException { // open training file EventFile eventFile = new EventFile(trainfile, alphabet, inMemory); // iterate boolean converged = false; for (int i=0; i < iterations; i++) { System.out.println("iteration: " + i); // reset training file eventFile.reset(); // zero iteration model iterationModel.zero(); // init counters int updates = 0; int correct = 0; int total = 0; EventFile.Block block; // go through training file while ( (block = eventFile.nextBlock()) != null ) { total++; // update if wrong EventFile.Event modelBest = currentModel.best(block); EventFile.Event actualBest = block.best(); if (modelBest != actualBest) { updates++; currentModel.add(actualBest.features); currentModel.subtract(modelBest.features); } // see if averaged model correct EventFile.Event avgModelBest = averagedModel.best(block); if (avgModelBest == actualBest) correct++; // update iteration model iterationModel.add(currentModel); } // divide iteration model by total to yield an average for this iteration, // and divide by iteration number (+1) to yield proportion of this // iteration to averaged model; combine these two steps into one call double denominator = 1.0 * total * (i+1); iterationModel.multiply(1.0 / denominator); // multiply averaged model by i/(i+1) for proportion of previous iterations if (i > 0) { double mult = 1.0 * i / (i+1); averagedModel.multiply(mult); } // updated averaged model averagedModel.add(iterationModel); // report System.out.println("updates: " + updates); System.out.println("avg model correct: " + correct + " total: " + total + " accuracy: " + (1.0 * correct / total)); System.out.println(); if (updates == 0) { System.out.println("converged"); System.out.println(); converged = true; break; } // save every nth if (saveEveryNth > 0 && i < (iterations-1) && i % saveEveryNth == 0) { String nthModelfile = nthFilename(modelfile, i); System.out.println("Saving model to " + nthModelfile); averagedModel.save(nthModelfile); System.out.println(); } } // do one more iteration to compute accuracy if not converged if (!converged) { System.out.println("final iteration: "); // reset training file eventFile.reset(); // init counters int finalCorrect = 0; int correct = 0; int total = 0; EventFile.Block block; // go through training file while ( (block = eventFile.nextBlock()) != null ) { total++; // see if correct EventFile.Event modelBest = currentModel.best(block); EventFile.Event avgModelBest = averagedModel.best(block); EventFile.Event actualBest = block.best(); if (modelBest == actualBest) finalCorrect++; if (avgModelBest == actualBest) correct++; } // report System.out.println("final model correct: " + finalCorrect + " total: " + total + " accuracy: " + (1.0 * finalCorrect / total)); System.out.println("avg model correct: " + correct + " total: " + total + " accuracy: " + (1.0 * correct / total)); System.out.println(); } // close training file eventFile.close(); } /** Returns a filename with .N added before the extension, if any. */ public static String nthFilename(String filename, int N) { int lastdot = filename.lastIndexOf('.'); if (lastdot > 0) return filename.substring(0,lastdot) + "." + N + filename.substring(lastdot); else return filename + "." + N; } /** * Trains an averaged perceptron model from the training file using the alphabet file and the given * number of iterations, saving the file to model file. * The final (non-averaged) model can optionally be saved using the -f option, * and intermediate models can be saved every N iterations using the -n option. 
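Putting the positional arguments and options together (reconstructed from the argument handling in main below), a typical invocation is:

    java perceptron.Trainer <traineventfile> <alphabetfile> <iterations> <modelfile> (-i <initmodelfile>) (-f <finalmodelfile>) (-n <N>) (-in_mem)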
* The -in_mem option keeps the events in memory for subsequent iterations. */ public static void main(String[] args) throws IOException { // help if (args.length < 4) { System.out.println( "Usage: java perceptron.Trainer " + "(-i ) (-f ) (-n ) (-in_mem)" ); System.exit(0); } // args String traineventfile = args[0]; String alphabetfile = args[1]; int iterations = Integer.parseInt(args[2]); String modelfile = args[3]; String initmodelfile = null; String finalmodelfile = null; int saveEveryNth = 0; boolean inMemory = false; for (int i=4; i < args.length; i++) { if (args[i].equals("-i")) initmodelfile = args[++i]; if (args[i].equals("-f")) finalmodelfile = args[++i]; if (args[i].equals("-n")) saveEveryNth = Integer.parseInt(args[++i]); if (args[i].equals("-in_mem")) inMemory = true; } // setup, train System.out.println("Training on " + traineventfile + " using " + alphabetfile + " for " + iterations + " iterations"); if (initmodelfile != null) System.out.println("with " + initmodelfile + " as the initial model"); if (inMemory) System.out.println("keeping events in memory"); System.out.println(); Trainer trainer = new Trainer(traineventfile, alphabetfile, iterations, modelfile, inMemory); if (initmodelfile != null) trainer.initModel(initmodelfile); trainer.saveEveryNth = saveEveryNth; trainer.train(); // save model System.out.println("Saving model to " + modelfile); trainer.averagedModel.save(modelfile); if (finalmodelfile != null) { System.out.println("Saving model to " + finalmodelfile); trainer.currentModel.save(finalmodelfile); } } } ================================================ FILE: src/opennlp/ccg/realize/Chart.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-11 University of Edinburgh / Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.*; import opennlp.ccg.synsem.*; import opennlp.ccg.util.Pair; import opennlp.ccg.ngrams.NgramPrecisionModel; import opennlp.ccg.parse.DerivationHistory; import opennlp.ccg.hylo.*; import java.io.*; import java.util.*; import java.util.prefs.*; import gnu.trove.*; /** * The chart manages the creation of edges. Newly added edges are kept on an * agenda prior to rule applications. In the anytime mode, the chart completion * process can be interrupted according to the given parameters and preferences * (see Chart.combine). In the packing mode, completing the chart produces * a packed representation, which may then be unpacked if the DO_UNPACKING * preference is set. * * A single chart instance per realization request is assumed. 
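A minimal sketch of how a realization request drives this class, assuming the edge factory and pruning strategy are supplied by the surrounding realizer:

    Chart chart = new Chart(edgeFactory, pruningStrategy);
    chart.initialize();                        // seed the agenda with initial edges from lex lookup
    chart.combine(Chart.NO_TIME_LIMIT, false); // combine edges until the agenda is empty
    chart.printBestEdge();                     // print the best (preferably complete) realization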
* * @author Michael White * @version $Revision: 1.79 $, $Date: 2011/08/28 04:05:39 $ */ public class Chart { /** Preference key for time limit on edge combination. */ public static final String TIME_LIMIT = "Time Limit"; /** Preference key for time limit on finding a new best complete realization. If between o-1, the time limit is interpreted as a percentage of the time until the first realization is found. */ public static final String NEW_BEST_TIME_LIMIT = "New Best Time Limit"; /** A constant indicating no time limit on edge combination. */ public static final int NO_TIME_LIMIT = 0; /** Preference key for edge limit on edge combination. */ public static final String EDGE_LIMIT = "Edge Limit"; /** A constant indicating no edge limit on edge combination. */ public static final int NO_EDGE_LIMIT = 0; /** Preference key for pruning the number of signs kept per equivalence class. */ public static final String PRUNING_VALUE = "Pruning Value"; /** Preference key for pruning the number of signs kept per cell. */ public static final String CELL_PRUNING_VALUE = "Cell Pruning Value"; /** A constant indicating no pruning of signs per equivalence class. */ public static final int NO_PRUNING = 0; /** Preference key for whether to collect category combinations. */ public static final String USE_COMBOS = "Use Combos"; /** Preference key for whether to create a packed generation forest in the first of two realization stages (deferring unpacking to the later stage). This option pre-empts the use of collected combos in a single, anytime realization stage. */ public static final String USE_PACKING = "Use Packing"; /** Preference key for whether to unpack a generation forest in the second of two realization stages. This option is only relevant if packing is used in the first stage. */ public static final String DO_UNPACKING = "Do Unpacking"; /** The edge factory for the realization request. */ public final EdgeFactory edgeFactory; /** The pruning strategy. */ public final PruningStrategy pruningStrategy; /** Flag for whether to use depth-first search. Defaults to false. */ public boolean depthFirst = false; /** New best time limit, in ms. Set from prefs in constructor. */ public int newBestTimeLimit = NO_TIME_LIMIT; /** New best time limit, as a percentage of time from the first complete realization. Set from prefs in constructor. */ public double newBestTimeLimitPct = NO_TIME_LIMIT; /** Edge limit. Set from prefs in constructor. */ public int edgeLimit = NO_EDGE_LIMIT; /** Pruning value. Set from prefs in constructor. */ public int pruningValue = NO_PRUNING; /** Cell pruning value. Set from prefs in constructor. */ public int cellPruningValue = NO_PRUNING; /** Flag for whether to collect category combos. Set from prefs in constructor. */ public boolean collectCombos = true; /** Flag for whether to use packing. Set from prefs in constructor. */ public boolean usePacking = false; /** Flag for whether to do unpacking. Set from prefs in constructor. */ public boolean doUnpacking = true; /** Flag for whether to join best fragments if no complete realization found. Defaults to false. */ public boolean joinFragments = false; /** Flag for whether to glue fragments currently. Defaults to false. 
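These settings are read from the user preferences node for the TextCCG package in the constructor below, so they can be adjusted programmatically before a chart is constructed; a sketch with illustrative values:

    Preferences prefs = Preferences.userNodeForPackage(TextCCG.class);
    prefs.putBoolean(Chart.USE_PACKING, true);  // first stage: build a packed forest
    prefs.putBoolean(Chart.DO_UNPACKING, true); // second stage: unpack the forest
    prefs.putInt(Chart.PRUNING_VALUE, 5);       // keep at most 5 signs per equivalence class
    prefs.putInt(Chart.EDGE_LIMIT, 10000);      // stop combining after 10000 edges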
*/ public boolean gluingFragments = false; // the agenda of edges that have yet to be added to the chart private List agenda = new ArrayList(); // the (representative) edges in the chart private List edges = new ArrayList(); // all unpruned (and unpacked, if apropos) edges in the chart private List allEdges = new ArrayList(); // edges to be removed from the chart, after having been superceded // by an edge with an equivalent sign (up to surface words) and // a less complex derivation private List supercededEdgesPendingRemoval = new ArrayList(); // maps signs to edges (w/o optional bits marked as covered) private Map signMap = new IdentityHashMap(); // the edges seen so far private EdgeHash edgeHash = new EdgeHash(); // maps edges to representative edges, according to their // coverage vectors and their cats, sans LFs @SuppressWarnings("unchecked") private Map catMap = new THashMap( new TObjectHashingStrategy() { private static final long serialVersionUID = 1L; public int computeHashCode(Object o) { Edge edge = (Edge) o; return edge.bitset.hashCode() + edge.sign.getCategory().hashCodeNoLF(); } public boolean equals(Object o1, Object o2) { Edge edge1 = (Edge) o1; Edge edge2 = (Edge) o2; return edge1.bitset.equals(edge2.bitset) && edge1.sign.getCategory().equalsNoLF(edge2.sign.getCategory()); } } ); // cell map: based on input coverage vectors private Map cellMap = new HashMap(); // non-empty cells: cells to avoid when gluing fragments private Set nonEmptyCells = null; // reusable bitset for checking non-empty cells private transient BitSet tmpBitSet = new BitSet(); /** * The best edge found so far (or null), * where a complete edge is always given preference * to an incomplete one. */ public Edge bestEdge = null; /** The best edge created by joining fragments, if necessary. */ public Edge bestJoinedEdge = null; /** Whether the realization search has been completed. */ public boolean done = false; /** The number of nominals in the input LF. */ public int numNominals = 0; /** The number of elementary predications in the input LF. */ public int numPreds = 0; /** The number of edges created and added to the agenda. */ public int numEdges = 0; /** The number of pruned edges removed from the chart. */ public int numPrunedRemoved = 0; /** The number of pruned edges never added to the chart. */ public int numPrunedNeverAdded = 0; /** The number of new complete best edges found after the first one. */ public int newBest = 0; /** The maximum number of edges in a cell. */ public int cellMax = 0; /** The time at which realization started. */ protected long startTime = System.currentTimeMillis(); /** The time in ms until lex lookup was completed. */ public int timeTilLex = 0; /** The time in ms until the first complete edge was found. */ public int timeTilFirst = 0; /** The time in ms until the best edge was found. */ public int timeTilBest = 0; /** The time in ms until the search was stopped. */ public int timeTilStopped = 0; /** The time in ms until the packed chart was completed. */ public int timeTilPacked = 0; /** The time in ms until the search was finished. */ public int timeTilDone = 0; /** * Constructor with explicit pruning strategy. */ // * NB: Even with a non-default pruning strategy, it could potentially help // * to set the pruning value to an estimate of the number of // * edges per equivalent category that will be stored. 
public Chart(EdgeFactory edgeFactory, PruningStrategy pruningStrategy) { this.edgeFactory = edgeFactory; this.pruningStrategy = pruningStrategy; Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); newBestTimeLimitPct = prefs.getDouble(NEW_BEST_TIME_LIMIT, NO_TIME_LIMIT); if (newBestTimeLimitPct >= 1) { newBestTimeLimit = (int) newBestTimeLimitPct; newBestTimeLimitPct = NO_TIME_LIMIT; } edgeLimit = prefs.getInt(EDGE_LIMIT, NO_EDGE_LIMIT); pruningValue = prefs.getInt(PRUNING_VALUE, NO_PRUNING); cellPruningValue = prefs.getInt(CELL_PRUNING_VALUE, NO_PRUNING); usePacking = prefs.getBoolean(USE_PACKING, false); collectCombos = !usePacking && prefs.getBoolean(USE_COMBOS, true); doUnpacking = usePacking && prefs.getBoolean(DO_UNPACKING, true); } /** Returns the number of (representative) edges in the chart. */ public int numEdgesInChart() { return edges.size(); } /** Returns the number of unpruned (and unpacked, if apropos) edges in the chart. */ public int numUnprunedEdges() { return allEdges.size(); } //----------------------------------------------------------------- // main algorithm routines /** Initializes the agenda. */ public void initialize() { // record number of nominals numNominals = edgeFactory.nominals.size(); numPreds = edgeFactory.preds.size(); // create various initial edges and add to the agenda for (Edge edge : edgeFactory.createInitialEdges()) addEdgeToAgenda(edge); // record time 'til lex long currentTime = System.currentTimeMillis(); timeTilLex = (int) (currentTime - startTime); } /** Returns whether there were no uncovered lexical or featural preds after lex lookup. */ public boolean noUncoveredPreds() { return !edgeFactory.hasUncoveredPreds; } /** * Reinitializes the agenda for gluing fragments. * A runtime exception is thrown if not in packing mode. */ public void reInitForGluing() { // check packing mode if (!usePacking) throw new RuntimeException("Packing mode required for gluing fragments."); // set flags here and in edge factory gluingFragments = true; edgeFactory.gluingFragments = true; edgeFactory.useIndexing = false; // add opt for uncovered preds, unless already done for relaxed relation matching if (!edgeFactory.useRelaxedRelationMatching) edgeFactory.addLFOptsForUncoveredPreds(); // add opts for rule instances edgeFactory.addLFOptsForRuleInstances(); // record non-empty cells nonEmptyCells = new HashSet(cellMap.keySet()); // add edges back to agenda, for possible gluing for (Edge edge : edges) addEdgeToAgenda(edge); } /** * Adds to the chart by iteratively moving an edge from the agenda to the chart, * creating new edges for the agenda by applying available rules, * while updating the best edge found so far, * until a stopping criterion is reached. * The basic stopping criterion is when the agenda becomes empty, and thus the * search is done. * Otherwise, the search is stopped either when the edge limit (if any) is reached, * or the time limit (if any) is reached, * or the first complete edge is found (if beyond the edge/time limit, and according to the given flag), * or until the new best time limit (if any; anytime case only) beyond the first * complete realization is exceeded. * In the packing case, unpacking is then performed according to the preference setting. 
* In the anytime case, if the collect combos option is set, then the combinatory rules * are only invoked when an edge with a new category is moved to the chart, in which case * any successful combinations are collected in the edge's combos data structure; * if the edge instead has an already seen category, new edges are created as * alternatives to the collected combos in its representative, much as with unpacking. */ public void combine(int timeLimitMS, boolean waitForCompleteEdge) { // until agenda is empty while (!agenda.isEmpty()) { // check for timeout long currentTime = System.currentTimeMillis(); int timeSoFar = (int) (currentTime - startTime); int timeSinceFirst = timeSoFar - timeTilFirst; boolean bestEdgeComplete = (bestEdge != null && bestEdge.complete()); if ( // edge limit case (edgeLimit != NO_EDGE_LIMIT && numEdges > edgeLimit && (!waitForCompleteEdge || bestEdgeComplete)) || // timeout case (timeLimitMS != NO_TIME_LIMIT && timeSoFar > timeLimitMS && (!waitForCompleteEdge || bestEdgeComplete)) || // new best timeout case (anytime only) (!usePacking && bestEdgeComplete && ((newBestTimeLimit != NO_TIME_LIMIT && timeSinceFirst > newBestTimeLimit) || (newBestTimeLimitPct != NO_TIME_LIMIT && (double) timeSinceFirst / timeTilFirst > newBestTimeLimitPct))) ) { // ensure best edge in chart if (!allEdges.contains(bestEdge)) { addEdgeToChart(bestEdge); } // set timing timeTilStopped = timeSoFar; // stop break; } // take edge from agenda Edge next = agenda.remove(0); // add edge to chart boolean actuallyAdded = addEdgeToChart(next); // skip if edge didn't survive pruning (anytime case), // or was folded into an existing edge (packing case) if (!actuallyAdded) { continue; } // otherwise do combos doEdgeCombos(next); } // set done packing time if (usePacking) { long donePackingTime = System.currentTimeMillis(); timeTilPacked = (int) (donePackingTime - startTime); // do unpacking, if apropos if (doUnpacking) doUnpacking(); } // set done flag, timing done = agenda.isEmpty(); if (done) { long endTime = System.currentTimeMillis(); timeTilDone = (int) (endTime - startTime); } // join best fragments, if nec. if (joinFragments && !bestEdge.complete()) joinBestFragments(); } // does binary combinations with all edges in the chart and unary ones too; // when collecting combos, invokes the combinatory rules only when nec.; // prunes superceded edges before returning private void doEdgeCombos(Edge next) { // skip semantically null edges when gluing fragments if (gluingFragments && next.bitset.isEmpty()) return; // when collecting combos ... if (collectCombos) { // existing rep case: just make alt edges from collected combos Edge nextRep = catMap.get(next); if (next != nextRep) { addNewEdges(edgeFactory.createAltEdges(next, nextRep)); // and prune any superceded edges before returning pruneSupercededEdges(); return; } } // otherwise combine edge with those in chart List edgesToUse = (usePacking || collectCombos) ? 
edges : allEdges; for (Edge edge : edgesToUse) { if (edge == next) continue; // skip this edge // skip fragment gluing if semantically null or if result cell non-empty if (gluingFragments) { if (edge.bitset.isEmpty()) continue; tmpBitSet.clear(); tmpBitSet.or(edge.bitset); tmpBitSet.or(next.bitset); if (nonEmptyCells.contains(tmpBitSet)) continue; } // add new combos to agenda addNewEdges(edgeFactory.createNewEdges(edge, next, collectCombos)); } // combine edge via unary rules and with semantically null edges, // adding new edges to the agenda addNewEdges(edgeFactory.createNewEdges(next, collectCombos)); // prune any superceded edges before returning pruneSupercededEdges(); } // adds all new edges to the agenda private void addNewEdges(List newEdges) { for (Edge newEdge : newEdges) { addEdgeToAgenda(newEdge); } } /** Greedily combines best fragments, updating bestJoinedEdge. */ protected void joinBestFragments() { // start with best edge bestJoinedEdge = bestEdge; // greedily find best fragments List fragments = new ArrayList(); BitSet bitset = bestEdge.bitset; while (true) { Edge bestFrag = null; for (Edge edge : allEdges) bestFrag = chooseBestFrag(bitset, bestFrag, edge); for (Edge edge : agenda) bestFrag = chooseBestFrag(bitset, bestFrag, edge); if (bestFrag == null) break; fragments.add(bestFrag); bitset = (BitSet) bitset.clone(); bitset.or(bestFrag.bitset); } // greedily join while (fragments.size() > 0) { Edge nextJoinedEdge = null; Edge nextFrag = null; for (Edge edge : fragments) { Edge joinedEdge = edgeFactory.makeJoinedEdge(bestJoinedEdge, edge); if (nextJoinedEdge == null || nextJoinedEdge.score < joinedEdge.score) { nextJoinedEdge = joinedEdge; nextFrag = edge; } Edge joinedEdgeR = edgeFactory.makeJoinedEdge(edge, bestJoinedEdge); if (nextJoinedEdge.score < joinedEdgeR.score) { nextJoinedEdge = joinedEdgeR; nextFrag = edge; } } bestJoinedEdge = nextJoinedEdge; fragments.remove(nextFrag); } } // returns edge as the new best frag if it doesn't intersect bitset // and has a better completeness or better score with same completeness; // otherwise returns bestFrag private Edge chooseBestFrag(BitSet bitset, Edge bestFrag, Edge edge) { if (edge.bitset.isEmpty() || edge.bitset.intersects(bitset)) return bestFrag; if (bestFrag == null) return edge; if (bestFrag.completeness < edge.completeness) return edge; if (bestFrag.completeness == edge.completeness && bestFrag.score < edge.score) return edge; return bestFrag; } //----------------------------------------------------------------- // unpacking /** Unpack complete edges, if any; otherwise unpack all. 
*/ protected void doUnpacking() { @SuppressWarnings("unchecked") Set unpacked = new THashSet(new TObjectIdentityHashingStrategy()); boolean foundComplete = bestEdge.complete(); // unpack each relevant edge, updating best edge for (Edge edge : edges) { if (foundComplete && !edge.complete()) continue; unpack(edge, unpacked); updateBestEdge(edge.altEdges.get(0)); } } // recursively unpack and prune edge, unless already visited private void unpack(Edge edge, Set unpacked) { if (unpacked.contains(edge)) return; // add to unpacked set unpacked.add(edge); // OR: recursively unpack alts, merging resulting alts EdgeHash merged = new EdgeHash(); if (edge.altEdges == null) { throw new RuntimeException("No alts for: " + edge); } for (Edge alt : edge.altEdges) { // AND: unpack inputs, make alts, add to merged unpackAlt(alt, unpacked, merged); } // sort, rescore and prune List mergedList = new ArrayList(merged.asEdgeSet()); Collections.sort(mergedList, edgeComparator); List prunedEdges = pruningStrategy.pruneEdges(mergedList); numPrunedNeverAdded += prunedEdges.size(); // replace edge's alts, add to unpruned edges edge.altEdges.clear(); edge.altEdges.addAll(mergedList); allEdges.addAll(mergedList); // update signMap (for debugging) for (Edge mergedEdge : mergedList) { if (!signMap.containsKey(mergedEdge.sign)) signMap.put(mergedEdge.sign, mergedEdge); } } // recursively unpack inputs, make alt combos and add to merged private void unpackAlt(Edge alt, Set unpacked, EdgeHash merged) { // first check for opt completed edge if (alt.optCompletes != null) { // recursively unpack input edge Edge inputEdge = alt.optCompletes; unpack(inputEdge, unpacked); // then make and merge alt edges from input alt edges for (Edge inputAlt : inputEdge.altEdges) { Edge edgeToAdd = (inputAlt.sign == alt.sign) ? alt // use this alt for same sign : edgeFactory.makeAltEdge(inputAlt.sign, alt); // otherwise make edge for new alt merged.insert(edgeToAdd); } return; } // otherwise unpack via input signs DerivationHistory history = alt.sign.getDerivationHistory(); Sign[] inputSigns = history.getInputs(); // base case: no inputs if (inputSigns == null) { merged.insert(alt); return; } // otherwise recursively unpack Edge[] inputEdges = new Edge[inputSigns.length]; for (int i = 0; i < inputSigns.length; i++) { inputEdges[i] = signMap.get(inputSigns[i]); // get input edge using signMap unpack(inputEdges[i], unpacked); } // then make edges for new combos, and add to merged (if unseen) Category resultCat = alt.sign.getCategory(); boolean lefthead = (alt.sign.getLexHead() == inputSigns[0].getLexHead()); List altCombos = inputCombos(inputEdges, 0); for (Sign[] combo : altCombos) { Sign lexHead = (lefthead) ? combo[0].getLexHead() : combo[1].getLexHead(); Sign sign = Sign.createDerivedSignWithNewLF(resultCat, combo, history.getRule(), lexHead); Edge edgeToAdd = (sign.equals(alt.sign)) ? 
alt // use this alt for equiv sign : edgeFactory.makeAltEdge(sign, alt); // otherwise make edge for new alt merged.insert(edgeToAdd); } } // returns a list of sign arrays, with each array of length inputEdges.length - i, // representing all combinations of alt signs from i onwards private List inputCombos(Edge[] inputEdges, int index) { Edge edge = inputEdges[index]; // base case, inputEdges[last] if (index == inputEdges.length-1) { List altEdges = edge.altEdges; List retval = new ArrayList(altEdges.size()); for (Edge alt : altEdges) { retval.add(new Sign[] { alt.sign }); } return retval; } // otherwise recurse on index+1 List nextCombos = inputCombos(inputEdges, index+1); // and make new combos List altEdges = edge.altEdges; List retval = new ArrayList(altEdges.size() * nextCombos.size()); for (Edge alt : altEdges) { for (int i = 0; i < nextCombos.size(); i++) { Sign[] nextSigns = nextCombos.get(i); Sign[] newCombo = new Sign[nextSigns.length+1]; newCombo[0] = alt.sign; System.arraycopy(nextSigns, 0, newCombo, 1, nextSigns.length); retval.add(newCombo); } } return retval; } //----------------------------------------------------------------- // best edges (single best is available directly as bestEdge) // cached best edges private transient List bestEdges = null; /** * Returns the best complete edges, sorted by their score and * pruned by the pruning strategy. */ public List bestEdges() { if (bestEdges != null) return bestEdges; bestEdges = new ArrayList(); if (!bestEdge.complete()) return bestEdges; List edgesToUse = (usePacking && !doUnpacking) ? edges : allEdges; for (Edge edge : edgesToUse) { if (edge.complete()) bestEdges.add(edge); } Collections.sort(bestEdges, edgeComparator); pruningStrategy.pruneEdges(bestEdges); return bestEdges; } /** * Returns the oracle best edge for the given string, together * with a flag indicating whether the target string was found. * At present, returns the edge from bestEdges() whose * orthography equals the given string, otherwise the one * with the highest 4-gram precision score (breaking ties * using the list order). * TODO: It would be better to return the forest oracle. */ public Pair oracleBest(String target) { List edges = bestEdges(); // check orth for (Edge edge : edges) { if (edge.getSign().getOrthography().equals(target)) return new Pair(edge, true); } // check 4-gram precision Edge retval = null; double bestScore = 0; NgramPrecisionModel oracle = new NgramPrecisionModel(new String[]{target}); for (Edge edge : edges) { double score = oracle.score(edge.getSign(), true); if (score > bestScore) { retval = edge; bestScore = score; } } // done return new Pair(retval, false); } //----------------------------------------------------------------- // printing routines /** The PrintWriter to use with the printing routines. Default wraps System.out. */ public PrintWriter out = new PrintWriter(System.out); /** Prints the best edge found. */ public void printBestEdge() { printEdge(bestEdge); if (!edgeFactory.labeledNominals.isEmpty()) { try { ByteArrayOutputStream bstr = new ByteArrayOutputStream(); edgeFactory.grammar.serializeXml( bestEdge.sign.getWordsInXml(edgeFactory.labeledNominals), bstr ); out.println(bstr.toString()); } catch (java.io.IOException exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } out.println(bestEdge.sign.getBracketedString()); out.flush(); } /** Prints the best joined edge. 
*/ public void printBestJoinedEdge() { if (bestJoinedEdge == null) return; printEdge(bestJoinedEdge); out.println(bestJoinedEdge.sign.getBracketedString()); out.flush(); } /** Prints the timing (and related) info. */ public void printTiming() { out.println(); if (!usePacking) { if (bestEdge != null && bestEdge.complete()) out.println("time 'til first (ms): " + timeTilFirst); if (bestEdge != null) out.println("time 'til best (ms): " + timeTilBest); if (timeTilStopped != 0) out.println("time 'til stopped (ms): " + timeTilStopped); } else { out.println("time 'til packed (ms): " + timeTilPacked); } if (timeTilDone != 0) out.println("time 'til done (ms): " + timeTilDone); out.println(); out.println("rule apps: " + edgeFactory.ruleApps()); out.println("# edges: " + edges.size()); out.println("# unpruned edges: " + allEdges.size()); if (!usePacking) { out.println("# pruned: " + numPrunedRemoved + " removed, " + numPrunedNeverAdded + " never added"); } if (doUnpacking) { out.println("# pruned: " + numPrunedNeverAdded); } out.println("cell max: " + cellMax); out.flush(); } /** Prints all chart edges, unsorted. */ public void printEdges() { printEdges(false); } /** Prints chart edges unsorted, using the complete edges filter according to the given flag. */ public void printEdges(boolean complete) { printEdges(complete, false); } /** * Prints chart edges using the complete edges filter according to the given flag * and sorting according to the given flag. * In the packing only case, the representative edges are shown, otherwise * the unpruned (and possibly unpacked) edges are shown. */ public void printEdges(boolean complete, boolean sort) { List edgeList = (usePacking && !doUnpacking) ? edges : allEdges; if (sort) { edgeList = new ArrayList(edgeList); Collections.sort(edgeList, edgeComparator); } for (int i=0; i < edgeList.size(); i++) { Edge edge = edgeList.get(i); if (!complete || edge.complete()) { if (!sort) printEdge(edge, i, edgeList); else printEdge(edge); } } out.flush(); } /** * Prints the agenda. */ public void printAgenda() { for (Edge edge : agenda) { printEdge(edge); } out.flush(); } /** * Prints the initial edges. */ public void printInitialEdges() { for (Edge edge : edgeFactory.initialEdges) { printEdge(edge); } out.flush(); } // prints edge with incomplete LF chunks and active alts private void printEdge(Edge edge) { printEdge(edge, -1, null); } // prints also with edge index and derivation, if index non-negative private void printEdge(Edge edge, int index, List edgeList) { String str = ""; if (index >= 0) str += index + ". "; str += edge.toString(); if (edge.incompleteLfChunk != null) { int id = edgeFactory.lfChunks.indexOf(edge.incompleteLfChunk); str += " <[" + id + "]>"; } if (edge.activeLfAlts.size() > 0) str += " "; for (List altSet : edge.activeLfAlts) { for (Alt alt : altSet) str += "?" + alt.altSet + "." 
+ alt.numInSet; } str += edgeDerivation(edge, index, edgeList); out.println(str); // show alts subordinated in packing only case if (usePacking && !doUnpacking && edge.isDisjunctive()) { for (Edge alt : edge.altEdges) { if (alt != edge) out.println(" \\_ " + alt + edgeDerivation(alt, index, edgeList)); } } } // returns derivation, if index non-negative private String edgeDerivation(Edge edge, int index, List edgeList) { if (index < 0) return ""; if (edge.optCompletes != null) { return " (" + edgeList.indexOf(edge.optCompletes) + " optC)"; } DerivationHistory history = edge.sign.getDerivationHistory(); Sign[] inputs = history.getInputs(); if (inputs == null) return " (lex)"; String retval = " ("; for (Sign sign : inputs) { Edge repEdge = signMap.get(sign); if (repEdge != null) retval += edgeList.indexOf(repEdge) + " "; } retval += history.getRule().name() + ")"; return retval; } /** * Prints the licensed, marked initial edges. */ public void printMarkedEdges() { for (Edge edge : edgeFactory.markedEdges) { printEdge(edge); } out.flush(); } /** * Prints the licensed, instantiated purely syntactic (semantically null) edges. */ public void printInstantiatedNoSemEdges() { for (Edge edge : edgeFactory.instantiatedNoSemEdges) { printEdge(edge); } out.flush(); } /** * Prints the licensed, uninstantiated purely syntactic (semantically null) edges. */ public void printNoSemEdges() { for (Edge edge : edgeFactory.noSemEdges) { out.println(edge.toString()); } out.flush(); } /** * Prints the rule instances, with instantiated semantics. */ public void printRuleInstances() { for (Iterator it = edgeFactory.ruleInstances.iterator(); it.hasNext(); ) { out.println(it.next().toString()); } out.flush(); } /** * Prints the LF chunks. */ public void printLfChunks() { List chunks = edgeFactory.lfChunks; for (int i = 0; i < chunks.size(); i++) { BitSet chunk = chunks.get(i); out.println("chunk[" + i + "]: " + Edge.toString(chunk)); } out.flush(); } /** * Prints the LF alternatives. */ public void printLfAlts() { for (List altSet : edgeFactory.lfAlts) { for (Alt alt : altSet) { out.print("alt[" + alt.altSet + "." + alt.numInSet + "]: "); out.println(Edge.toString(alt.bitset)); } } out.flush(); } /** * Prints the LF optional parts. */ public void printLfOpts() { List opts = edgeFactory.lfOpts; for (int i = 0; i < opts.size(); i++) { BitSet opt = opts.get(i); out.println("opt[" + i + "]: " + Edge.toString(opt)); } out.flush(); } /** * Prints the elementary predications. 
*/ public void printEPs() { List preds = edgeFactory.preds; for (int i=0; i < preds.size(); i++) { SatOp lf_i = preds.get(i); out.println("ep[" + i + "]: " + lf_i); } out.flush(); } //----------------------------------------------------------------- // chart management // in the anytime case, first checks signs to see whether an edge // whose sign is equivalent (up to surface words) and which has // an equal or higher score or equal // or lower derivational complexity has been seen already, and drops // the given edge if so (in the packing case, this equivalence check is // performed during unpacking); // if the edge replaces an (essentially) equivalent edge of lower score or higher // derivational complexity, removes the old edge from the agenda // or removes it from its equivalence class and puts it on a list of // superceded edges to be pruned from the chart; // then, in all cases, adds the given edge to the agenda, // and updates the best edge so far, with preference given to completeness, // then sign score private void addEdgeToAgenda(Edge edge) { numEdges++; if (!usePacking) { // update edgeHash, checking for equivalent edge of equal or lower complexity Edge retEdge = edgeHash.insert(edge); boolean actuallyInserted = (retEdge != null); if (!actuallyInserted) { return; } // just drop it // remove old edge, if apropos Edge oldEdge = (retEdge != edge) ? retEdge : null; if (oldEdge != null) { // check agenda first boolean onAgenda = agenda.remove(oldEdge); // if not on agenda, remove from equiv class, if present, // and add to list of superceded edges pending removal // nb: delaying pruning of superceded edges from chart // is nec. to avoid a problem with concurrent access // to allEdges in doEdgeCombos if (!onAgenda) { Edge repEdge = catMap.get(oldEdge); if (repEdge != null) { boolean inChart = repEdge.altEdges.remove(oldEdge); if (inChart) supercededEdgesPendingRemoval.add(oldEdge); } } } } if (depthFirst) { agenda.add(0, edge); } else if (edge.score == 0) { agenda.add(edge); } else { addSorted(agenda, edge); } updateBestEdge(edge); } // update bestEdge wrt given edge, and adjust timing info private void updateBestEdge(Edge edge) { if (bestEdge == null) { bestEdge = edge; long endTime = System.currentTimeMillis(); timeTilBest = (int) (endTime - startTime); if (bestEdge.complete()) timeTilFirst = timeTilBest; return; } if (bestEdge.completeness > edge.completeness) return; if (bestEdge.completeness < edge.completeness) { bestEdge = edge; long endTime = System.currentTimeMillis(); timeTilBest = (int) (endTime - startTime); if (bestEdge.complete()) timeTilFirst = timeTilBest; return; } if (edge.score > bestEdge.score) { bestEdge = edge; long endTime = System.currentTimeMillis(); timeTilBest = (int) (endTime - startTime); if (bestEdge.complete()) newBest++; } } // removes superceded edges from the chart private void pruneSupercededEdges() { for (Edge oldEdge : supercededEdgesPendingRemoval) { allEdges.remove(oldEdge); numPrunedRemoved++; } supercededEdgesPendingRemoval.clear(); } // adds the edge to the chart and makes it a representative edge if it's the // first one added for its equiv class; otherwise it's added as an alternative; // in the anytime case, prunes the edges listed as alts for the representative edge, // and adds the edge to the list of all unpruned edges, if it survives pruning; // returns true if the edge is actually added, and false if it doesn't survive // the pruning (anytime case), or is folded into an existing edge (packing case); // prunes the edge and returns 
false if the cell count is exceeded private boolean addEdgeToChart(Edge edge) { // check cell count if (cellPruningValue != NO_PRUNING && cellCount(edge) >= cellPruningValue) { numPrunedNeverAdded++; return false; } // inc cell count incCellCount(edge); // get representative edge for this edge Edge repEdge = catMap.get(edge); // check for same edge already in chart; pretend it's been added if (edge == repEdge) return true; // if none, make this edge into one, adding it to the chart if (repEdge == null) { edge.initAltEdges(); // nb: could try capacity of pruningValue+1 if (collectCombos) edge.initEdgeCombos(); catMap.put(edge, edge); edges.add(edge); signMap.put(edge.sign, edge); // anytime case: add to all edges list too if (!usePacking) allEdges.add(edge); // and return return true; } // otherwise add edge to alts (sorted) else { addSorted(repEdge.altEdges, edge); // packing case: return false, as edge is simply folded into repEdge if (usePacking) return false; } // anytime case: if not pruning, just add edge to all edges list, and return if (pruningValue == NO_PRUNING) { allEdges.add(edge); signMap.put(edge.sign, edge); // for debugging return true; } // otherwise do pruning List prunedEdges = pruningStrategy.pruneEdges(repEdge.altEdges); boolean edgeItselfPruned = false; for (Edge prunedEdge : prunedEdges) { if (prunedEdge != edge) { allEdges.remove(prunedEdge); numPrunedRemoved++; } else edgeItselfPruned = true; } // add edge to all edges list, if it was not pruned if (!edgeItselfPruned) { allEdges.add(edge); signMap.put(edge.sign, edge); // for debugging return true; } // otherwise false numPrunedNeverAdded++; return false; } // cell count private int cellCount(Edge edge) { Integer count = cellMap.get(edge.bitset); return (count == null) ? 0 : count; } // inc cell count private void incCellCount(Edge edge) { int count = cellCount(edge); cellMap.put(edge.bitset, ++count); if (count > cellMax) cellMax = count; } //----------------------------------------------------------------- // edge sorted insertion and comparison // adds the given edge into the already sorted list, // maintaining the sort order; // when gluing fragments, edges are sorted first by size, // otherwise by score private void addSorted(List list, Edge edge) { Comparator comparator = (gluingFragments) ? edgeSizeComparator : edgeComparator; // do binary search int index = Collections.binarySearch(list, edge, comparator); // check if search found an edge with the same sort pos if (index >= 0) { // if so, advance the index past sort equiv edges while (index < list.size()) { Edge existingEdge = list.get(index); //if (existingEdge.score == edge.score) index++; if (comparator.compare(existingEdge, edge) == 0) index++; else break; } } else { // otherwise, convert index to insertion point index = Math.abs(index) - 1; } // then add edge at index list.add(index, edge); } /** Compares edges based on their relative score, in descending order. */ public static final Comparator edgeComparator = new Comparator() { public int compare(Edge edge1, Edge edge2) { return -1 * Double.compare(edge1.score, edge2.score); } }; /** Compares edges based on their relative size then score, in descending order. 
*/ public static final Comparator edgeSizeComparator = new Comparator() { public int compare(Edge edge1, Edge edge2) { int retval = -1 * Float.compare(edge1.completeness, edge2.completeness); if (retval != 0) return retval; return -1 * Double.compare(edge1.score, edge2.score); } }; } ================================================ FILE: src/opennlp/ccg/realize/DiversityPruningStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.synsem.Sign; import java.util.*; import gnu.trove.*; /** * Abstract n-best edge pruning strategy that keeps edges diversified * according to the notCompellinglyDifferent equivalence relation. * The edges are clustered into a ranked list of equivalence classes, * which are sequentially sampled until the limit n is reached to determine * the edges to keep. * If the singleBestPerGroup flag is set, then a maximum of one edge * per equivalence class is retained. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/12/21 03:27:18 $ */ abstract public class DiversityPruningStrategy extends NBestPruningStrategy { /** Flag for whether to keep only the single best edge among those that are not compellingly different (defaults to false). */ public boolean singleBestPerGroup = false; /** Reusable set of edges to keep. */ protected THashSet keepers = new THashSet(new TObjectIdentityHashingStrategy()); /** Returns true iff the given signs are not compellingly different. */ abstract public boolean notCompellinglyDifferent(Sign sign1, Sign sign2); /** * Returns a (possibly empty) list of edges pruned * from the given ones, which should have equivalent * categories and be sorted by score, from highest to lowest. * In particular, prunes and returns the edges that fall below the N-best * cutoff when the diversity strategy determined by notCompellinglyDifferent * is applied. * If the singleBestPerGroup flag is set, no more than one edge * per group of equivalent ones will be returned. 
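Only notCompellinglyDifferent needs to be supplied by a subclass; a minimal sketch, assuming the no-argument superclass constructor, with a purely illustrative equivalence criterion (surface strings matching up to case):

    public class CaseInsensitiveDiversityPruning extends DiversityPruningStrategy {
        public boolean notCompellinglyDifferent(Sign sign1, Sign sign2) {
            // treat realizations that differ only in capitalization as interchangeable
            return sign1.getOrthography().equalsIgnoreCase(sign2.getOrthography());
        }
    }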
*/ public List pruneEdges(List catEdges) { // clear reusable return list retval.clear(); // ensure pruning enabled if (CAT_PRUNE_VAL == Chart.NO_PRUNING) return retval; // ensure there are edges to prune if (!singleBestPerGroup && catEdges.size() <= CAT_PRUNE_VAL) return retval; // group edges into ranked equivalence classes, // by using a list of lists, preserving order List> groups = new ArrayList>(); for (Edge edge : catEdges) { boolean foundGroup = false; for (int i = 0; i < groups.size(); i++) { List members = groups.get(i); Edge first = members.get(0); if (notCompellinglyDifferent(first.sign, edge.sign)) { members.add(edge); foundGroup = true; break; } } if (!foundGroup) { List members = new ArrayList(); members.add(edge); groups.add(members); } } // add top n to keepers by sequentially visiting groups, // according also to singleBestPerGroup flag keepers.clear(); int counter = 0; int numGroups = groups.size(); while (keepers.size() < CAT_PRUNE_VAL && (!singleBestPerGroup || counter < numGroups)) { int groupNum = counter % numGroups; int indexInGroup = counter / numGroups; List members = groups.get(groupNum); if (indexInGroup < members.size()) { keepers.add(members.get(indexInGroup)); } counter++; } // prune edges not in keepers for (Iterator it = catEdges.iterator(); it.hasNext(); ) { Edge edge = it.next(); if (!keepers.contains(edge)) { retval.add(edge); it.remove(); } } // done return retval; } } ================================================ FILE: src/opennlp/ccg/realize/Edge.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.*; import opennlp.ccg.*; import java.util.*; import java.util.prefs.*; import java.text.*; /** *

    * An edge is a tracker for a sign, ie a sign together * with bitsets representing its coverage of the * input predicates and the semantic indices used, * along with lists of the active LF alts. * It also has a completeness percentage and a score, * as well as its most specific incomplete LF chunk (if any). * Edges are created by an EdgeFactory and managed by a Chart.
    * A representative edge is an edge that represents (stands in for) * other edges with the same category during the chart generation * process. A representative edge has a list of alternative edges, * which are assumed to share the same category; it is considered * disjunctive when there is more than one alternative. * Note that initially a representative edge will be in its list * of alternatives, but it can be removed during pruning. * Finally, for the anytime search, a representative edge can * maintain a collection of successful combinations, to avoid * invoking the combinatory rules multiple times with the same * combinations of categories.
    * * @author Michael White * @version $Revision: 1.32 $, $Date: 2010/08/10 04:10:15 $ */ public class Edge extends Tracker { /** Preference key for showing completeness. */ public static final String SHOW_COMPLETENESS = "Show Completeness"; /** Preference key for showing coverage bitset. */ public static final String SHOW_BITSET = "Show Bitset"; /** The sign. */ protected Sign sign; /** The completeness percentage. */ public final float completeness; /** The edge score. */ public final double score; /** The most specific incomplete LF chunk (if any). */ public final BitSet incompleteLfChunk; /** The alternative edges (none initially). */ protected List altEdges = null; /** The edge combos (none initially). */ protected EdgeCombos edgeCombos = null; /** The edge, if any, that this edge is constructed from by marking optional bits as completed. */ protected Edge optCompletes = null; /** Constructor. */ public Edge(Sign sign, BitSet bitset, BitSet indices, float completeness, double score, List> activeLfAlts, BitSet incompleteLfChunk) { super(bitset, indices, activeLfAlts); this.sign = sign; this.completeness = completeness; this.score = score; this.incompleteLfChunk = incompleteLfChunk; } /** Returns the sign. */ public Sign getSign() { return sign; } /** Returns whether this edge has completeness 1.0, ie, covers all the input preds. */ public boolean complete() { return (completeness == 1.0); } /** * Returns the nominal which is the value of the index feature on the * sign's target cat, or null if none. */ public Nominal getIndexNominal() { return sign.getCategory().getIndexNominal(); } /** * Returns true iff this edge can combine with the given tracker * without violating its LF chunk constraint (if any). * Specifically, returns true when this edge has no incomplete chunk * or the tracker is semantically empty; otherwise, returns true * iff the incomplete chunk intersects with the tracker's * coverage vector. */ public boolean meetsLfChunkConstraints(Tracker tracker) { if (incompleteLfChunk == null || tracker.bitset.isEmpty()) return true; return incompleteLfChunk.intersects(tracker.bitset); } /** Returns whether this edge is a representative. */ public boolean isRepresentative() { return altEdges != null; } /** Returns whether this edge is disjunctive. */ public boolean isDisjunctive() { return altEdges != null && altEdges.size() > 1; } /** Returns the list of alt edges, or the empty list if none. */ public List getAltEdges() { if (altEdges == null) return Collections.emptyList(); else return altEdges; } /** Initializes the alt edges list with a default capacity, adding this edge. */ public void initAltEdges() { initAltEdges(3); } /** Initializes the alt edges list with the given capacity, adding this edge. */ public void initAltEdges(int capacity) { // check uninitialized if (altEdges != null) throw new RuntimeException("Alt edges already initialized!"); altEdges = new ArrayList(capacity); altEdges.add(this); } /** Initializes the edge combos. */ public void initEdgeCombos() { // check representative status if (!isRepresentative()) throw new RuntimeException("Not a representative!"); if (edgeCombos != null) throw new RuntimeException("Edge combos already initialized!"); edgeCombos = new EdgeCombos(); } /** Returns a hash code for this edge. (Alternatives are not considered.) 
*/ public int hashCode() { int retval = sign.hashCode() + 31 * bitset.hashCode() + indices.hashCode(); retval += (int) (31000 * score); return retval; } /** * Returns a hash code for this edge based on the surface words, * ignoring the LF and ignoring the score. */ public int surfaceWordHashCode() { return sign.surfaceWordHashCode(true) + 31 * bitset.hashCode() + indices.hashCode(); } /** Returns whether this edge equals the given object. (Alternatives are not considered.) */ public boolean equals(Object obj) { if (obj == this) return true; if (!(obj instanceof Edge)) return false; Edge edge = (Edge) obj; return bitset.equals(edge.bitset) && indices.equals(edge.indices) && completeness == edge.completeness && score == edge.score && sign.equals(edge.sign); } /** * Returns whether this edge equals the given object based on the surface words, * ignoring the LF and ignoring the score. */ public boolean surfaceWordEquals(Object obj) { if (obj == this) return true; if (!(obj instanceof Edge)) return false; Edge edge = (Edge) obj; return bitset.equals(edge.bitset) && indices.equals(edge.indices) && sign.surfaceWordEquals(edge.sign, true); } /** * Returns a string for the edge in the format * {completeness} [score] orthography :- category {bitset}. */ public String toString() { Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); boolean showCompleteness = prefs.getBoolean(SHOW_COMPLETENESS, false); boolean showBitset = prefs.getBoolean(SHOW_BITSET, false); StringBuffer sbuf = new StringBuffer(); //sbuf.append(indices + " "); if (showCompleteness) { sbuf.append("{" + nf2.format(completeness) + "} "); } if (score >= 0.001 || score == 0.0) { sbuf.append("[" + nf3.format(score) + "] "); } else { sbuf.append("[" + nfE.format(score) + "] "); } sbuf.append(sign.toString()); if (showBitset) { sbuf.append(' ').append(toString(bitset)); } return sbuf.toString(); } // formats to two decimal places private static final NumberFormat nf2 = initNF2(); private static NumberFormat initNF2() { NumberFormat f = NumberFormat.getInstance(); f.setMinimumIntegerDigits(1); f.setMinimumFractionDigits(2); f.setMaximumFractionDigits(2); return f; } // formats to three decimal places private static final NumberFormat nf3 = initNF3(); private static NumberFormat initNF3() { NumberFormat f = NumberFormat.getInstance(); f.setMinimumIntegerDigits(1); f.setMinimumFractionDigits(3); f.setMaximumFractionDigits(3); return f; } // formats to "0.##E0" private static final NumberFormat nfE = new DecimalFormat("0.##E0"); /** Formats bitset compactly, with ranges hyphenated. */ public static String toString(BitSet bitset) { StringBuffer sbuf = new StringBuffer(); sbuf.append('{'); int j = 0; for (int i = bitset.nextSetBit(0); i >= 0; i = bitset.nextSetBit(j+1)) { if (j != 0) sbuf.append(','); j = bitset.nextClearBit(i); if (j == i+1) sbuf.append(Integer.toString(i)); else sbuf.append(i + "-" + (j-1)); } sbuf.append('}'); return sbuf.toString(); } } ================================================ FILE: src/opennlp/ccg/realize/EdgeCombos.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import java.util.*; /** * EdgeCombos is a simple data structure that allows a representative edge * to collect the info about successful combinations of the edge (with * other edges or unary rules) that is needed to create analogous new result edges * for edges that share the same category, without having to invoke * the combinatory rules. * * @author Michael White * @version $Revision: 1.3 $, $Date: 2005/10/13 18:20:30 $ */ public class EdgeCombos { /** Info for a collected category combination. */ public class CatCombo { /** The (other) input edge for the category combo. */ public final Edge inputEdge; /** The result edge, to use in making alternative edges. */ public final Edge resultEdge; /** Constructor */ public CatCombo(Edge inputEdge, Edge resultEdge) { this.inputEdge = inputEdge; this.resultEdge = resultEdge; } } /** The rightward combos. */ public final List rightwardCombos = new ArrayList(5); /** The leftward combos. */ public final List leftwardCombos = new ArrayList(5); /** The unary results. */ public final List unaryResults = new ArrayList(3); /** The optional results, ie with optional parts marked as completed. */ public final List optionalResults = new ArrayList(3); /** Adds a rightward combo. */ public void addRightwardCombo(Edge inputEdge, Edge resultEdge) { rightwardCombos.add(new CatCombo(inputEdge, resultEdge)); } /** Adds a leftward combo. */ public void addLeftwardCombo(Edge inputEdge, Edge resultEdge) { leftwardCombos.add(new CatCombo(inputEdge, resultEdge)); } } ================================================ FILE: src/opennlp/ccg/realize/EdgeFactory.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-11 University of Edinburgh / Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.*; import opennlp.ccg.lexicon.*; import opennlp.ccg.grammar.*; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.*; import opennlp.ccg.util.*; import gnu.trove.*; import java.util.*; import java.util.prefs.*; /** * The EdgeFactory is responsible for creating edges. 
* A single edge factory instance per realization request is assumed. * * @author Michael White * @version $Revision: 1.87 $, $Date: 2011/10/30 21:06:47 $ */ public class EdgeFactory { /** Preference key for whether to use indexing to filter edges to combine. */ public static final String USE_INDEXING = "Use Indexing"; /** Preference key for whether to (exceptionally) allow categories with no target cat index nominal to combine. */ public static final String ALLOW_MISSING_INDEX_COMBOS = "Allow Missing Index Combos"; /** Preference key for whether to use LF chunks to filter edges to combine. */ public static final String USE_CHUNKS = "Use Chunks"; /** Preference key for whether to use feature licensing; if false, the simple lex feature is used for comparison purposes. */ public static final String USE_FEATURE_LICENSING = "Use Feature Licensing"; /** The grammar used to create edges. */ public final Grammar grammar; /** The elementary predications to be covered. */ public final List preds; /** The sign scorer. */ public final SignScorer signScorer; /** The hypertagger. */ public final Hypertagger hypertagger; /** The initial, unmarked edges instantiated after lexical lookup. */ public final List initialEdges = new ArrayList(); /** The marked initial edges licensed by features in the other initial edges. */ public final List markedEdges = new ArrayList(); /** The licensed and instantiated purely syntactic (semantically null) edges. */ public final List instantiatedNoSemEdges = new ArrayList(); /** The licensed, uninstantiated purely syntactic edges. */ public final List noSemEdges = new ArrayList(); /** The rule instances, ie the type changing rules with instantiated semantics. */ public final List ruleInstances = new ArrayList(); /** The LF chunks, represented as bitsets. */ public final List lfChunks = new ArrayList(); /** The LF alts, represented as a list of lists of alts (where each list of alts forms an exclusive disjunction). */ public final List> lfAlts = new ArrayList>(); /** The LF optional parts, represented as bitsets. */ public final List lfOpts = new ArrayList(); /** Flag indicating whether there are any LF alts or optional parts. */ public boolean hasLfAltsOrOpts = false; // a bitset for all preds private final BitSet allPreds; // the lexicon used to create edges private final Lexicon lexicon; // general rules, ie the ones with no associated semantics private final RuleGroup generalRules; // rule group for rules wrapped by rule instances private final RuleGroup ruleInstancesGroup; // rule for joining fragments private final FragmentJoining fragmentRule = new FragmentJoining(); // helper class for licensing features private final FeatureLicenser featureLicenser; /** Set of nominals whose phrases are marked for labeling in the output (with mark=+). */ public final Set labeledNominals = new HashSet(); /** Map from nominals to ints, for indexing edges. */ final TObjectIntHashMap nominals = new TObjectIntHashMap(); // indexes the preds by their position, // by mapping pred keys to a list of pred indices for that key private final Map> predMap = new HashMap>(); // list of paired nominals in the input LF private final List pairedNominals = new ArrayList(); // flag for whether there are any paired nominals private boolean anyPairedNominals = false; /** Set of nominals under a BoundVar relation. */ final Set boundVarNominals = new HashSet(); // list of nominals for a particular cat or pair of cats private final List catNominals = new ArrayList(); /** * Flag for whether to use indexing. 
* Setting retrieved from preferences; turned off when gluing fragments. */ public boolean useIndexing = true; // flag for whether to (exceptionally) allow categories with no target cat index nominal to combine private boolean allowMissingIndexCombos = false; // flag for whether to use chunks private boolean useChunks = true; // flag for whether to use feature licensing private boolean useFeatureLicensing = true; /** * Flag for whether to debug category instantiation (defaults to false). * If true, cases of complex categories whose outermost category * is not instantiated with index nominals are reported to * System.err. Note that realization is more efficient if such * categories can be avoided in the grammar. * Uncovered EPs after lex lookup are also reported to System.err. */ public boolean debugInstantiation = false; /* The number of unary rule applications executed. */ private int unaryRuleApps = 0; /* The number of unary rule instance applications executed. */ private int unaryRuleInstApps = 0; /* The number of binary rule applications executed. */ private int binaryRuleApps = 0; /** Flag for whether to glue fragments currently. Defaults to false. */ public boolean gluingFragments = false; /** Bit vector for EPs not covered by a lexical edge or rule instance; null if none. */ protected BitSet uncoveredEPs = null; /** Flag indicating whether any lexical or featural EPs are uncovered. */ public boolean hasUncoveredPreds = false; /** Flag for whether to use relaxed relation matching. */ // XXX tmp switch protected boolean useRelaxedRelationMatching = Boolean.getBoolean("useRelaxedRelationMatching"); /** Constructor. */ public EdgeFactory(Grammar grammar, List preds, SignScorer signScorer) { this(grammar, preds, signScorer, null); } /** Constructor with hypertagger. */ public EdgeFactory(Grammar grammar, List preds, SignScorer signScorer, Hypertagger hypertagger) { this.grammar = grammar; this.preds = preds; this.signScorer = signScorer; this.hypertagger = hypertagger; lexicon = grammar.lexicon; generalRules = new RuleGroup(grammar); generalRules.borrowSupercatRuleCombos(grammar.rules); ruleInstancesGroup = new RuleGroup(grammar); ruleInstancesGroup.borrowSupercatRuleCombos(grammar.rules); allPreds = new BitSet(preds.size()); allPreds.set(0, preds.size()); Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); useIndexing = prefs.getBoolean(USE_INDEXING, true); allowMissingIndexCombos = prefs.getBoolean(ALLOW_MISSING_INDEX_COMBOS,false); useChunks = prefs.getBoolean(USE_CHUNKS, true); useFeatureLicensing = prefs.getBoolean(USE_FEATURE_LICENSING, true); if (useFeatureLicensing) { featureLicenser = new FeatureLicenser(this); } else { // if feature licensing off, use simple lex feature for comparison purposes featureLicenser = new FeatureLicenser( this, new LicensingFeature[] { LicensingFeature.simpleLexFeature } ); } UnifyControl.startUnifySequence(); extractLabeledNominals(); indexPreds(); listNominals(); listPairedNominals(); addBoundVarNominals(); fillLfChunks(); fillLfAlts(); fillLfOpts(); hasLfAltsOrOpts = lfAlts.size() > 0 || lfOpts.size() > 0; if (hypertagger != null) hypertagger.mapPreds(preds); } /** * Adds LF optional parts for any preds not covered by a lex item or rule instance, * divided up by chunks, for use with fragment gluing or relaxed relation matching. 
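 * For example (the EP numbers are hypothetical): with uncovered EPs {2,3,5}
 * and LF chunks {2,3} and {4,5}, the optional parts {2,3} (the intersection
 * with the first chunk) and {5} (the remainder) would be added, so that
 * realization can still complete without covering those EPs.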
*/ public void addLFOptsForUncoveredPreds() { // see if all preds covered if (uncoveredEPs == null) return; // clone what's not covered by lex items and rule instances BitSet opt = (BitSet) uncoveredEPs.clone(); // otherwise add opts for what's missing // nb: need to split up by chunks for (BitSet chunk : lfChunks) { if (opt.intersects(chunk)) { BitSet optChunk = (BitSet) opt.clone(); optChunk.and(chunk); if (!lfOpts.contains(optChunk)) lfOpts.add(optChunk); opt.andNot(optChunk); } } // add anything remaining if (!opt.isEmpty() && !lfOpts.contains(opt)) lfOpts.add(opt); // ensure hasLfAltsOrOpts set hasLfAltsOrOpts = true; // TODO deal with lf alts too (may require sorting chunks and alts by size) } /** * Adds an LF optional part for each instantiated rule instance, * for use with fragment gluing. */ public void addLFOptsForRuleInstances() { // do each rule instance for (RuleInstance ruleInstance : ruleInstances) { BitSet opt = (BitSet) ruleInstance.bitset.clone(); lfOpts.add(opt); } // ensure hasLfAltsOrOpts set if (lfOpts.size() > 0) hasLfAltsOrOpts = true; } // returns the uncovered preds, or null if none private BitSet uncoveredPreds() { // determine what's covered by lex items and rule instances BitSet retval = new BitSet(preds.size()); for (Edge edge : initialEdges) retval.or(edge.bitset); for (Edge edge : markedEdges) retval.or(edge.bitset); for (RuleInstance ruleInstance : ruleInstances) retval.or(ruleInstance.bitset); // see if all preds covered if (retval.equals(allPreds)) return null; // otherwise xor what's missing retval.xor(allPreds); // set missing lex preds flag for (int i = retval.nextSetBit(0); i >= 0; i = retval.nextSetBit(i+1)) { SatOp pred = preds.get(i); if (HyloHelper.isLexPred(pred) || HyloHelper.isAttrPred(pred)) { hasUncoveredPreds = true; break; } } // return return retval; } //----------------------------------------------------------------- // edge construction // /** Makes an edge, computing the completeness percentage, sign score, and indices, and setting the most specific incomplete LF chunk (if any). */ protected Edge makeEdge(Sign sign, BitSet bitset, List> activeLfAlts) { BitSet indices = getIndices(sign.getCategory(), null); float completeness = bitset.cardinality() / (float) preds.size(); boolean complete = (completeness == 1.0); double score = signScorer.score(sign, complete); BitSet incompleteLfChunk = getIncompleteLfChunk(bitset, activeLfAlts); return new Edge(sign, bitset, indices, completeness, score, activeLfAlts, incompleteLfChunk); } /** Makes an edge for the given alt sign from the given edge, after computing the sign's score. */ protected Edge makeAltEdge(Sign altSign, Edge edge) { double score = signScorer.score(altSign, edge.complete()); return new Edge( altSign, edge.bitset, edge.indices, edge.completeness, score, edge.activeLfAlts, edge.incompleteLfChunk ); } /** Makes an edge consisting of two joined fragments. 
*/ public Edge makeJoinedEdge(Edge edge1, Edge edge2) { Sign sign = fragmentRule.applyRule(edge1.sign, edge2.sign); BitSet bitset = (BitSet) edge1.bitset.clone(); bitset.or(edge2.bitset); float completeness = bitset.cardinality() / (float) preds.size(); boolean complete = (completeness == 1.0); double score = signScorer.score(sign, complete); return new Edge( sign, bitset, edge1.indices, completeness, score, edge1.activeLfAlts, edge1.incompleteLfChunk ); } //----------------------------------------------------------------- // active alts // /** From the given LF alts, returns the active ones for the given bitset, updating the bitset for any completely covered alts. NB: If the given LF alts list is not the entire list, each alt is assumed to intersect. */ private List> getActiveLfAlts(List> fromLfAlts, BitSet bitset) { if (fromLfAlts.isEmpty()) return fromLfAlts; boolean checkingAllAlts = (fromLfAlts == lfAlts); BitSet tmpBitSet = new BitSet(bitset.size()); List> retval = new ArrayList>(fromLfAlts.size()); // check each 'from' alt for (List altSet : fromLfAlts) { List activeAltSet = null; // for collecting active alts boolean foundCoveredAlt = false; for (Alt alt : altSet) { // if checking all alts, check intersection with alt if (!checkingAllAlts || alt.bitset.intersects(bitset)) { // check whether alt completely covered tmpBitSet.clear(); tmpBitSet.or(bitset); tmpBitSet.and(alt.bitset); if (tmpBitSet.equals(alt.bitset)) { foundCoveredAlt = true; break; } else { // otherwise add to active alts if (activeAltSet == null) activeAltSet = new ArrayList(3); activeAltSet.add(alt); } } } if (foundCoveredAlt) { // update coverage bitset to include all alts in this set List fullAltSet = lfAlts.get(altSet.get(0).altSet); for (Alt alt : fullAltSet) bitset.or(alt.bitset); } else { // otherwise update active alts, if any if (activeAltSet != null) retval.add(activeAltSet); } } return retval; } /** Returns the active LF alts that result from combining the given ones, or null if these are incompatible. For alt sets in common, the combined alts consist of the intersection of these alt sets, or null if this intersection is empty. For alts sets not in common, the active alts are carried through unchanged. 
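 * By way of illustration (the alt names are hypothetical): if one edge's
 * active alts for alt set 0 are {a1, a2} and the other's are {a2, a3}, the
 * combined alts for that set are {a2}; if they were instead {a1} and {a3},
 * the empty intersection means the edges are incompatible and null is
 * returned.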
*/ private List> getCombinedLfAlts(List> activeLfAlts1, List> activeLfAlts2) { if (activeLfAlts1.isEmpty()) return activeLfAlts2; if (activeLfAlts2.isEmpty()) return activeLfAlts1; List> retval = new ArrayList>(activeLfAlts1.size() + activeLfAlts2.size()); Iterator> it1 = activeLfAlts1.iterator(); Iterator> it2 = activeLfAlts2.iterator(); List altSet1 = it1.next(); List altSet2 = it2.next(); for (int i = 0; i < lfAlts.size(); i++) { // inc to alt set i, if not yet there (or beyond) if (altSet1.get(0).altSet < i && it1.hasNext()) altSet1 = it1.next(); if (altSet2.get(0).altSet < i && it2.hasNext()) altSet2 = it2.next(); // check whether only one or the other has alt set i if (altSet1.get(0).altSet == i && altSet2.get(0).altSet != i) retval.add(altSet1); else if (altSet2.get(0).altSet == i && altSet1.get(0).altSet != i) retval.add(altSet2); else if (altSet1.get(0).altSet == i && altSet2.get(0).altSet == i) { // take intersection List combined = new ArrayList(Math.min(altSet1.size(),altSet2.size())); for (Alt alt : altSet1) { if (altSet2.contains(alt)) combined.add(alt); } // check for empty intersection, returning null if (combined.isEmpty()) return null; // otherwise add combined list retval.add(combined); } } return retval; } //----------------------------------------------------------------- // misc bookkeeping // // extracts the nominal atoms marked for labeling in the output private void extractLabeledNominals() { for (Iterator it = preds.iterator(); it.hasNext(); ) { SatOp pred = it.next(); if (!HyloHelper.isAttrPred(pred)) continue; Nominal nom1 = HyloHelper.getPrincipalNominal(pred); if (!(nom1 instanceof NominalAtom)) continue; String rel = HyloHelper.getRel(pred); if (rel == null || !rel.equals("mark")) continue; labeledNominals.add(nom1); it.remove(); } } // lists the nominals in the preds private void listNominals() { for (SatOp pred : preds) { Nominal nom1 = HyloHelper.getPrincipalNominal(pred); Nominal nom2 = HyloHelper.getSecondaryNominal(pred); if (nom1 instanceof NominalAtom && !nominals.containsKey(nom1)) { nominals.put(nom1, nominals.size()); } if (nom2 instanceof NominalAtom && !nominals.containsKey(nom2)) { nominals.put(nom2, nominals.size()); } } } // create bitset for cat indices private BitSet getIndices(Category cat, Category cat2) { catNominals.clear(); cat.forall(gatherIndices); if (cat2 != null) { cat2.forall(gatherIndices); } BitSet retval = new BitSet(nominals.size()); for (Iterator it = catNominals.iterator(); it.hasNext(); ) { Object nom = it.next(); int index = nominals.get(nom); retval.set(index); } return retval; } // check for uninstantiated outer args; if found, // set the indices to allow all combos, and issue // a warning if the debugInstantiation flag is set private void checkInstantiation(List edges) { for (int i = 0; i < edges.size(); i++) { Edge edge = edges.get(i); if (noSemEdges.contains(edge)) continue; // exempt uninstantiated no sem edges if (outerArgUninstantiated(edge.sign.getCategory())) { edge.indices.set(0, nominals.size()); if (debugInstantiation) { System.err.println("Warning: outer arg uninstantiated: " + edge.sign); } } } } // returns whether the outermost arg is not instantiated private boolean outerArgUninstantiated(Category cat) { if (!(cat instanceof ComplexCat)) return false; Arg outer = ((ComplexCat)cat).getOuterArg(); catNominals.clear(); outer.forall(gatherIndices); return catNominals.isEmpty(); } // gathers values of index feature in atomic cats private CategoryFcn gatherIndices = new CategoryFcnAdapter() { public void 
forall(Category c) { if (!(c instanceof AtomCat)) return; FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; addCatNominal(fs.getValue("index")); addCatNominal(fs.getValue("mod-index")); } }; // adds a nominal atom to catNominals private void addCatNominal(Object indexVal) { if (indexVal instanceof NominalAtom) { if (!catNominals.contains(indexVal)) { catNominals.add(indexVal); } } } // lists the paired nominals in the input LF, as a bitset pair private void listPairedNominals() { for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); if (!"tup".equals(HyloHelper.getLexPred(pred))) continue; Nominal tupNom = HyloHelper.getPrincipalNominal(pred); Nominal nom1 = null; Nominal nom2 = null; for (int j = i+1; j < preds.size(); j++) { SatOp predJ = preds.get(j); if (!tupNom.equals(HyloHelper.getPrincipalNominal(predJ))) break; if ("Item1".equals(HyloHelper.getRel(predJ))) { nom1 = HyloHelper.getSecondaryNominal(predJ); } if ("Item2".equals(HyloHelper.getRel(predJ))) { nom2 = HyloHelper.getSecondaryNominal(predJ); } } if (nom1 == null || nom2 == null) { System.err.println("Warning, couldn't find paired nominals for tuple: " + tupNom); continue; } if (!(nom1 instanceof NominalAtom)) continue; if (!(nom2 instanceof NominalAtom)) continue; BitSet[] pair = new BitSet[2]; pair[0] = new BitSet(nominals.size()); pair[0].set(nominals.get(nom1)); pair[1] = new BitSet(nominals.size()); pair[1].set(nominals.get(nom2)); pairedNominals.add(pair); anyPairedNominals = true; } } // adds the bound var nominals private void addBoundVarNominals() { for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); String rel = HyloHelper.getRel(pred); if (rel == null || !rel.equals("BoundVar")) continue; Nominal nom2 = HyloHelper.getSecondaryNominal(pred); if (!(nom2 instanceof NominalAtom)) continue; boundVarNominals.add(nom2); // check if nom2 is a tuple for (int j = 0; j < preds.size(); j++) { SatOp predJ = preds.get(j); if (!nom2.equals(HyloHelper.getPrincipalNominal(predJ))) continue; if (!"tup".equals(HyloHelper.getLexPred(predJ))) continue; // if so, add paired items as bound vars too for (int k = j+1; k < preds.size(); k++) { SatOp predK = preds.get(k); if (!nom2.equals(HyloHelper.getPrincipalNominal(predK))) break; String relK = HyloHelper.getRel(predK); if ("Item1".equals(relK) || "Item2".equals(relK)) { Nominal nom2K = HyloHelper.getSecondaryNominal(predK); if (!(nom2K instanceof NominalAtom)) continue; boundVarNominals.add(nom2K); } } } } } /** * Returns whether the indices for the two edges are paired in the input LF. 
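 * For instance (the nominal names are hypothetical): if the input LF contains
 * a tuple nominal t with lexical predicate "tup" and relations Item1 to x and
 * Item2 to y, then an edge whose indices are exactly {x} is paired with an
 * edge whose indices are exactly {y}, allowing the two to combine even though
 * their index sets do not intersect.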
*/ public boolean pairedWith(Edge edgeA, Edge edgeB) { for (int i = 0; i < pairedNominals.size(); i++) { BitSet[] pair = pairedNominals.get(i); if (pair[0].equals(edgeA.indices) && pair[1].equals(edgeB.indices)) { return true; } } return false; } // indexes the preds by their position into predMap private void indexPreds() { for (int i=0; i < preds.size(); i++) { String[] keys = predKeys(preds.get(i)); for (int j=0; j < keys.length; j++) { List indices = predMap.get(keys[j]); if (indices == null) { indices = new ArrayList(1); predMap.put(keys[j], indices); } indices.add(i); } } } // returns 0-2 keys for the given pred (where nom vars are skipped): // a lex pred is indexed by atom(pred) // a rel pred is indexed by atom and atom2 // an attr pred is indexed by atom private static String[] predKeys(LF pred) { Nominal nom = HyloHelper.getPrincipalNominal(pred); String lexPred = HyloHelper.getLexPred(pred); String rel = HyloHelper.getRel(pred); Nominal nom2 = HyloHelper.getSecondaryNominal(pred); List keys = new ArrayList(2); if (nom instanceof NominalAtom && lexPred != null) keys.add(nom.toString() + "(" + lexPred + ")"); if (nom instanceof NominalAtom && rel != null) keys.add(nom.toString() + "<" + rel + ">"); if (nom2 instanceof NominalAtom && rel != null) keys.add("<" + rel + ">" + nom2.toString()); return (String[]) keys.toArray(new String[keys.size()]); } // fills in the LF chunks list with the chunks for each pred, // then sorts them by specificity, from most to least private void fillLfChunks() { // for each pred, fill in chunks for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); TIntArrayList chunks = pred.getChunks(); if (chunks == null) continue; // for each chunk that this pred is part of for (int j = 0; j < chunks.size(); j++) { int chunkId = chunks.get(j); // ensure chunk bitset exists while (lfChunks.size() < (chunkId + 1)) { lfChunks.add(new BitSet(preds.size())); } // update chunk bitset BitSet chunk = lfChunks.get(chunkId); chunk.set(i); } } // do insertion sort, to ensure subset ordering List tmpList = new ArrayList(lfChunks); lfChunks.clear(); allChunks: for (BitSet chunk : tmpList) { for (int i=0; i < lfChunks.size(); i++) { BitSet sortedChunk = lfChunks.get(i); if (subset(chunk, sortedChunk)) { lfChunks.add(i, chunk); continue allChunks; } } lfChunks.add(chunk); } } // gets the most specific incomplete chunk for an edge, or null private BitSet getIncompleteLfChunk(BitSet bitset, List> activeLfAlts) { // check each chunk allChunks: for (BitSet lfChunk : lfChunks) { // for intersection if (!lfChunk.intersects(bitset)) { continue; } // for incomplete coverage if (!subset(lfChunk, bitset) && subset(bitset, lfChunk)) { // and for part of all active alts for (List altSet : activeLfAlts) { for (Alt alt : altSet) { if (!subset(lfChunk, alt.bitset)) continue allChunks; } } // return chunk return lfChunk; } } // otherwise null return null; } // returns true iff bitset1 is a subset of bitset2 private BitSet tmpBitSet = new BitSet(); private boolean subset(BitSet bitset1, BitSet bitset2) { tmpBitSet.clear(); tmpBitSet.or(bitset1); tmpBitSet.andNot(bitset2); return tmpBitSet.isEmpty(); } // returns true iff combining the edges would complete a chunk private boolean completesChunk(Edge edgeA, Edge edgeB) { if (edgeA.incompleteLfChunk != null) { tmpBitSet.clear(); tmpBitSet.or(edgeA.incompleteLfChunk); tmpBitSet.andNot(edgeA.bitset); tmpBitSet.andNot(edgeB.bitset); if (tmpBitSet.isEmpty()) return true; } if (edgeB.incompleteLfChunk != null) { tmpBitSet.clear(); 
tmpBitSet.or(edgeB.incompleteLfChunk); tmpBitSet.andNot(edgeA.bitset); tmpBitSet.andNot(edgeB.bitset); if (tmpBitSet.isEmpty()) return true; } return false; } // fills in the LF alts list with the alts for each pred private void fillLfAlts() { // for each pred for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); List alts = pred.getAlts(); if (alts == null) continue; // for each alt that this pred is part of for (Alt alt : alts) { // ensure list for alt set exists while (lfAlts.size() < (alt.altSet + 1)) { lfAlts.add(null); } List altSet = lfAlts.get(alt.altSet); if (altSet == null) { altSet = new ArrayList(5); lfAlts.set(alt.altSet, altSet); } // ensure list item for alt num is alt while (altSet.size() < (alt.numInSet + 1)) { altSet.add(null); } altSet.set(alt.numInSet, alt); // update alt bitset alt.bitset.set(i); } } } // fills in the LF opts list with the opts for each pred private void fillLfOpts() { // for each pred for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); TIntArrayList opts = pred.getOpts(); if (opts == null) continue; // for each opt that this pred is part of for (int j = 0; j < opts.size(); j++) { int optId = opts.get(j); // ensure opt bitset exists while (lfOpts.size() < (optId + 1)) { lfOpts.add(new BitSet(preds.size())); } // update opt bitset BitSet opt = lfOpts.get(optId); opt.set(i); } } } // returns the list of coart rels for the pred with the given index // NB: assumes that preds are sorted by their principal nominals, with the lex pred first private List getCoartRels(int predIndex) { SatOp pred = preds.get(predIndex); Nominal nom = HyloHelper.getPrincipalNominal(pred); List retval = null; for (int i = predIndex+1; i < preds.size(); i++) { SatOp relPred = preds.get(i); if (!nom.equals(HyloHelper.getPrincipalNominal(relPred))) break; String rel = HyloHelper.getRel(relPred); if (rel != null && grammar.lexicon.isCoartRel(rel)) { if (retval == null) retval = new ArrayList(3); retval.add(rel); } } return retval; } //----------------------------------------------------------------- // createInitialEdges // /** * Creates and returns all initial edges. * In particular, initializes all lexical edges that cover some of the input semantics; * also initializes edges for semantically null lexical items, * and initializes instances of type changing rules which * introduce their own semantics. * If a hypertagger is in place, only the beta-best edges are returned for each EP. 
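 * A minimal usage sketch (the variable names are hypothetical; preds is
 * assumed to be the flattened EP list for the input LF):
 * <pre>{@code
 * EdgeFactory factory = new EdgeFactory(grammar, preds, signScorer);
 * List<Edge> initial = factory.createInitialEdges();
 * // the chart then repeatedly combines edges via factory.createNewEdges(...)
 * }</pre>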
*/ public List createInitialEdges() { // marked initial edges that need to be licensed List markedEdgesForLicensing = new ArrayList(); // for each pred, create edges for signs indexed // by lexical preds and by indexed rels; // and similarly for type changing rules for (int i=0; i < preds.size(); i++) { SatOp pred = preds.get(i); String key = HyloHelper.getLexPred(pred); String rel = HyloHelper.getRel(pred); // skip if no lex pred or indexed rel (not expected) if (key == null && rel == null) continue; // update hypertagger for beta-best lookup if (hypertagger != null) hypertagger.setPred(i); Collection signs = new ArrayList(); Collection typeChangingRules = new ArrayList(); // add signs and rules for lex pred if (key != null) { List coartRels = getCoartRels(i); Collection lexPredSigns = lexicon.getSignsFromPred(key, coartRels); if (lexPredSigns != null) { signs.addAll(lexPredSigns); } Collection lexPredRules = grammar.rules.getRulesForPred(key); if (lexPredRules != null) { typeChangingRules.addAll(lexPredRules); } } // add signs and rules for indexed rel if (rel != null) { Collection indexedRelSigns = lexicon.getSignsFromRel(rel); if (indexedRelSigns != null) { signs.addAll(indexedRelSigns); } Collection indexedRelRules = grammar.rules.getRulesForRel(rel); if (indexedRelRules != null) { typeChangingRules.addAll(indexedRelRules); } } // create initial and marked edges for each sign, updating feature map for (Sign sign : signs) { List initialEdgesForSign = createInitialEdges(sign, i); if (initialEdgesForSign != null) { for (Edge initialEdge : initialEdgesForSign) { Category cat = initialEdge.sign.getCategory(); if (featureLicenser.needsLicensing(cat)) markedEdgesForLicensing.add(initialEdge); else { initialEdges.add(initialEdge); featureLicenser.updateFeatureMap(cat); } } } } // create rules instances for each rule, updating feature map for (TypeChangingRule rule : typeChangingRules) { List ruleInstancesForRule = createRuleInstances(rule, i); if (ruleInstancesForRule != null) { for (RuleInstance ruleInst : ruleInstancesForRule) { ruleInstances.add(ruleInst); featureLicenser.updateFeatureMap(ruleInst.rule.getArg()); featureLicenser.updateFeatureMap(ruleInst.rule.getResult()); } } } } // add licensed, marked initial edges int prevSize; do { // while list size is changing prevSize = markedEdgesForLicensing.size(); for (Iterator it = markedEdgesForLicensing.iterator(); it.hasNext(); ) { // check each edge Edge edge = it.next(); Category cat = edge.sign.getCategory(); if (featureLicenser.isLicensed(cat)) { // and add to marked edges if licensed markedEdges.add(edge); it.remove(); // updating feature map featureLicenser.updateFeatureMap(cat); } } } while (markedEdgesForLicensing.size() != prevSize); // initialize general rules initGeneralRules(); // initialize edges for semantically null lexical items initNoSemEdges(); // collect all initial edges List retval = new ArrayList( initialEdges.size() + markedEdges.size() + instantiatedNoSemEdges.size() + noSemEdges.size() ); retval.addAll(initialEdges); retval.addAll(markedEdges); retval.addAll(instantiatedNoSemEdges); retval.addAll(noSemEdges); // check instantiation of outermost cats checkInstantiation(retval); // set uncovered EPs uncoveredEPs = uncoveredPreds(); // warn if EPs missing and debug instantiation flag set if (uncoveredEPs != null && debugInstantiation) { System.err.println("Warning, uncovered preds after lex instantiation: " + Edge.toString(uncoveredEPs)); } // set opts for missing relations, if apropos if (useRelaxedRelationMatching) 
addLFOptsForUncoveredPreds(); // return return retval; } // return null if LF doesn't unify with preds private List createInitialEdges(Sign sign, int predIndex) { // get parts of sign List words = sign.getWords(); Category cat = sign.getCategory(); // instantiate List> instantiations = instantiate(cat, null, predIndex); // check for failure if (instantiations == null) return null; // otherwise fill cats and make edges List retval = new ArrayList(instantiations.size()); for (Pair inst : instantiations) { Substitution subst = inst.a; BitSet bitset = inst.b; Category filledCat = null; try { filledCat = (Category) cat.fill(subst); } catch (UnifyFailure uf) { // shouldn't happen throw new RuntimeException("Unable to fill cat: " + uf); } // index subcategorized semantically null words featureLicenser.indexSemanticallyNullWords(filledCat); // update lex origins for new sign Sign newSign = new Sign(words, filledCat); newSign.setOrigin(); // and add new edge List> activeLfAlts = getActiveLfAlts(lfAlts, bitset); retval.add(makeEdge(newSign, bitset, activeLfAlts)); } // and return them return retval; } // return null if result LF doesn't unify with preds private List createRuleInstances(TypeChangingRule rule, int predIndex) { // get parts of rule Category result = rule.getResult(); Category arg = rule.getArg(); // instantiate List> instantiations = instantiate(result, arg, predIndex); // check for failure if (instantiations == null) return null; // otherwise fill cats and make rule instances List retval = new ArrayList(instantiations.size()); for (Pair inst : instantiations) { Substitution subst = inst.a; BitSet bitset = inst.b; Category filledResult = null; Category filledArg = null; try { filledResult = (Category) result.fill(subst); filledArg = (Category) arg.fill(subst); } catch (UnifyFailure uf) { // shouldn't happen throw new RuntimeException("Unable to fill cat: " + uf); } // index subcategorized semantically null words featureLicenser.indexSemanticallyNullWords(filledArg); featureLicenser.indexSemanticallyNullWords(filledResult); // and return new rule instance BitSet indices = getIndices(filledResult, filledArg); TypeChangingRule newRule = new TypeChangingRule(filledArg, filledResult, rule.name(), rule.getFirstEP()); ruleInstancesGroup.addRule(newRule); List> activeLfAlts = getActiveLfAlts(lfAlts, bitset); RuleInstance ruleInst = new RuleInstance(newRule, bitset, indices, activeLfAlts); retval.add(ruleInst); } // and return them return retval; } // return null if cat LF doesn't unify with preds private List> instantiate(Category cat, Category cat2, int predIndex) { // unify with indexed pred UnifyControl.reindex(cat, cat2); List lfPreds = HyloHelper.getPreds(cat.getLF()); Substitution subst = null; SatOp indexedPred = preds.get(predIndex); int lfPredIndex = -1; for (int i=0; i < lfPreds.size(); i++) { LF lfPred = lfPreds.get(i); subst = new SimpleSubstitution(); try { Unifier.unify(lfPred, indexedPred, subst); lfPredIndex = i; break; } catch (UnifyFailure uf) {} } // if failed, return empty list if (lfPredIndex == -1) return null; // set indexed pred in bitset BitSet bitset = new BitSet(preds.size()); bitset.set(predIndex); // unify with rest of lfPreds, extending subst/bitset List remainingPreds = new ArrayList(lfPreds.size()); remainingPreds.addAll(lfPreds); remainingPreds.remove(lfPredIndex); int prevSize = -1; List> retval = new ArrayList>(3); List> prev = new ArrayList>(3); retval.add(new Pair(subst, bitset)); // loop until empty or no changes, in order to propagate matches while 
(!remainingPreds.isEmpty() && remainingPreds.size() != prevSize) { prevSize = remainingPreds.size(); for (Iterator it = remainingPreds.iterator(); it.hasNext(); ) { SatOp lfPred = it.next(); try { // fill index lfPred = (SatOp) lfPred.fill(subst); } catch (UnifyFailure uf) { // shouldn't happen throw new RuntimeException("Unable to fill lfPred: " + uf); } // find matching pred String[] lfPredKeys = predKeys(lfPred); if (lfPredKeys.length == 0) { // nb: this means the lfPred is underconstrained; // will need to check it later! continue; } List matchingPredIndices = new ArrayList(3); for (int i = 0; i < lfPredKeys.length; i++) { List indices = predMap.get(lfPredKeys[i]); if (indices != null) matchingPredIndices.addAll(indices); } if (matchingPredIndices.isEmpty()) { if (useRelaxedRelationMatching && HyloHelper.isRelPred(lfPred)) continue; // skip else return null; // fail } // try extending each subst/bitset: // first swap retval, prev, and clear retval List> tmp = prev; prev = retval; retval = tmp; retval.clear(); for (Pair inst : prev) { Substitution s = inst.a; BitSet b = inst.b; if (matchingPredIndices.size() == 1) { // reuse current instantiation int matchingPredIndex = matchingPredIndices.get(0); b.set(matchingPredIndex); if (checkAlts(b)) { try { // unify SatOp matchingPred = preds.get(matchingPredIndex); Unifier.unify(lfPred, matchingPred, s); retval.add(inst); } catch (UnifyFailure uf) {} } } else { // otherwise make copies for (int matchingPredIndex : matchingPredIndices) { Substitution s2 = new SimpleSubstitution((SimpleSubstitution)s); BitSet b2 = (BitSet)b.clone(); b2.set(matchingPredIndex); if (checkAlts(b2)) { try { // unify SatOp matchingPred = preds.get(matchingPredIndex); Unifier.unify(lfPred, matchingPred, s2); Pair inst2 = new Pair(s2, b2); retval.add(inst2); } catch (UnifyFailure uf) {} } } } } if (retval.isEmpty()) { if (useRelaxedRelationMatching && HyloHelper.isRelPred(lfPred)) { retval.addAll(prev); continue; // skip } else return null; // fail } it.remove(); } } // check for no more than one (rel) pred left over if (remainingPreds.size() > 1) return null; // done return retval; } // returns true iff no alt exclusions are violated // nb: needs to check that if there any intersections // with multiple alts, then these are only in the shared part private boolean checkAlts(BitSet b) { for (List altSet : lfAlts) { int intersects = 0; for (Alt alt : altSet) { if (alt.bitset.intersects(b)) intersects++; } if (intersects > 1) { // check intersections for (int i = 0; i < altSet.size(); i++) { Alt alt = altSet.get(i); if (alt.bitset.intersects(b)) { for (int j = i+1; j < altSet.size(); j++) { Alt alt2 = altSet.get(j); if (alt2.bitset.intersects(b)) { BitSet altOnly = (BitSet) alt.bitset.clone(); altOnly.andNot(alt2.bitset); BitSet alt2Only = (BitSet) alt2.bitset.clone(); alt2Only.andNot(alt.bitset); if (altOnly.intersects(b) && alt2Only.intersects(b)) return false; } } } } } } return true; } //----------------------------------------------------------------- // createNewEdges // /** * Returns all edges that can be created by combining the given edges, * without collecting combos. */ public List createNewEdges(Edge edge, Edge next) { return createNewEdges(edge, next, false); } /** * Returns all edges that can be created by combining the given edges; * if the collectCombos flag is true, the edges are updated with collected combos, * and additional alt edges are made for the remaining alternative edges for * the given first edge. 
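 * By way of illustration (variable names hypothetical): a chart would
 * typically call {@code createNewEdges(rep, next, true)} for a representative
 * edge {@code rep}, so that successful combinations are recorded in its edge
 * combos and can later be replayed for same-category edges via
 * {@code createAltEdges}, without re-invoking the combinatory rules.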
*/ public List createNewEdges(Edge edge, Edge next, boolean collectCombos) { // check for sem overlap if (edge.intersects(next)) return Collections.emptyList(); // check LF chunk constraints if (useChunks) { if (!edge.meetsLfChunkConstraints(next) || !next.meetsLfChunkConstraints(edge)) return Collections.emptyList(); } // make new edges ... List newEdges = null; // when using indexing: if (useIndexing) { // check for intersecting indices if (edge.indicesIntersect(next)) { newEdges = createNewEdges(edge, next, collectCombos, true); } // check for PairedWith relation else if (anyPairedNominals && pairedWith(edge, next)) { newEdges = createNewEdges(edge, next, collectCombos, false); } else if (anyPairedNominals && pairedWith(next, edge)) { newEdges = createNewEdges(next, edge, collectCombos, false); } // check for a missing index nominal on the target cat, // which can indicate a type-raised category that needs to combine // before its indices become adjacent else if (allowMissingIndexCombos && (edge.getIndexNominal() == null || next.getIndexNominal() == null)) { newEdges = createNewEdges(edge, next, collectCombos, true); } else { return Collections.emptyList(); } } // otherwise try everything else { newEdges = createNewEdges(edge, next, collectCombos, true); } // make alt edges for rest of edge's alts, with collectCombos option if (collectCombos && edge.altEdges.size() > 0) { int numNewEdges = newEdges.size(); // get num before adding any more for (int i = 0; i < numNewEdges; i++) { Edge resultEdge = newEdges.get(i); Sign resultSign = resultEdge.sign; Category resultCat = resultSign.getCategory(); Rule rule = resultSign.getDerivationHistory().getRule(); Sign[] resultInputs = resultSign.getDerivationHistory().getInputs(); boolean rightward = (resultInputs[0] == next.sign); boolean lefthead = (resultSign.getLexHead() == resultInputs[0].getLexHead()); for (int j = 0; j < edge.altEdges.size(); j++) { Edge furtherEdge = edge.altEdges.get(j); if (furtherEdge == edge) continue; Sign[] signs = (rightward) ? new Sign[] { next.sign, furtherEdge.sign } : new Sign[] { furtherEdge.sign, next.sign }; Sign lexHead = (rightward == lefthead) ? 
next.sign.getLexHead() : furtherEdge.sign.getLexHead(); Sign altSign = Sign.createDerivedSignWithNewLF(resultCat, signs, rule, lexHead); newEdges.add(makeAltEdge(altSign, resultEdge)); } } } // check instantiation of outermost cats checkInstantiation(newEdges); // done return newEdges; } // creates edges, combining in one or both directions per flag private List createNewEdges(Edge edgeA, Edge edgeB, boolean collectCombos, boolean bothDirections) { // get combined alts, checking compatibility List> combinedLfAlts = getCombinedLfAlts(edgeA.activeLfAlts, edgeB.activeLfAlts); if (combinedLfAlts == null) return Collections.emptyList(); // check whether a chunk is completed when gluing fragments boolean fragCompletion = false; if (gluingFragments) fragCompletion = completesChunk(edgeA, edgeB); // A B combos List results; if (gluingFragments) results = generalRules.applyGlueRule(edgeA.sign, edgeB.sign); else results = generalRules.applyBinaryRules(edgeA.sign, edgeB.sign); binaryRuleApps++; int numResults = results.size(); // B A combos List reversedResults = Collections.emptyList(); if (bothDirections) { if (gluingFragments) reversedResults = generalRules.applyGlueRule(edgeB.sign, edgeA.sign); else reversedResults = generalRules.applyBinaryRules(edgeB.sign, edgeA.sign); binaryRuleApps++; } int numReversedResults = reversedResults.size(); // make edges to return, updating edge combos (if apropos) List retval = Collections.emptyList(); if (numResults + numReversedResults > 0) { retval = new ArrayList(numResults + numReversedResults); BitSet union = (BitSet) edgeA.bitset.clone(); union.or(edgeB.bitset); int cardBefore = union.cardinality(); List> activeLfAlts = getActiveLfAlts(combinedLfAlts, union); // check for alt completion when gluing fragments if (gluingFragments && union.cardinality() > cardBefore) fragCompletion = true; for (int i = 0; i < numResults; i++) { Sign sign = results.get(i); if (fragCompletion) { ((AtomCat)sign.getCategory()).fragCompletion = true; } Edge resultEdge = makeEdge(sign, union, activeLfAlts); retval.add(resultEdge); if (collectCombos) { edgeA.edgeCombos.addRightwardCombo(edgeB, resultEdge); edgeB.edgeCombos.addLeftwardCombo(edgeA, resultEdge); } } for (int i = 0; i < numReversedResults; i++) { Sign sign = reversedResults.get(i); if (fragCompletion) { ((AtomCat)sign.getCategory()).fragCompletion = true; } Edge resultEdge = makeEdge(sign, union, activeLfAlts); retval.add(resultEdge); if (collectCombos) { edgeB.edgeCombos.addRightwardCombo(edgeA, resultEdge); edgeA.edgeCombos.addLeftwardCombo(edgeB, resultEdge); } } } // done return retval; } /** * Returns all edges that can be created by applying a unary rule * to the given edge or by combining it with a purely syntactic edge, * without collecting combos. */ public List createNewEdges(Edge edge) { return createNewEdges(edge, false); } /** * Returns all edges that can be created by applying a unary rule * to the given edge, or by combining it with a purely syntactic edge, * or by completing a realization/chunk/alt with an optional part, * while updating the given edge with collected combos, * if the collectCombos flag is true. * When gluing fragments, only the opt completion step is done. 
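 * For instance (the EP numbers are hypothetical): with three EPs in total and
 * a single optional part {2}, an edge covering {0,1} yields an opt-completed
 * edge covering {0,1,2} with the same sign, since the only missing EP is
 * optional.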
*/ public List createNewEdges(Edge edge, boolean collectCombos) { List retval = null; // instantiate on demand if (!gluingFragments) { List genResults = generalRules.applyUnaryRules(edge.sign); unaryRuleApps++; // make edges for results, updating edge combos if (genResults.size() > 0) { if (retval == null) retval = new ArrayList(genResults.size()); for (int i = 0; i < genResults.size(); i++) { Sign sign = genResults.get(i); // check for unary rule cycle; skip result if found if (sign.getDerivationHistory().containsCycle()) continue; Edge resultEdge = makeEdge(sign, edge.bitset, edge.activeLfAlts); retval.add(resultEdge); if (collectCombos) edge.edgeCombos.unaryResults.add(resultEdge); } } // do rule instances Sign[] signs = { edge.sign }; for (int i = 0; i < ruleInstances.size(); i++) { RuleInstance ruleInst = ruleInstances.get(i); // check sem overlap if (edge.intersects(ruleInst)) continue; // check for indices in common if (useIndexing && !edge.indicesIntersect(ruleInst)) continue; // check LF chunk constraints if (useChunks && !edge.meetsLfChunkConstraints(ruleInst)) continue; // get combined alts, checking compatibility List> combinedLfAlts = getCombinedLfAlts(edge.activeLfAlts, ruleInst.activeLfAlts); if (combinedLfAlts == null) continue; // apply rule List instResults = new ArrayList(1); ruleInst.rule.applyRule(signs, instResults); unaryRuleInstApps++; if (instResults.size() > 0) { if (retval == null) retval = new ArrayList(instResults.size()); BitSet union = (BitSet) edge.bitset.clone(); union.or(ruleInst.bitset); List> activeLfAlts = getActiveLfAlts(combinedLfAlts, union); for (int j = 0; j < instResults.size(); j++) { Sign sign = instResults.get(j); // check for unary rule cycle; skip result if found if (sign.getDerivationHistory().containsCycle()) continue; Edge resultEdge = makeEdge(sign, union, activeLfAlts); retval.add(resultEdge); if (collectCombos) edge.edgeCombos.unaryResults.add(resultEdge); } } } } // do opt completed edges if (!lfOpts.isEmpty() && !edge.complete()) { // get completed bitsets for each completed active alt or chunk, and for whole thing List optCompleted = new ArrayList(2); addOptCompletedBitSet(edge, allPreds, optCompleted); for (List altSet : edge.activeLfAlts) { for (Alt alt : altSet) { addOptCompletedBitSet(edge, alt.bitset, optCompleted); } } for (BitSet chunk : lfChunks) { addOptCompletedBitSet(edge, chunk, optCompleted); } // for each completed bitset, make complete edge with same sign for (BitSet completed : optCompleted) { List> activeLfAlts = getActiveLfAlts(edge.activeLfAlts, completed); // set frag completion if apropos if (gluingFragments && edge.sign.getCategory() instanceof AtomCat) { AtomCat ac = (AtomCat) edge.sign.getCategory(); if (ac.isFragment()) ac.fragCompletion = true; } Edge resultEdge = makeEdge(edge.sign, completed, activeLfAlts); resultEdge.optCompletes = edge; if (retval == null) retval = new ArrayList(1); retval.add(resultEdge); if (collectCombos) edge.edgeCombos.optionalResults.add(resultEdge); } } // ensure retval instantiated if (retval == null) retval = Collections.emptyList(); // check instantiation of outermost cats if (!gluingFragments) checkInstantiation(retval); // done return retval; } // bitset for checking completeness private BitSet tmpBitSetCompleteness = new BitSet(); // bitset for making retval private BitSet tmpBitSetRetval = new BitSet(); // adds a bitset with optional parts completed within the given bitset scope // to the given list, if the optional parts complete the given edge's bitset private void 
addOptCompletedBitSet(Edge edge, BitSet bitset, List optCompleted) { // check whether already complete tmpBitSetRetval.clear(); tmpBitSetRetval.or(edge.bitset); tmpBitSetRetval.and(bitset); if (tmpBitSetRetval.cardinality() == bitset.cardinality()) return; tmpBitSetRetval.or(edge.bitset); // or retval with opts when apropos for (BitSet opt : lfOpts) { if (subset(opt, bitset)) { if (edge.bitset.intersects(opt)) continue; // skip if opt not entirely missing tmpBitSetRetval.or(opt); } } // check completeness, add retval if complete (and distinct) tmpBitSetCompleteness.clear(); tmpBitSetCompleteness.or(bitset); tmpBitSetCompleteness.and(tmpBitSetRetval); if (tmpBitSetCompleteness.cardinality() == bitset.cardinality()) { if (!optCompleted.contains(tmpBitSetRetval)) optCompleted.add((BitSet)tmpBitSetRetval.clone()); } } /** Returns the edges that can be made by constructing alternative edges from the given edge and the collected combos in its representative edge. */ public List createAltEdges(Edge edge, Edge repEdge) { // instantiate return list with right capacity EdgeCombos edgeCombos = repEdge.edgeCombos; int numResults = numResultsFromCombos(edgeCombos.rightwardCombos); numResults += numResultsFromCombos(edgeCombos.leftwardCombos); numResults += edgeCombos.unaryResults.size(); numResults += edgeCombos.optionalResults.size(); List retval = new ArrayList(numResults); // make alt edges addAltsFromCombos(edge, edgeCombos.rightwardCombos, true, retval); addAltsFromCombos(edge, edgeCombos.leftwardCombos, false, retval); addAltsFromUnaryResults(edge, edgeCombos.unaryResults, retval); addAltsFromOptionalResults(edge, edgeCombos.optionalResults, retval); // done return retval; } // returns the number of results from the given combos private int numResultsFromCombos(List combos) { int retval = 0; for (int i = 0; i < combos.size(); i++) { EdgeCombos.CatCombo combo = combos.get(i); retval += combo.inputEdge.altEdges.size(); } return retval; } // adds alt edges for the given edge, combos, and direction to results private void addAltsFromCombos(Edge edge, List combos, boolean rightward, List results) { for (EdgeCombos.CatCombo combo : combos) { Edge resultEdge = combo.resultEdge; Sign resultSign = resultEdge.sign; Category resultCat = resultSign.getCategory(); Rule rule = resultSign.getDerivationHistory().getRule(); Sign[] resultInputs = resultSign.getDerivationHistory().getInputs(); boolean lefthead = (resultSign.getLexHead() == resultInputs[0].getLexHead()); List comboEdges = combo.inputEdge.altEdges; for (Edge comboEdge : comboEdges) { Sign[] signs = (rightward) ? new Sign[] { edge.sign, comboEdge.sign } : new Sign[] { comboEdge.sign, edge.sign }; Sign lexHead = (rightward == lefthead) ? 
edge.sign.getLexHead() : comboEdge.sign.getLexHead(); Sign altSign = Sign.createDerivedSignWithNewLF(resultCat, signs, rule, lexHead); results.add(makeAltEdge(altSign, resultEdge)); } } } // adds alt edges for the given edge and unary results to results private void addAltsFromUnaryResults(Edge edge, List unaryResults, List results) { for (Edge resultEdge : unaryResults) { Sign resultSign = resultEdge.sign; Category resultCat = resultSign.getCategory(); Rule rule = resultSign.getDerivationHistory().getRule(); Sign[] signs = { edge.sign }; Sign lexHead = edge.sign.getLexHead(); Sign altSign = Sign.createDerivedSignWithNewLF(resultCat, signs, rule, lexHead); results.add(makeAltEdge(altSign, resultEdge)); } } // adds alt edges for the given edge and optional results to results private void addAltsFromOptionalResults(Edge edge, List optionalResults, List results) { for (Edge resultEdge : optionalResults) { results.add(makeAltEdge(edge.sign, resultEdge)); } } /** Returns the number of rule applications executed. */ public int ruleApps() { return unaryRuleApps * generalRules.getUnaryRules().size() + unaryRuleInstApps + binaryRuleApps * generalRules.getBinaryRules().size(); } //----------------------------------------------------------------- // initGeneralRules // // separates out general rules with no semantics // nb: could consider adding feature licensing for type changing rules with no semantics private void initGeneralRules() { // add all binary rules to general rules for (Rule r : grammar.rules.getBinaryRules()) { generalRules.addRule(r); } // add type raising rules, and type changing ones with no semantics too for (Rule r : grammar.rules.getUnaryRules()) { // skip type changing rules with semantics if (r instanceof TypeChangingRule) { TypeChangingRule rule = (TypeChangingRule) r; if (rule.getResult().getLF() != null) { continue; } } // otherwise add it generalRules.addRule(r); } } //----------------------------------------------------------------- // initNoSemEdges // // creates edges for signs flagged as having no semantics, // and with appropriate licensing values in the initial edges private void initNoSemEdges() { // lookup signs by special index rel constant NO_SEM_FLAG lexicon.setSupertagger(null); // turn off hypertagger first Collection noSemSigns = lexicon.getSignsFromRel(Lexicon.NO_SEM_FLAG); lexicon.setSupertagger(hypertagger); // reset hypertagger if (noSemSigns == null) return; // sets for accumulating no sem edges Set instEdges = new HashSet(); Set uninstEdges = new HashSet(); // add signs with no LF and with matching licensing values Set instantiatedCats = new HashSet(); Set uninstantiatedCats = new HashSet(); List> emptyLfAlts = Collections.emptyList(); // loop until no more no sem edges int numInstEdges, numUninstEdges; do { numInstEdges = instEdges.size(); numUninstEdges = uninstEdges.size(); for (Sign sign : noSemSigns) { Category cat = sign.getCategory(); // get licensed, potentially instantiated cats instantiatedCats.clear(); uninstantiatedCats.clear(); featureLicenser.licenseEmptyCat(cat, instantiatedCats, uninstantiatedCats); // add edges for instantiated cats to initial edges, updating // feature map for (Category instCat : instantiatedCats) { featureLicenser.updateFeatureMap(instCat); featureLicenser.indexSemanticallyNullWords(instCat); Sign instSign = new Sign(sign.getWords(), instCat); instEdges.add(makeEdge(instSign, new BitSet(preds.size()), emptyLfAlts)); } // add edges for uninstantiated cats to no-sem edges, updating // feature map for (Category uninstCat : 
uninstantiatedCats) { featureLicenser.updateFeatureMap(uninstCat); featureLicenser.indexSemanticallyNullWords(uninstCat); Sign uninstSign = new Sign(sign.getWords(), uninstCat); Edge noSemEdge = makeEdge(uninstSign, new BitSet(preds.size()), emptyLfAlts); uninstEdges.add(noSemEdge); } } } while (numInstEdges != instEdges.size() || numUninstEdges != uninstEdges.size()); // update no sem edge lists instantiatedNoSemEdges.addAll(instEdges); noSemEdges.addAll(uninstEdges); } } ================================================ FILE: src/opennlp/ccg/realize/EdgeHash.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import gnu.trove.*; import java.util.*; /** * A set of edges, unique up to surface words. * Edges with higher scores or whose signs have lower derivational complexity are kept during insertion. * * @author Michael White * @version $Revision: 1.2 $, $Date: 2010/01/14 22:52:01 $ */ public class EdgeHash extends THashSet { private static final long serialVersionUID = 1L; /** Hashing strategy that uses Edge's surfaceWordHashCode and surfaceWordEquals methods. */ protected static TObjectHashingStrategy surfaceWordHashingStrategy = new TObjectHashingStrategy() { private static final long serialVersionUID = 1L; public int computeHashCode(java.lang.Object o) { return ((Edge)o).surfaceWordHashCode(); } public boolean equals(java.lang.Object o1, java.lang.Object o2) { return ((Edge)o1).surfaceWordEquals((Edge)o2); } }; /** Default constructor. */ public EdgeHash() { super(surfaceWordHashingStrategy); } /** * Returns this as a set of edges. */ @SuppressWarnings("unchecked") public Set asEdgeSet() { return (Set) this; } /** * Adds an edge, keeping the one with a higher score or whose sign has lower derivational complexity * if there is an equivalent one there already; returns the old * edge if it was displaced, the new edge if there was no equivalent * old edge, or null if the edge was not actually added. */ public Edge insert(Edge edge) { int pos = index(edge); // equiv edge if (pos >= 0) { Edge oldEdge = (Edge) _set[pos]; // already there? 
if (oldEdge == edge) return null; // check score if (edge.score > oldEdge.score) { _set[pos] = edge; return oldEdge; } // check complexity int complexity = edge.sign.getDerivationHistory().complexity(); int oldComplexity = oldEdge.sign.getDerivationHistory().complexity(); if (complexity < oldComplexity) { _set[pos] = edge; return oldEdge; } // otherwise toss else return null; } // add new else { add(edge); return edge; } } } ================================================ FILE: src/opennlp/ccg/realize/FeatureLicenser.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.lexicon.*; import opennlp.ccg.unify.*; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.*; import java.util.*; /** * The feature licenser is a helper class for the edge factory, * responsible for managing features which license the use and * instantiation of semantically null or marked categories. * * @author Michael White * @version $Revision: 1.13 $, $Date: 2009/12/21 03:27:18 $ */ public class FeatureLicenser { // the edge factory for which this feature licenser is a helper private final EdgeFactory edgeFactory; // the licensing features private final LicensingFeature[] licensingFeatures; /** Constructor. */ public FeatureLicenser(EdgeFactory edgeFactory) { this.edgeFactory = edgeFactory; this.licensingFeatures = edgeFactory.grammar.lexicon.getLicensingFeatures(); } /** Constructor with licensing features. */ public FeatureLicenser(EdgeFactory edgeFactory, LicensingFeature[] licensingFeatures) { this.edgeFactory = edgeFactory; this.licensingFeatures = licensingFeatures; } //----------------------------------------------------------------- // semantically null word indexing /** * Adds new nominal atoms for subcategorized semantically null words. * A check is made for atomic categories with a value for the 'lex' * feature but with a null or uninstantiated index feature. If found, a new nominal atom is * created as the value of the index feature, and the nominal is * added to the edge factory's nominals map, for indexing purposes. * The same nominal is reused for repeated occurrences of a 'lex' value. 
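* For instance, an atomic category bearing a 'lex' value (say, a particle like "up") but no instantiated index would be assigned a fresh nominal such as w1 as its index value, and any later category with the same 'lex' value would be assigned that same nominal rather than a new one.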
*/ public void indexSemanticallyNullWords(Category cat) { cat.forall(semanticallyNullWordIndexer); } // counter private int wordCounter = 0; // 'lex' value to index map private Map wordIndexMap = new HashMap(); // cat function private CategoryFcn semanticallyNullWordIndexer = new CategoryFcnAdapter() { public void forall(Category c) { if (!(c instanceof AtomCat)) return; FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; if (!fs.hasAttribute("lex")) return; Object indexVal = fs.getValue("index"); if (indexVal == null || (indexVal instanceof NominalVar)) { String lexVal = fs.getValue("lex").toString(); String index = wordIndexMap.get(lexVal); NominalAtom nom; if (index == null) { do { index = "w" + ++wordCounter; nom = new NominalAtom(index); } while (edgeFactory.nominals.containsKey(nom)); wordIndexMap.put(lexVal, index); edgeFactory.nominals.put(nom, edgeFactory.nominals.size()); } else nom = new NominalAtom(index); fs.setFeature("index", nom); } } }; //----------------------------------------------------------------- // feature map // /** * Updates the licensed feature map with the info from the given initial category. */ public void updateFeatureMap(Category cat) { currentFeatureMap = featureMap; cat.forall(featureMapUpdater); currentFeatureMap = null; } // updates the category-specific licensed feature map private void updateCatFeatureMap(Category cat) { catFeatureMap.clear(); currentFeatureMap = catFeatureMap; cat.forall(featureMapUpdater); currentFeatureMap = null; } // a map from an attr name to a map from vals to sets of atomic categories // containing those attr-val pairs private Map>> featureMap = new HashMap>>(); // a feature map for a specific category to be checked private Map>> catFeatureMap = new HashMap>>(); // working feature map private Map>> currentFeatureMap = null; // list of all initial atom cats checked for main feature map private List allInitialAtomCats = new ArrayList(); // feature map updater private CategoryFcn featureMapUpdater = new CategoryFcnAdapter() { public void forall(Category c) { if (!(c instanceof AtomCat)) return; if (currentFeatureMap == featureMap) allInitialAtomCats.add(c); FeatureStructure fs = c.getFeatureStructure(); if (fs == null) return; // for each feature for (int i = 0; i < licensingFeatures.length; i++) { String attr = licensingFeatures[i].attr; Object val = fs.getValue(attr); if (val != null && !(val instanceof Variable)) { // check for relevant value String valStr = val.toString(); String fVal = licensingFeatures[i].val; List alsoList = licensingFeatures[i].alsoLicensedBy; if (fVal != null && !fVal.equals(valStr) && !alsoList.contains(valStr)) continue; // add to feature map Map> valMap = currentFeatureMap.get(attr); if (valMap == null) { valMap = new HashMap>(); currentFeatureMap.put(attr, valMap); } Set acSet = valMap.get(valStr); if (acSet == null) { acSet = new HashSet(); valMap.put(valStr, acSet); } acSet.add(c); } } } }; //----------------------------------------------------------------- // category licensing // /** * Returns whether the given category contains a feature indicating that * it needs to be licensed. */ public boolean needsLicensing(Category cat) { return checkLicensing(cat, true); } /** * Returns whether the given category is licensed according to * the current feature map. 
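* A category counts as licensed when a relevant licensing attribute-value pair it bears (or one of its alsoLicensedBy alternates) also occurs on some initial category recorded in the feature map, subject to the feature's target/args location constraint.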
*/ public boolean isLicensed(Category cat) { return checkLicensing(cat, false); } // records the licensing feature which succeeded in licensing the last cat (or null if none) private LicensingFeature currentLicensingFeature = null; // checks the given category according to the given flag, // updating currentLicensingFeature private boolean checkLicensing(Category cat, boolean needsLicensing) { currentLicensingFeature = null; boolean emptyCat = (cat.getLF() == null); // set up cat feature map updateCatFeatureMap(cat); Category target = getTarget(cat); // for each feature, look for appropriate attr-val pairs for (int i = 0; i < licensingFeatures.length; i++) { // skip when appropriate license flag not set if (emptyCat && !licensingFeatures[i].licenseEmptyCats) continue; if (!emptyCat && !licensingFeatures[i].licenseMarkedCats) continue; String attr = licensingFeatures[i].attr; Map> valMap = catFeatureMap.get(attr); if (valMap == null) continue; String fVal = licensingFeatures[i].val; Collection vals; if (fVal != null) { if (!valMap.containsKey(fVal)) continue; vals = new ArrayList(1); vals.add(fVal); } else { vals = valMap.keySet(); } byte loc = licensingFeatures[i].loc; // for each attr-val pair for (Iterator it = vals.iterator(); it.hasNext(); ) { String val = it.next(); Set atomCats = valMap.get(val); // check loc if (loc == LicensingFeature.TARGET_ONLY) { if (atomCats.size() != 1) continue; if (!atomCats.contains(target)) continue; } else if (loc == LicensingFeature.ARGS_ONLY) { if (atomCats.contains(target)) continue; } // branch on needs-licensing flag if (needsLicensing) { // found a feature needing to be licensed return true; } else { // check for licensing feature in feature map Map> fmValMap = featureMap.get(attr); // return false if not found if (fmValMap == null) return false; boolean foundLicensingVal = fmValMap.containsKey(val); if (!foundLicensingVal) { List alsoList = licensingFeatures[i].alsoLicensedBy; for (int j = 0; j < alsoList.size(); j++) { if (fmValMap.containsKey(alsoList.get(j))) { foundLicensingVal = true; break; } } } if (!foundLicensingVal) return false; // otherwise record licensing feature and return true currentLicensingFeature = licensingFeatures[i]; return true; } } } // otherwise false return false; } // returns the target cat, if complex, otherwise // just the cat itself private Category getTarget(Category cat) { Category target = cat; if (cat instanceof ComplexCat) { target = ((ComplexCat)cat).getTarget(); } return target; } //----------------------------------------------------------------- // empty (semantically null) category licensing and instantiation // // reusable simple substitution for instantiating vars on atom cats private SimpleSubstitution simpleSubst = new SimpleSubstitution(); /** * Determines whether the given semantically null category * is licensed according to the licensed feature map, and if so, returns * appropriately (un-)instantiated versions of the category. * The licensing features are checked in priority order. * NB: Instantiation is limited to the case where there is a single * value for the operative licensing feature. 
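* (Roughly, instantiation unifies the category's index variable with the index nominal of a matching initial category, so that the semantically null category ends up tied to a particular nominal in the input LF.)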
*/ public void licenseEmptyCat(Category cat, Set instantiatedCats, Set uninstantiatedCats) { // reindex UnifyControl.reindex(cat); // return cat uninstantiated if no licensing features found if (!needsLicensing(cat)) { uninstantiatedCats.add(cat); return; } // return nothing if not licensed if (!isLicensed(cat)) return; // find operative licensing feature, if necessary if (currentLicensingFeature == null) { for (int i = 0; i < licensingFeatures.length; i++) { if (!catFeatureMap.containsKey(licensingFeatures[i].attr)) continue; Map> valMap = catFeatureMap.get(licensingFeatures[i].attr); String fVal = licensingFeatures[i].val; if (fVal != null && !valMap.containsKey(fVal)) continue; currentLicensingFeature = licensingFeatures[i]; break; } // if still not found, return cat uninstantiated if (currentLicensingFeature == null) { uninstantiatedCats.add(cat); return; } } // return cat uninstantiated if licensing feature does not // have instantiation flag set if (!currentLicensingFeature.instantiate) { uninstantiatedCats.add(cat); return; } // return cat uninstantiated if licensing feature has more than one val String attr = currentLicensingFeature.attr; Map> valMap = catFeatureMap.get(attr); if (valMap.size() > 1) { uninstantiatedCats.add(cat); return; } String val = valMap.keySet().iterator().next(); Set atomCats = valMap.get(val); // for each atom cat, go ahead with instantiation ... for (Iterator acIt = atomCats.iterator(); acIt.hasNext(); ) { Category ac = acIt.next(); // ensure cats with lex feature have an index var FeatureStructure fs = ac.getFeatureStructure(); if (fs.hasAttribute("lex") && !fs.hasAttribute("index")) { fs.setFeature("index", new NominalVar("W")); UnifyControl.reindex(ac); } // unify with appropriate initial cats Collection initialCats = null; if (!currentLicensingFeature.licenseEmptyCats) initialCats = allInitialAtomCats; else { Map> fmValMap = featureMap.get(attr); initialCats = fmValMap.get(val); List alsoList = currentLicensingFeature.alsoLicensedBy; if (alsoList.size() > 0) { if (initialCats != null) initialCats = new HashSet(initialCats); else initialCats = new HashSet(); for (int i = 0; i < alsoList.size(); i++) { Set alsoSet = fmValMap.get(alsoList.get(i)); if (alsoSet != null) initialCats.addAll(alsoSet); } } } if (initialCats == null) { System.out.println("Warning, unable to find initial cats for feature " + attr + "=" + val); uninstantiatedCats.add(cat); return; } // for each initial cat for (Iterator it = initialCats.iterator(); it.hasNext(); ) { Category initialAC = it.next(); // ensure index instantiated FeatureStructure initialFS = initialAC.getFeatureStructure(); if (initialFS == null) continue; Object index = initialFS.getValue("index"); if (!(index instanceof NominalAtom)) continue; // block instantiation with bound vars if (edgeFactory.boundVarNominals.contains(index)) { instantiatedCats.clear(); uninstantiatedCats.add(cat); return; } // try unifying index ... 
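// nb: only the index binding from the unification below is kept; any other substitutions are discarded before filling in the category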
simpleSubst.clear(); try { Unifier.unify(ac.getFeatureStructure(), initialFS, simpleSubst); // ensure substitution contains index if (!simpleSubst.containsValue(index)) continue; // get rid of other substitutions for (Iterator it2 = simpleSubst.values().iterator(); it2.hasNext(); ) { if (!it2.next().equals(index)) it2.remove(); } // instantiate Category instCat = (Category) cat.fill(simpleSubst); // and add instantiated cats instantiatedCats.add(instCat); } catch (UnifyFailure uf) {} } } } } ================================================ FILE: src/opennlp/ccg/realize/Hypertagger.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2008-9 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.lexicon.*; import opennlp.ccg.hylo.*; import java.util.*; /** * A hypertagger is a realization supertagger. It must extend the * SupertaggerAdapter interface for plugging a supertagger into the * lexicon. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/08/24 01:56:14 $ */ public interface Hypertagger extends SupertaggerAdapter { /** * Maps the given elementary predications to their predicted categories, * so that the beta-best categories can be returned by calls to setPred * and getSupertags. */ public void mapPreds(List preds); /** * Sets the current elementary predication to the one with the given index, * so that the beta-best categories for it can be returned by a call to * getSupertags. */ public void setPred(int index); /** * Stores the gold standard pred info, for use in discriminative training. * The string consists of space delimited tokens, where each token * is a colon-separated list of fields, with the first field containing * the nominal id, and the second field the gold supertag. */ public void storeGoldStdPredInfo(String goldStdPredInfo); } ================================================ FILE: src/opennlp/ccg/realize/LexicalDiversityPruningStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.lexicon.Word; import opennlp.ccg.synsem.Sign; import gnu.trove.THashSet; import gnu.trove.TObjectIdentityHashingStrategy; import java.util.*; /** * A diversity pruning strategy that defines signs to be * notCompellinglyDifferent if the set of open class stems * is the same. The POS classes of interest are set in the * constructor. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2011/04/02 16:32:17 $ */ public class LexicalDiversityPruningStrategy extends DiversityPruningStrategy { /** The interned POS values to use for relevant open class stems. */ @SuppressWarnings("unchecked") protected Set posValsToUse = new THashSet(new TObjectIdentityHashingStrategy()); /** Reusable set of observed interned stems for comparison purposes. */ @SuppressWarnings("unchecked") protected Set stemsSeen = new THashSet(new TObjectIdentityHashingStrategy()); /** Constructor, which sets POS classes of interest. */ public LexicalDiversityPruningStrategy() { String[] poslist = { "JJ", "JJR", "JJS", "NN", "NNP", "NNS", "NNPS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" }; for (String pos : poslist) posValsToUse.add(pos); } /** Returns true iff the given signs are not compellingly different. In particular, returns true iff the set of relevant open class stems are the same. */ public boolean notCompellinglyDifferent(Sign sign1, Sign sign2) { stemsSeen.clear(); for (Word w : sign1.getWords()) { if (posValsToUse.contains(w.getPOS())) stemsSeen.add(w.getStem()); } for (Word w : sign2.getWords()) { if (posValsToUse.contains(w.getPOS()) && !stemsSeen.contains(w.getStem())) return false; } return true; } } ================================================ FILE: src/opennlp/ccg/realize/NBestPruningStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.TextCCG; import java.util.*; import java.util.prefs.*; /** * Default, n-best edge pruning strategy. * * @author Michael White * @version $Revision: 1.9 $, $Date: 2011/03/27 14:45:32 $ */ public class NBestPruningStrategy implements PruningStrategy { /** The current pruning val. */ protected int CAT_PRUNE_VAL; /** Reusable return list. */ protected List retval = new ArrayList(); /** Constructor with pruning val. 
*/ public NBestPruningStrategy(int pruningVal) { CAT_PRUNE_VAL = pruningVal; } /** Default constructor retrieves pruning val from preferences. */ public NBestPruningStrategy() { Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); CAT_PRUNE_VAL = prefs.getInt(Chart.PRUNING_VALUE, Chart.NO_PRUNING); } /** * Returns a (possibly empty) list of edges pruned * from the given ones, which should be sorted by score, * from highest to lowest. * In particular, prunes and returns the edges that follow the N-best * ones in the given list. */ public List pruneEdges(List catEdges) { // clear reusable return list retval.clear(); // ensure pruning enabled if (CAT_PRUNE_VAL == Chart.NO_PRUNING) return retval; // nb: could add an option to prune all edges with zero score /* for (Iterator it = catEdges.iterator(); it.hasNext(); ) { Edge edge = it.next(); if (edge.score == 0) { retval.add(edge); it.remove(); } } */ // return edges at bottom of list, starting with CAT_PRUNE_VAL (if any) while (CAT_PRUNE_VAL < catEdges.size()) { retval.add(catEdges.remove(CAT_PRUNE_VAL)); } // done return retval; } } ================================================ FILE: src/opennlp/ccg/realize/PruningStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import java.util.*; /** * Interface for edge pruning strategies. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2011/03/27 14:45:32 $ */ public interface PruningStrategy { /** * Prunes and returns a (possibly empty) list of edges * from the given ones, which should be sorted by score, * from highest to lowest. */ public List pruneEdges(List catEdges); } ================================================ FILE: src/opennlp/ccg/realize/Realizer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-11 University of Edinburgh / Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details.
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import opennlp.ccg.hylo.*; import opennlp.ccg.*; import org.jdom.*; import java.util.*; import java.util.prefs.*; /** * The realizer manages the realization process. * Realization options may be set for use across calls * to the realizer. * * @author Michael White * @version $Revision: 1.31 $, $Date: 2011/07/19 03:40:46 $ */ public class Realizer { /** The grammar used for realization. */ public final Grammar grammar; /** Flag for whether to use depth-first search. Defaults to false. */ public boolean depthFirst = false; // the chart used to realize a request private Chart chart = null; /** Constructor. */ public Realizer(Grammar grammar) { this.grammar = grammar; } /** Returns the chart used in the latest request, or null if none. */ public Chart getChart() { return chart; } //----------------------------------------------------------------- // default options, for use when not given in realization request // nb: as the usual practice is to set these options once // for reuse across calls to the realizer, only a subset of // the options may be overridden in different calls to the // realize method /** Time limit in ms. (Default is -1, or none.) */ public int timeLimitMS = -1; /** Flag for whether to wait for a complete edge. (Default is false.) */ public boolean waitForCompleteEdge = false; /** Sign scorer to use. (Default is none.) */ public SignScorer signScorer = null; /** Pruning strategy to use. (Default is none.) */ public PruningStrategy pruningStrategy = null; /** Hypertagger to use. (Default is none.) */ public Hypertagger hypertagger = null; //----------------------------------------------------------------- // get LF from doc /** * Retrieves an input LF from the given XML doc, processing any * LF chunks along the way. */ public static LF getLfFromDoc(Document doc) { Element rootElt = doc.getRootElement(); Element lfElt = (rootElt.getName().equals("lf")) ? rootElt : rootElt.getChild("lf"); return getLfFromElt(lfElt); } /** * Retrieves an input LF from the given XML element, processing any * LF chunks along the way. */ public static LF getLfFromElt(Element lfElt) { HyloHelper.processChunks(lfElt); LF lf = HyloHelper.getLF(lfElt); return lf; } //----------------------------------------------------------------- // realization routines /** * Realizes the input LF, * returning the best edge found (or null if none). */ public Edge realize(LF lf) { return realize(lf, this.signScorer); } /** * Realizes the input LF relative to the given sign scorer, * returning the best edge found (or null if none). */ public Edge realize(LF lf, SignScorer signScorer) { Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); int timeLimitToUse = (timeLimitMS != -1) ? timeLimitMS : prefs.getInt(Chart.TIME_LIMIT, Chart.NO_TIME_LIMIT); return realize(lf, signScorer, timeLimitToUse, waitForCompleteEdge); } /** * Realizes the input LF relative to given sign scorer, * returning the best edge found (or null if none) * in the given time limit (in ms), potentially waiting * longer for a complete edge according to the given flag. 
* If a hypertagger is employed, realization proceeds * iteratively through the available beta-best values * within the overall time or edge limit. */ public Edge realize(LF lf, SignScorer signScorer, int timeLimitMS, boolean waitForCompleteEdge) { List preds = HyloHelper.flatten(lf); SignScorer scorerToUse = (signScorer != null) ? signScorer : SignScorer.nullScorer; PruningStrategy strategyToUse = (pruningStrategy != null) ? pruningStrategy : new NBestPruningStrategy(); // realize iteratively with hypertagger, if present if (hypertagger != null) { return realizeWithHypertagger(preds, scorerToUse, strategyToUse, timeLimitMS); } // otherwise make chart, set start time long startTime = System.currentTimeMillis(); chart = new Chart(new EdgeFactory(grammar, preds, scorerToUse), strategyToUse); chart.startTime = startTime; chart.depthFirst = depthFirst; // run request chart.initialize(); chart.combine(timeLimitMS, waitForCompleteEdge); // XXX tmp // if no complete edge, try again gluing fragments // if (!chart.bestEdge.complete()) { // System.out.println("Trying to glue fragments ..."); // chart.reInitForGluing(); // chart.combine(timeLimitMS, waitForCompleteEdge); // } // return best edge return chart.bestEdge; } // XXX tmp switch for gluing private boolean useGluing = Boolean.getBoolean("useGluing"); // iterate through beta-best values until a complete realization is found; // otherwise return the best fragment using the glue rule, or if all else // fails (or not using gluing), greedy fragment joining private Edge realizeWithHypertagger(List preds, SignScorer signScorer, PruningStrategy pruningStrategy, int timeLimitMS) { // get start time long startTime = System.currentTimeMillis(); // get edge limit Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); int edgeLimit = prefs.getInt(Chart.EDGE_LIMIT, Chart.NO_EDGE_LIMIT); // set supertagger in lexicon grammar.lexicon.setSupertagger(hypertagger); // reset beta hypertagger.resetBeta(); // loop until retval set or need to give up Edge retval = null; chart = null; boolean outOfBetas = false; boolean pastTimeLimit = false; boolean exceededEdgeLimit = false; long iterStartTime = 0, currentTime = 0; int iterTime = 0; while (retval == null && !outOfBetas && !pastTimeLimit && !exceededEdgeLimit) { // instantiate chart and set start time for this iteration chart = new Chart(new EdgeFactory(grammar, preds, signScorer, hypertagger), pruningStrategy); iterStartTime = System.currentTimeMillis(); // do realization in packing mode to see if a complete realization // can be found with this hypertagger setting chart.usePacking = true; chart.collectCombos = false; chart.doUnpacking = false; chart.joinFragments = false; // run request chart.initialize(); if (chart.noUncoveredPreds()) chart.combine(timeLimitMS, false); // check time limit currentTime = System.currentTimeMillis(); iterTime = (int) (currentTime - iterStartTime); if (timeLimitMS != Chart.NO_TIME_LIMIT && iterTime >= timeLimitMS) { pastTimeLimit = true; // System.out.println("Went past time limit with ht beta: " + hypertagger.getCurrentBetaValue()); } // check edge limit if (edgeLimit != Chart.NO_EDGE_LIMIT && chart.numEdges >= edgeLimit) { exceededEdgeLimit = true; // System.out.println("Exceeded edge limit with ht beta: " + hypertagger.getCurrentBetaValue()); } // if complete, unpack and return best edge if (chart.bestEdge.complete()) { chart.doUnpacking = true; chart.doUnpacking(); retval = chart.bestEdge; } // otherwise check beta level if still within limits else if (!pastTimeLimit 
&& !exceededEdgeLimit) { // progress to next beta setting, if any if (hypertagger.hasMoreBetas()) { hypertagger.nextBeta(); } else { // otherwise out of betas outOfBetas = true; // System.out.println("Ran out of betas with ht beta: " + hypertagger.getCurrentBetaValue()); } } } // if no result, take desperate measures with fragments if (retval == null) { // try realization with gluing if (useGluing) { // System.out.println("Num edges for final iteration: " + chart.numEdges); // System.out.println("Trying gluing option after iterTime: " + iterTime); chart.reInitForGluing(); // double time and space limits, to give gluing option some room chart.edgeLimit = edgeLimit * 2; chart.combine(timeLimitMS * 2, waitForCompleteEdge); // System.out.println("Num edges after gluing: " + chart.numEdges); currentTime = System.currentTimeMillis(); iterTime = (int) (currentTime - iterStartTime); // if complete, unpack and return best edge if (chart.bestEdge.complete()) { // System.out.println("Unpacking in final iteration after iterTime: " + iterTime); chart.doUnpacking = true; chart.doUnpacking(); retval = chart.bestEdge; } } // otherwise try a final iteration in anytime mode, possibly resorting to joining fragments if (retval == null) { // System.out.println("Trying a final iteration in anytime mode after iterTime: " + iterTime); // instantiate chart and set start time for this iteration chart = new Chart(new EdgeFactory(grammar, preds, signScorer, hypertagger), pruningStrategy); iterStartTime = System.currentTimeMillis(); // run request chart.usePacking = false; chart.joinFragments = true; chart.initialize(); chart.combine(timeLimitMS, waitForCompleteEdge); // System.out.println("Num edges after anytime iteration: " + chart.numEdges); currentTime = System.currentTimeMillis(); iterTime = (int) (currentTime - iterStartTime); // if (chart.bestEdge.complete()) // System.out.println("Found complete edge after iterTime: " + iterTime); // else // System.out.println("Resorting to joined fragments after iterTime: " + iterTime); // return best edge retval = chart.bestEdge; } } // update end time long endTime = System.currentTimeMillis(); chart.timeTilDone = (int) (endTime - startTime); // reset supertagger in lexicon grammar.lexicon.setSupertagger(null); // return return retval; } } ================================================ FILE: src/opennlp/ccg/realize/RuleInstance.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.grammar.*; import opennlp.ccg.hylo.*; import java.util.*; //import java.util.prefs.*; //import java.text.*; /** * A rule instance is a tracker for an instantiated version of a type changing * rule, ie a type changing rule with its semantics instantiated together with * bitsets representing its coverage of the input predicates * and the indices in its arg category, along with lists of the active LF alts. * Such rule instances are created and managed by an EdgeFactory. * The design follows the Singleton pattern. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2005/11/24 03:22:08 $ */ public class RuleInstance extends Tracker { /** The instantiated type changing rule. */ public final TypeChangingRule rule; /** Constructor. */ public RuleInstance(TypeChangingRule rule, BitSet bitset, BitSet indices, List> activeLfAlts) { super(bitset, indices, activeLfAlts); this.rule = rule; } /** Returns '{bitset} name: arg => result'. */ public String toString() { StringBuffer sb = new StringBuffer(); //sb.append(indices + " "); sb.append(bitset + " "); sb.append(rule); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/realize/StemPruningStrategy.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.lexicon.Word; import opennlp.ccg.synsem.Sign; import java.util.*; /** * A diversity pruning strategy that defines signs to be * notCompellinglyDifferent if they have the same sequence of * stems. * The empty constructor defaults the singleBestPerGroup flag * to true. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2011/04/02 16:32:17 $ */ public class StemPruningStrategy extends DiversityPruningStrategy { /** Constructor, defaults singleBestPerGroup to true. */ public StemPruningStrategy() { this(true); } /** Full constructor. */ public StemPruningStrategy(boolean singleBestPerGroup) { this.singleBestPerGroup = singleBestPerGroup; } /** Returns true iff the given signs are not compellingly different. In particular, returns true iff the signs have the same sequence of stems. 
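* (Stems are compared by identity below, which presumes that stem strings are interned, as Word attributes generally are in OpenCCG.)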
*/ public boolean notCompellinglyDifferent(Sign sign1, Sign sign2) { List words1 = sign1.getWords(); List words2 = sign2.getWords(); if (words1.size() != words2.size()) return false; for (int i=0; i < words1.size(); i++) { if (words1.get(i).getStem() != words2.get(i).getStem()) return false; } return true; } } ================================================ FILE: src/opennlp/ccg/realize/Tracker.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.realize; import opennlp.ccg.hylo.*; import java.util.*; /** * A tracker is a wrapper for either a sign (ie, an edge) or * a type changing rule (ie, a rule instance) which has bitsets for * representing the coverage of the input predicates and the semantic indices used. * It also has a list of active LF alts. * Trackers are created by an EdgeFactory. * The design follows the Singleton pattern. * * @author Michael White * @version $Revision: 1.9 $, $Date: 2005/11/24 03:15:26 $ */ public class Tracker { /** The coverage bitset. */ public final BitSet bitset; /** The indices bitset. */ public final BitSet indices; /** The active LF alts. */ public final List> activeLfAlts; /** Constructor. */ public Tracker(BitSet bitset, BitSet indices, List> activeLfAlts) { this.bitset = bitset; this.indices = indices; this.activeLfAlts = activeLfAlts; } /** * Returns whether the coverage bitset of this tracker intersects with the * coverage bitset of the given one. */ public boolean intersects(Tracker tracker) { return bitset.intersects(tracker.bitset); } /** * Returns whether the indices bitset of this tracker intersects with the * indices bitset of the given one, if both non-empty; otherwise, returns * true (if either this tracker or the given one has no indices). 
*/ public boolean indicesIntersect(Tracker tracker) { return indices.isEmpty() || tracker.indices.isEmpty() || indices.intersects(tracker.indices); } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/FeatureExtractionException.java ================================================ package opennlp.ccg.realize.hypertagger; public class FeatureExtractionException extends Exception { public FeatureExtractionException(String string) { super(string); } public FeatureExtractionException() { super(); } /** * */ private static final long serialVersionUID = 1L; } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/LFInfo.java ================================================ package opennlp.ccg.realize.hypertagger; import opennlp.ccg.synsem.LF; /** * @author espinosa * This class encapsulates a LF and its corresponding gold standard predicate info, if available. */ public class LFInfo { LF lf; String fullWords; String lfNum; public LFInfo(LF lf, String fullWords, String lfNum) { this.lf = lf; this.fullWords = fullWords; this.lfNum = lfNum; } public LF getLF() { return this.lf; } public String getFullWords() { return this.fullWords; } public String getLFNum() { return this.lfNum; } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/LFLoader.java ================================================ package opennlp.ccg.realize.hypertagger; import java.io.File; import java.io.FileFilter; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.hylo.HyloHelper; import opennlp.ccg.realize.Realizer; import opennlp.ccg.synsem.LF; import org.jdom.Document; import org.jdom.Element; /** * @author espinosa * This class abstracts over a collection of LFs contained in a collection of files. */ public class LFLoader implements Iterator { static class XmlFilenameFilter implements FileFilter { public boolean accept(File f) { return f.getName().toLowerCase().endsWith(".xml"); } } Grammar grammar; ArrayList lfFiles; int filePos = 0; LinkedList lfs; int total = 0; int skipped = 0; /** * Constructs a new LFLoader which will load LFs from a collection of files or directories under a base directory. * @param grammarFile The grammar to use * @param baseDir The base directory. Paths will be interpreted relative to this directory. * @param paths The files to load the LFs from. Directories or files can be given. Directories are not searched recursively. Only files ending * in .xml will be loaded. 
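* Comma-separated path entries are split into individual paths, and surrounding whitespace is trimmed.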
*/ public LFLoader(File grammarFile, File baseDir, List paths) { lfs = new LinkedList(); URL grammarURL = null; try { grammarURL = grammarFile.toURI().toURL(); } catch (MalformedURLException e1) { e1.printStackTrace(); } try { grammar = new Grammar(grammarURL); } catch (Exception e) { e.printStackTrace(); } lfFiles = new ArrayList(); paths = normalize(paths); for (String lfFilename : paths) { // if this argument is a directory, load all XML files from it File f = new File(baseDir, lfFilename); if(f.isDirectory()) { lfFiles.addAll(Arrays.asList(f.listFiles(new XmlFilenameFilter()))); } else { lfFiles.add(f); } } } private List normalize(List paths) { ArrayList ret = new ArrayList(); for(String s: paths) { if(s.indexOf(',') < 0) { ret.add(s.trim()); } else { // explode comma-separated values into separate strings String[] fields = s.split(","); for(String f : fields) { ret.add(f.trim()); } } } return ret; } @SuppressWarnings("unchecked") private void loadFile(File lfFile) { Document doc = null; int n = 0; try { doc = grammar.loadFromXml(lfFile.getAbsolutePath()); } catch (IOException e) { // if there's a problem, just skip this file System.err.println("Couldn't open input file " + lfFile + ", skipping.\n"); return; } catch (Exception e) { e.printStackTrace(); return; } Element root = doc.getRootElement(); List testItems = root.getChildren(); // Iterate through test item LFS and print to file/stdio tags predicted // by the hypertagger for (Element item : testItems) { String lfNum = "unk"; lfNum = item.getAttributeValue("info"); Element itemLFElt = item.getChild("lf"); //Element itemFullWordsElt = item.getChild("full-words"); Element itemPredInfoElt = item.getChild("pred-info"); //String sentId = itemFullWordsElt.getAttributeValue("info"); //String fullWords = itemFullWordsElt.getTextNormalize(); // mww: extra null check String predInfo = null; if (itemPredInfoElt != null) predInfo = itemPredInfoElt.getAttributeValue("data"); //String predInfo = itemPredInfoElt.getAttributeValue("data"); if(predInfo == null || predInfo.equals("")) { /* because this class is used to load LFs for training purposes, we can't continue without the gold-std info */ // mww: added info: lfNum System.err.println("No pred-info found for lf #" + n + " (info: " + lfNum + ") in file " + lfFile + ", skipping."); skipped++; continue; } // mww: added try-catch block try { LF lf = Realizer.getLfFromElt(itemLFElt); LF flatLF = HyloHelper.flattenLF(lf); lfs.offer(new LFInfo(flatLF, predInfo, lfNum)); } catch (Exception exc) { System.err.println("Skipping lf #" + n + " (info: " + lfNum + ") in file " + lfFile + ", uncaught exception:"); System.err.println(exc.getMessage()); exc.printStackTrace(System.err); skipped++; continue; } n++; total++; } System.err.println("LFL: loaded " + n + " LFs from " + lfFile); } /* two cases: * - if there's an LF in the queue, return it * - if there isn't, load the next file -- BUT -- if the LF queue is still empty, load the next file, and so on */ public boolean hasNext() { if(!lfs.isEmpty()) { return true; } // queue is empty, load next file while(lfs.isEmpty()) { if(filePos == lfFiles.size()) { return false; // no more files } loadFile(lfFiles.get(filePos)); filePos++; } return true; } // this method returns null when no more LFs can be loaded public LFInfo next() { if(!lfs.isEmpty()) { return lfs.poll(); } while(lfs.isEmpty()) { if(filePos == lfFiles.size()) { return null; } loadFile(lfFiles.get(filePos)); } return lfs.poll(); } public void remove() { // NOT IMPLEMENTED throw new 
RuntimeException("Method not implemented"); } public int getTotal() { return this.total; } public int getSkipped() { return this.skipped; } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/LMFactorExtractor.java ================================================ package opennlp.ccg.realize.hypertagger; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; /* this is a copy of TagExtract, modified to extract SRILM-format data using the full-words * element from XML-format LFs. * The input is a file in which each line is a "pred-info"-style line, one per LF. * The input file is given as the sole commandline argument. * The factors are written to stdout. */ public class LMFactorExtractor { public static void main(String args[]) { BufferedReader rd = null; int lineNum = 0; int bNum = 0; try { rd = new BufferedReader(new FileReader(new File(args[0]))); } catch (Exception e) { e.printStackTrace(); System.exit(1); } String line = null; StringBuilder out; while(true) { try { line = rd.readLine(); } catch(IOException e) { e.printStackTrace(); System.exit(1); } lineNum++; if(line == null) { break; } if(line.matches("^\\s*$")) { continue; } out = new StringBuilder(); out.append(" "); String[] fields = line.split("\\s+"); bNum = 0; for(String f : fields) { bNum++; String[] info = f.split(":"); if(info.length != 4) { System.err.println("Wrong number of fields encountered in input line " + lineNum + ", bundle " + bNum); System.exit(1); } out.append(info[3]); out.append(":S-"); out.append(info[3]); out.append(":P-"); out.append(info[2]); out.append(":T-"); out.append(info[1]); out.append(" "); } out.append(""); System.out.println(out.toString()); } } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/TagExtract.java ================================================ package opennlp.ccg.realize.hypertagger; import java.io.BufferedWriter; import java.io.File; import java.io.FileFilter; import java.io.FileWriter; import java.io.IOException; import java.util.Comparator; import static java.util.Arrays.*; import joptsimple.*; import opennlp.ccg.realize.hypertagger.TagExtractor; import opennlp.ccg.synsem.LF; import opennlp.ccg.util.Pair; public class TagExtract { private static boolean quiet = false; // when true, suppress stderr messages @SuppressWarnings("unused") private static File posModelFile; private static File hyperModelFile; private static File posPriorModelFile; private static File htPriorModelFile; private static File posVocabFile; private static File htVocabFile; private static String argnames; private TagExtractor tex; private BufferedWriter output; @SuppressWarnings({ "unchecked", "rawtypes" }) static class PairComparator implements Comparator { @SuppressWarnings("boxing") public int compare(Pair p, Pair q) { if(p.a > q.a) { return 1; } if(p.a == q.a) { return 0; } return -1; } public int compare(Object p, Object q) { return this.compare((Pair) p, (Pair)q ); } } static class XmlFilenameFilter implements FileFilter { public boolean accept(File f) { return f.getName().toLowerCase().endsWith(".xml"); } } public TagExtract(TagExtractor t) { this.tex = t; } /* TODO: this method should probably be rewritten to use LFLoader and a config file */ public static void main(String[] args) throws IOException { TagExtract t = null; //PrintStream output = System.out; BufferedWriter output; //int lfcount = 0; //int lfNum = 0; // option processing OptionParser o = new 
OptionParser(); o.acceptsAll(asList("help", "h"), "this message"); o.acceptsAll(asList("quiet", "q"), "print no messages"); o.acceptsAll(asList("pos", "pos"), "extract POS features"); OptionSpec pos_s = o.acceptsAll(asList("p", "pos-model")).withRequiredArg().ofType(File.class).describedAs("POS model to use"); OptionSpec outf = o.acceptsAll(asList("o", "output")).withRequiredArg().ofType(File.class).describedAs("output file"); OptionSpec posPrior_s = o.acceptsAll(asList("P", "pos-prior")).withRequiredArg().ofType(File.class).describedAs("POS prior model to use"); OptionSpec ht_s = o.acceptsAll(asList("y", "hyper-model")).withRequiredArg().ofType(File.class).describedAs("HT model to use as input to 2-pass model (see README)"); OptionSpec htPrior_s = o.acceptsAll(asList("H", "ht-prior")).withRequiredArg().ofType(File.class).describedAs("HT prior model to use"); OptionSpec gr_s = o.acceptsAll(asList("g", "grammar")).withRequiredArg().ofType(File.class).describedAs("grammar filename"); OptionSpec ht_vocab_s = o.acceptsAll(asList("V", "ht-prior-vocab")).withRequiredArg().ofType(File.class).describedAs("HT prior vocab filename"); OptionSpec pos_vocab_s = o.acceptsAll(asList("v", "pos-prior-vocab")).withRequiredArg().ofType(File.class).describedAs("POS prior vocab filename"); OptionSpec corpusDir_s = o.acceptsAll(asList("d", "lf-dir")).withRequiredArg().ofType(File.class).describedAs("Directory to change to before searching for XML files"); OptionSpec argnames_s = o.acceptsAll(asList("an", "argnames")).withRequiredArg().describedAs("Names of argument roles in format name(:shortname)?"); OptionSet options = o.parse(args); /* if -h (help) is given, print message and exit */ if (options.has("h") || args.length == 0) { o.printHelpOn(System.out); System.out.println("See the README for additional information."); System.exit(0); } output = new BufferedWriter(new FileWriter(options.valueOf(outf))); // some of these will be nulls, depending on what the user is trying to do hyperModelFile = options.valueOf(ht_s); posModelFile = options.valueOf(pos_s); posPriorModelFile = options.valueOf(posPrior_s); posVocabFile = options.valueOf(pos_vocab_s); htPriorModelFile = options.valueOf(htPrior_s); htVocabFile = options.valueOf(ht_vocab_s); argnames = options.valueOf(argnames_s); if(options.has("q")) quiet = true; LFLoader lfs = new LFLoader(options.valueOf(gr_s), options.valueOf(corpusDir_s), options.nonOptionArguments()); if(options.has("pos")) { TagExtractor tex = new ZLPOSTagger(); // mww: set arg names if (argnames != null) debug("Setting arg names to " + argnames + "\n"); tex.setArgNames(argnames); // uses default names if null if(posPriorModelFile != null && posVocabFile != null) { debug("Loading POS model priors from " + posPriorModelFile + "\n"); debug("Loading POS model vocab from " + posVocabFile + "\n"); tex.loadPriorModel(posPriorModelFile, posVocabFile); } debug("Extracting POS features..." 
+ "\n"); t = new TagExtract(tex); } else { // extracting hypertags // using GS pos tags TagExtractor tex = new ZLMaxentHypertagger(); // mww: set arg names if (argnames != null) debug("Setting arg names to " + argnames + "\n"); tex.setArgNames(argnames); // uses default names if null if(htPriorModelFile != null && htVocabFile != null) { debug("Loading HT model priors from " + htPriorModelFile + "\n"); debug("Loading HT model vocab from " + htVocabFile + "\n"); tex.loadPriorModel(htPriorModelFile,htVocabFile); } if(hyperModelFile != null) { debug("Loading proto-HT model from " + hyperModelFile + "\n"); tex.loadProtoModel(hyperModelFile); } debug("Extracting hypertagger features..." + "\n"); t = new TagExtract(tex); } t.setOutput(output); while(lfs.hasNext()) { LFInfo lfi = lfs.next(); LF lf = lfi.getLF(); try { //lfNum++; t.extract(lf, lfi.getFullWords()); //lfcount++; //debug("LFs extracted: " + lfcount + "\r"); } catch (FeatureExtractionException e) { debug("In LF #" + lfi.getLFNum() + ":\n"); debug(e.toString() + "\n"); } } output.close(); debug("\n"); } private void extract(LF flatLF, String fullWords) throws FeatureExtractionException { tex.storeGoldStdPredInfo(fullWords); tex.setLF(flatLF); try { output.write(tex.getAllFeaturesAndAnswer()); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } private void setOutput(BufferedWriter output) { this.output = output; } public static void debug(String string) { if(!quiet) System.err.print(string); } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/TagExtractor.java ================================================ package opennlp.ccg.realize.hypertagger; import java.io.File; import java.util.*; import opennlp.ccg.hylo.*; import opennlp.ccg.synsem.*; import opennlp.ccg.util.Pair; /** This class contains methods for extracting features from a logical form * * @author espinosa * */ public abstract class TagExtractor { protected class LfGraphLink { String label; // eg. "genrel" String arg; // if applicable LfGraphNode target; // graph node it points to LfGraphNode source; // node it extends from public LfGraphLink(String l, String a, LfGraphNode t) { label = l; arg = a; target = t; } public LfGraphNode getTarget() { return target; } public void setTarget(LfGraphNode t) { target = t; } public String getLabel() { return label; } public LfGraphNode getSource() { return source; } public void setSource(LfGraphNode source) { this.source = source; } } protected class LfGraphNode { String data; String predicateName; int index; // from original LF SatOp pred; // original predicate object ArrayList children; LfGraphLink parentLink; HashMap attribs; protected String POS; // caches the part-of-speech tag for this node (or GS postag) protected String ST; // gold-standard supertag ArrayList> POSList; // not used atm Set> STList; // for 2-pass tags public Set> getSTList() { return STList; } public void setSTList(Set> set) { STList = set; } ArrayList multiparents; FeatureList features; private String id; // e.g. 
"w1" protected FeatureList getFeatures() { return features; } protected void setFeatures(FeatureList features) { this.features = features; } public LfGraphNode(String s, int idx) { data = s; // this is just "w1" or whatever index = idx; children = new ArrayList(); attribs = new HashMap(); POS = null; multiparents = new ArrayList(); } public String getData() { return data; } public void setData(String s) { data = s; } public void addAttribute(String name, String value) { attribs.put(name, value); } public void addChild(LfGraphLink link) { children.add(link); } public ArrayList getChildren() { return children; } public int getNumChildren() { return children.size(); } public boolean isLeafNode() { if(children.isEmpty()) { return true; } else { return false; } } /* public boolean isLexPred() { return HyloHelper.isLexPred(data); }*/ public LfGraphNode findNode(BitSet b) { // intended to be run from the root node; only searches // nodes below this one // Say we want a node whose index is between 1-4 inclusive // pass a bitset with those bits set. Returns the lex pred // node whose index is in the bitset. if(b.get(index)) { return this; } for(LfGraphLink n : children) { n.getTarget().findNode(b); } return null; } public HashMap getAttribs() { return attribs; } public void setAttribs(HashMap attribs) { this.attribs = attribs; } public int getIndex() { return index; } public void setIndex(int index) { this.index = index; } public void setChildren(ArrayList children) { this.children = children; } public String getPredicateName() { return predicateName; } public void setPredicateName(String predicateName) { this.predicateName = predicateName; } public LfGraphLink getParentLink() { return parentLink; } public void setParentLink(LfGraphLink parentLink) { this.parentLink = parentLink; } public void addMultiParent(LfGraphNode parentNode) { // XXX should not be necessary! fix bug & eliminate if(parentNode == this) { return; } this.multiparents.add(parentNode); } public ArrayList getMultiParents() { return this.multiparents; } public SatOp getPred() { return pred; } protected void setPred(SatOp pred) { this.pred = pred; } public String getPOS() { return POS; } public void setPOS(String pos) { POS = new String(pos); } public ArrayList> getPOSList() { return POSList; } public void setPOSList(ArrayList> plist) { this.POSList = plist; } public void setID(String string) { this.id = string; } public String getID() { return this.id; } } public static int LFID = 0; /** Implements a list of features. Keys are short strings, e.g. "CN", "FO", ... * Values are arbitrary strings, but a key can have multiple values. Thus, the values are actually of type ArrayList. 
* @author espinosa * */ protected class FeatureList extends HashMap> { /** * */ private static final long serialVersionUID = 1L; int numFeatures = 0; // deprecated protected void addFeature(String featureName, String value) { /*if(value == null) { return; } if(this.get(featureName) == null) { this.put(featureName.trim(), new ArrayList()); } ArrayList feats = this.get(featureName); feats.add(value.trim()); this.put(featureName.trim(), feats); numFeatures++; */ addFeatureWithProb(featureName, value); } protected void addFeatures(String featureName, ArrayList> values) { for(Pair v : values) { this.addFeatureWithProb(featureName, v.a, v.b ); } } // merge f's features into self /* protected void addFeatures(FeatureList f) { Set>> es = this.entrySet(); for(Map.Entry> e : es) { String fn = e.getKey(); for(String fv : e.getValue()) { this.addFeature(fn, fv); } } } */ protected void addFeatureWithProb(String featureName, String value, Double prob) { if(value == null) { return; } if(this.get(featureName) == null) { this.put(featureName.trim(), new ArrayList()); } ArrayList feats = this.get(featureName); feats.add(value.trim() + ":" + prob.toString()); this.put(featureName.trim(), feats); numFeatures++; } protected void addFeatureWithProb(String featureName, String value) { this.addFeatureWithProb(featureName, value, new Double(1.0)); } protected ArrayList getFeature(String featureName) { return this.get(featureName); } protected String getSingleFeature(String featureName) { return this.get(featureName).get(0); } protected String getAllFeatures() { String output = ""; for(String k : this.keySet()) { for(String v : this.get(k)) { output = output.concat(k + "=" + v); output = output.concat(" "); } } return output; } protected String[] getAllFeaturesForMaxent() { String[] out = new String[this.getNumFeatures()]; int i = 0; for(String k : this.keySet()) { for(String v : this.get(k)) { out[i] = k + "=" + v; i++; } } return out; } protected int getNumFeatures() { return numFeatures; } } // fields protected LfGraphNode lfGraph; // stores the graph structure of the LF protected HashMap nomTable; protected HashMap varTable; protected HashMap lexpairs; protected HashMap pospairs; protected LF lf; protected List preds; protected List flatLF; protected int maxIndex; protected Map argNameMap; // mww: map from arg names to short arg names public TagExtractor() { argNameMap = new HashMap(); } public void setLF(LF lf) throws FeatureExtractionException { LFID++; // experimental HyloHelper.convertNominals(lf); setLF(HyloHelper.getPreds(lf)); } /** This method takes an LF and extracts its features, changing the internal state * of this object accordingly. Other methods such as getSupertag() can then * be called to obtain the tagger's prediction. * @param preds A logical form * @throws FeatureExtractionException when the logical form cannot be processed and the extracted features will not be meaningful */ public void setLF(List preds) throws FeatureExtractionException { this.preds = preds; LfGraphNode curNode = null; curNode = null; nomTable = new HashMap(); varTable = new HashMap(); int i = 0; // Pass 1: find and store nominals for(SatOp s: preds) { if(s == null) { throw new FeatureExtractionException(); // ??? why is it sometimes null? 
} if(s.getArg() instanceof Proposition) { LfGraphNode thisNode = new LfGraphNode(s.getNominal().toString(), i); thisNode.setPredicateName(((Proposition)s.getArg()).getName()); thisNode.setID(s.getNominal().toString()); thisNode.setPred(s); if(lfGraph == null) { // i.e., the first node processed lfGraph = thisNode; } nomTable.put(thisNode.getID(), thisNode); } i++; } this.maxIndex = i; // Pass 2: traverse all other nodes, linking to nominals found in pass 1 i = 0; for(SatOp s: preds) { if(s == null) { throw new FeatureExtractionException(); } if(s.getArg() instanceof Proposition) { curNode = nomTable.get(s.getNominal().toString()); i++; continue; } Diamond d = (Diamond)s.getArg(); // not sure how this could happen, but it did if(d == null || (d.getArg() == null)) { throw new FeatureExtractionException(); } if(d.getArg() instanceof NominalVar) { // XXX all this is probably defunct and can safely be deleted // add multiparent // might need getName() instead of toString() //System.err.println("found var: str = " + d.getArg().toString()); LfGraphNode target = nomTable.get(d.getArg().toString()); LfGraphLink ln = new LfGraphLink(d.getMode().toString(), null, target); ln.setSource(curNode); //target.addMultiParent(ln); } else if(d.getArg() instanceof Nominal) { // make link labeled to nominal LfGraphNode target = nomTable.get(d.getArg().toString()); LfGraphLink ln = new LfGraphLink(d.getMode().toString(), null, target); ln.setSource(curNode); if(target != null) { // ??? why is it sometimes null? and if it is, should an exception be thrown? target.setParentLink(ln); target.addMultiParent(curNode); } else { //System.err.println("TE: target was null: " + d.getArg().toString()); } if(curNode == null) { throw new FeatureExtractionException();} curNode.addChild(ln); } else { // proposition // add attr to current node try { curNode.addAttribute(d.getMode().toString(), d.getArg().toString()); } catch(Exception e) { // ??? this null must occur because there were no attributes, but this // probably isn't the best way to handle it } } // don't change curNode here i++; } } public LF getLF() { return this.lf; } /** This method extracts features from a node in the graph and returns * them as an array of strings. It's implemented slightly differently * in HyperTagger and POSTagExtractor. * @param n The node from which to extract the features. * @return A list of features. */ protected abstract FeatureList getFeatures(LfGraphNode n); /** Get the features for the index'th node * * @param index The index into the LF */ public FeatureList getFeatures(int index) { for(LfGraphNode n : nomTable.values()) { if(n.getIndex() == index) { return getFeatures(n); } } return null; // bad index } // these methods only make sense for training, because // they fetch the gold-standard supertag or POS tag. 
public String getSupertag(LfGraphNode n) { return lexpairs.get(n); } public String getSupertag(int index) { for(LfGraphNode n : nomTable.values()) { if(n.getIndex() == index) { return lexpairs.get(n); } } return null; } public String getPOStag(LfGraphNode n) { return pospairs.get(n); } public String getPOStag(int index) { for(LfGraphNode n : nomTable.values()) { if(n.getIndex() == index) { return pospairs.get(n); } } return null; } // return the node with the given index protected LfGraphNode findNode(int index) { for(LfGraphNode n : nomTable.values()) { if(n.getIndex() == index) { return n; } } return null; } protected int numNodes() { return nomTable.size(); } protected int maxIndex() { return this.maxIndex; } // utility method to do the inverse of java.String.split() public static String join(ArrayList a, String delimiter) { String out = new String(); int i; for(i = 0; i < a.size(); i++) { out = out.concat(a.get(i)); if(i != a.size() - 1) { out = out.concat(delimiter); } } return out; } /** * @param predInfo * @throws FeatureExtractionException * This method stores gold standard predicate info, which is expected to be in the following format: * predInfo :: (field)+ * field :: wordId:supertag:POStag:predName * * The supertag and postag are expected to have been escaped by Lexicon.DefaultTokenizer.escape(), and will be unescaped during storage. * This method throws a runtime error if the predInfo string cannot be parsed. * */ public abstract void storeGoldStdPredInfo(String predInfo); public abstract String getAllFeaturesAndAnswer(); public abstract void loadPriorModel(File priorModelFile, File vocabFile); public void loadProtoModel(File hyperModelFile) { // TODO refactor TagExtract app so this isn't needed return; } // mww: sets configurable arg names /** * Sets the arg name map to the given names. * @param argnames Space-delimited arg names in format name(:shortname)?. * Defaults to "Arg0:A0 Arg1:A1 Arg1a:A1a Arg1b:A1b Arg2:A2 Arg2a:A2a Arg2b:A2b Arg3:A3 Arg4:A4 Arg5:A5". 
*/ protected void setArgNames(String argnames) { argNameMap.clear(); // default is augmented propbank arg names if (argnames == null) argnames = "Arg0:A0 Arg1:A1 Arg1a:A1a Arg1b:A1b Arg2:A2 Arg2a:A2a Arg2b:A2b Arg3:A3 Arg4:A4 Arg5:A5"; String[] nameslist = argnames.split("\\s+"); for (String argname : nameslist) { String[] namepair = argname.split(":"); if (namepair.length == 2) argNameMap.put(namepair[0], namepair[1]); else if (namepair.length == 1) argNameMap.put(namepair[0], namepair[0]); } } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/ZLMaxentHypertagger.java ================================================ package opennlp.ccg.realize.hypertagger; import static java.util.Arrays.asList; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.hylo.SatOp; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.lexicon.Word; import opennlp.ccg.parse.supertagger.io.XMLPOSDictionaryReader; import opennlp.ccg.parse.supertagger.io.XMLWordDictionaryReader; import opennlp.ccg.parse.supertagger.ml.STPriorModel; import opennlp.ccg.parse.supertagger.util.STTaggerPOSDictionary; import opennlp.ccg.parse.supertagger.util.STTaggerWordDictionary; import opennlp.ccg.parse.tagger.util.ConfigFileProcessor; import opennlp.ccg.parse.tagger.util.ResultSink; import opennlp.ccg.realize.Hypertagger; import opennlp.ccg.util.Pair; /** * * @author espinosa * This class implements the hypertagger. Instantiating a hypertagger requires several external * files: *
    * 1. A POS model
    * 2. A prior model for POS tags, and its vocab file
    * 3. A prior model for hypertags, and its vocab file
    * The prior model files are optional.
    * To use the hypertagger for realization, instantiate a POS tagger first, then * instantiate the hypertagger using that POS tagger. Example in pseudo-code: * * POSPriorModel ppm = new POSPriorModel(String flmFile, String vocabFile); * ZLMaxentModel posMod = new ZLMaxentModel(String fileName); * ZLPOSTagger pt = new ZLPOSTagger(posMod, ppm); * ZLMaxentHypertagger ht = new ZLMaxentHypertagger(pt, File htModelPath); * ht.loadPriorModel(File priorModelFile, File vocabFile); * * The tagger can also be instantiated from a config file. This is the recommended * method for using the tagger as part of the realizer. See the method 'ZLMaxentHypertaggerFactory'. * * */ public class ZLMaxentHypertagger extends TagExtractor implements Hypertagger { File hypertagModelFilename; File posModelFilename; public ZLMaxentModel hypertagModel; // null if extracting feats for training ZLMaxentModel posModel; // null if extracting feats for training ZLPOSTagger postagger; ZLMaxentModel protoHTModel; double protoHTBeta = 0.01; // FIXME what should this be? LfGraphNode currentPred; String LFNum; protected double[] betas; int currentBeta; //Flag which indicates whether gold std tags need to be ensured during tag prediction //Gold std tag info for perceptron training (event generation) private boolean goldStdTagInsert=false; //Nominal id to gold std supertag mapping private HashMap goldPred2Tag; private STPriorModel priorModel; double priorBeta = 0.4; private HashMap goldPredPOS; private STTaggerWordDictionary wdict; // word (pred)-level tagging dictionary private STTaggerPOSDictionary posdict; // pos-level tagging dictionary int dictK; // frequency threshold for tagdict lookups public BufferedWriter tdErr; public class ProbIndexPair implements Comparable { public double prob; public int index; public ProbIndexPair(double prob, int index) { this.prob = prob; this.index = index; } public int compareTo(Object o) { if(prob < ((ProbIndexPair)o).prob) { return -1; } else if(prob == ((ProbIndexPair)o).prob) { return 0; } else { return 1; } } } /** * @author espinosa * This is a singleton class containing functions for extracting various features * from the LF graph nodes. These * feature functions are called from ZLMaxentHypertagger#getFeatures. * * Some of the functions extract several features at once, to avoid unnecessary iteration. * Feature template abbreviations: * * FO -- fan-out, i.e. number of children * PN -- predicate name * RN -- parent name * CT -- type of child * A1N, A2N, ... -- Arg1 name, Arg2 name, ... (by default) * A1P, A2P, ... -- Arg1 POS tag, Arg2 POS tag, ... 
(by default) * MP -- Modifier POS tag (non-arg children) * PP -- parent's POS tag, if any parent * CN -- name of child * NA -- number of Argument children * PT -- POS tag (see docs) * ZD -- det=value * ZM -- mood=value * ZN -- num=value * ZT -- tense=value * ZP -- partic=value * XC -- semantic class of node, if applicable * XnD -- semantic class of argument child node n, if applicable (by default) * XP -- semantic class of parent node, if applicable * XM -- semantic class of non-arg child node, if applicable * CS -- child supertag * PS -- parent supertag * AS -- argument supertag * MS -- modifier supertag * */ // mww: switched to configurable arg names void fillFeatures(LfGraphNode n, FeatureList f) { f.addFeatureWithProb("FO", Integer.toString(n.getNumChildren())); for(String att : n.getAttribs().keySet()) { f.addFeatureWithProb("Z" + att.substring(0,1).toUpperCase(), n.getAttribs().get(att)); } f.addFeatureWithProb("PN", n.getPredicateName()); if(n.getMultiParents().size() > 0) { for(LfGraphNode parent : n.getMultiParents() ) { f.addFeatureWithProb("RN", parent.getPredicateName()); /* the line below will add the parent's best-ranked POS tag with prob=1.0 */ f.addFeatureWithProb("PP", getPOS(parent)); /* The code below will add the parent's best-ranked POS tag with its actual probability */ /* ArrayList> poslist = getBetaBestPOS(parent); Pair pospair = poslist.get(0); feats.addFeatureWithProb("PP", pospair.a, pospair.b); */ // add class name, if available String cls = parent.getPred().getNominal().toString(); // string is in X:Y:Z format. Remove 'X:' leaving 'Y:Z'. if(cls != null && cls.indexOf(':') > 0) { String cfeat = cls.substring(cls.indexOf(':') + 1); f.addFeatureWithProb("XP", cfeat); } } } else { f.addFeatureWithProb("RN", "0"); } int argchildren = 0; for(LfGraphLink lnk : n.getChildren()) { f.addFeatureWithProb("CT", lnk.getLabel()); if(lnk.getTarget() != null) { // how could it be null? f.addFeatureWithProb("CN", lnk.getTarget().getPredicateName()); // mww: use short arg name String shortArgName = argNameMap.get(lnk.getLabel()); if (shortArgName != null) { // increment argchild count argchildren++; f.addFeatureWithProb(shortArgName + "N", lnk.getTarget().getPredicateName()); f.addFeatureWithProb(shortArgName + "P", getPOS(lnk.getTarget()) ); // add class info for arg child, if applicable String cls = lnk.getTarget().getPred().getNominal().toString(); // string is in X:Y:Z format. Remove 'X:' leaving 'Y:Z'. if(cls != null && cls.indexOf(':') > 0) { String cfeat = cls.substring(cls.indexOf(':') + 1); // mww: for backwards compatibility String argNumOrName = (shortArgName.startsWith("A")) ? shortArgName.substring(1) : shortArgName; f.addFeatureWithProb("X" + argNumOrName + "D", cfeat); } } else { // not an argument child f.addFeatureWithProb("MP", getPOS(lnk.getTarget())); // add class info for non-arg child, if applicable String cls = lnk.getTarget().getPred().getNominal().toString(); // string is in X:Y:Z format. Remove 'X:' leaving 'Y:Z'. 
if(cls != null && cls.indexOf(':') > 0) { String cfeat = cls.substring(cls.indexOf(':') + 1); f.addFeatureWithProb("XM", cfeat); } } } } f.addFeatureWithProb("NA", Integer.toString(argchildren)); if(posModel == null) { f.addFeatureWithProb("PT", getPOS(n)); } else { ArrayList> poslist = getBetaBestPOS(n); for(Pair pospair : poslist) { // add PT=NNP:0.7, PT=NN:0.6, etc f.addFeatureWithProb("PT", pospair.a, pospair.b); n.setPOS(pospair.a); // take the top choice to be "the pos tag", used later for prior } } // class name feature for node, if available Nominal idx = n.getPred().getNominal(); String cls = idx.toString(); // string is in X:Y:Z format. Remove 'X:' leaving 'Y:Z'. if(cls != null && cls.indexOf(':') > 0) { String cfeat = cls.substring(cls.indexOf(':') + 1); f.addFeatureWithProb("XC", cfeat); } // prior features if(priorModel != null) { Word w = Word.createWord(n.getPredicateName(), null, null, n.getPredicateName(), getPOS(n), null, null); priorModel.computePriors(w); List> tags = priorModel.getBetaBestPriors(w, priorBeta); for(Pair t : tags) { f.addFeatureWithProb("PR_ST", DefaultTokenizer.unescape(t.a), t.b); } } } private void fillTwoPassFeatures(LfGraphNode n, FeatureList f) { // get the STs from the STList for(Pair p : n.getSTList()) { f.addFeatureWithProb("ST", p.a, p.b); } // get parent STs for(LfGraphNode pl : n.getMultiParents()) { for(Pair p : pl.getSTList()) { f.addFeatureWithProb("STP", p.a, p.b); } } for(LfGraphLink cl : n.getChildren()) { /* for(Pair p : cl.getTarget().getSTList()) { */ LfGraphNode c = cl.getTarget(); if(c != null) { for(Pair p : c.getSTList()) { f.addFeatureWithProb("STC", p.a, p.b); } } } } public ZLMaxentHypertagger() { super(); betas = new double[7]; // values determined by DNM 20 April 2008). betas[0] = 0.16; betas[1] = 0.05; betas[2] = 0.0058; betas[3] = 0.00175; betas[4] = 0.000625; betas[5] = 0.000125; betas[6] = 0.000058; currentBeta = 0; goldPred2Tag = new HashMap(); goldPredPOS = new HashMap(); } /** This constructor loads both a POS-tagging model and a hypertagging model. * * @param posModelFile The filename from which to load the POS-tagging model. If null, * gold-stardard POS tags will be used (and must be stored via storeGoldStdPredInfo()). * @param hyperModelFile The filename from which to load the hypertagging model * @throws IOException If any model fails to load. */ public ZLMaxentHypertagger(File posModelFile, File hyperModelFile) throws IOException { this(); this.posModelFilename = posModelFile; // load the models if(posModelFile != null) { this.posModel = new ZLMaxentModel(); this.posModel.load(posModelFile); this.postagger = new ZLPOSTagger(posModel); postagger.setPrefixLength(4); postagger.setSuffixLength(4); postagger.argNameMap = this.argNameMap; // share the arg name map } this.hypertagModelFilename = hyperModelFile; this.hypertagModel = new ZLMaxentModel(); this.hypertagModel.load(hyperModelFile); } /* POS tagger can be null. In that case, gold-standard POS tags will be used. 
* To use in "realization mode", do the following: * 1) instantiate a ZLPOSTagger as normal * 2) load the POS prior model into the POS tagger * 3) Instantiate the ZLMaxentHypertagger with this constructor, passing the POS tagger * and the path to the trained hypertagging model file * 4) load the ht-prior model into the ZLMaxentHypertagger using setPriorModel(vocab, priorfile) * 5) commence tagging via setLF() and getSupertags() */ public ZLMaxentHypertagger(ZLPOSTagger ptag, File hyperModelFile) { this(); this.postagger = ptag; this.postagger.argNameMap = this.argNameMap; // share the arg name map this.hypertagModelFilename = hyperModelFile; this.hypertagModel = new ZLMaxentModel(); this.hypertagModel.load(hyperModelFile); } /** This constructor loads only a POS-tagging model. It's useful for extracting features for Maxent training. * * @param posModelFile The filename from which to load the model. * @throws IOException If the model couldn't be loaded for any reason. */ public ZLMaxentHypertagger(File posModelFile) throws IOException { this(); this.posModelFilename = posModelFile; // load the model this.posModel = new ZLMaxentModel(); this.posModel.load(posModelFile); this.postagger = new ZLPOSTagger(posModel); postagger.setPrefixLength(4); postagger.setSuffixLength(4); postagger.argNameMap = this.argNameMap; // share the arg name map } public static ZLMaxentHypertagger ZLMaxentHypertaggerFactory(String configFile) throws IOException { ZLMaxentHypertagger hypertagger = new ZLMaxentHypertagger(); ZLPOSTagger postagger = null; String[] pathKeys = { "priormodel", "priormodelvocab", "wdict", "posdict", "maxentmodel", "posconfig" }; Map opts = ConfigFileProcessor.readInConfig(configFile, pathKeys); // load the POS model first String posConfig = opts.get("posconfig"); if(posConfig != null) { postagger = ZLPOSTagger.ZLPOSTaggerFactory(posConfig); } hypertagger.postagger = postagger; // now load the prior models and/or tag dicts // if prior models are specified, then tagdicts are not used, or even loaded String priorModelPath = opts.get("priormodel"); String wdictPath = opts.get("wdict"); if(priorModelPath != null) { String vocabPath = opts.get("priormodelvocab"); if(vocabPath == null) { throw new IOException("You must specify a vocab filename along with the prior model filename."); } hypertagger.loadPriorModel(new File(priorModelPath), new File(vocabPath)); } else if(wdictPath != null) { String posdictPath = opts.get("posdict"); if(posdictPath == null) { throw new IOException("You must specify both a word-level dict and a POS dict."); } XMLWordDictionaryReader wdr = new XMLWordDictionaryReader(new File(wdictPath)); XMLPOSDictionaryReader pdr = new XMLPOSDictionaryReader(new File(posdictPath)); STTaggerWordDictionary wdict = wdr.read(); STTaggerPOSDictionary posdict = pdr.read(); hypertagger.wdict = wdict; hypertagger.posdict = posdict; String kstring = opts.get("dictk"); if(kstring != null) hypertagger.dictK = Integer.parseInt(kstring); } String maxentModelPath = opts.get("maxentmodel"); if(maxentModelPath == null) { System.err.println("Maxent model path must be specified with key \"maxentmodel\"."); throw new IOException(); } String protoHTModelPath = opts.get("protomodel"); // process betas String betaString = opts.get("betas"); if(betaString != null) { String[] bs = betaString.split("\\s+"); double[] betaVals = new double[bs.length]; int i; for(i = 0; i < bs.length; i++) { betaVals[i] = Double.parseDouble(bs[i]); } // can't sort descending w/o extra code and vars? 
// for now, assume betas are in correct order in configfile //Arrays.sort(betaVals); hypertagger.betas = betaVals; } hypertagger.hypertagModel = new ZLMaxentModel(maxentModelPath); if(protoHTModelPath != null) { hypertagger.protoHTModel = new ZLMaxentModel(protoHTModelPath); System.err.println("Two-pass model instantiated. Initializing hyperdrive."); } // mww: add argnames String argnames = opts.get("argnames"); hypertagger.setArgNames(argnames); return hypertagger; } private String getPOS(LfGraphNode n) { if(postagger == null) { // use gold-standard POS tag String posTag = goldPredPOS.get(n.getPred().getNominal().getName()); if(posTag == null) { // XXX this is not correct posTag = "UNK"; } return posTag; } // else use POS model to get beta-best POS if(n.getPOS() == null) { FeatureList feats = postagger.getFeatures(n); //String pos = new String(postagger.getBestPOS(feats)); // POS tags for nodes are cached in the node itself // so as to avoid repeated calls to the model for the same node ArrayList> pos = postagger.getBetaBestPOS(feats); n.setPOS(pos.get(0).a); } return n.getPOS(); } private String getGoldSupertag(LfGraphNode n) { return goldPred2Tag.get(n.getPred().getNominal().getName()); } private ArrayList> getBetaBestPOS(LfGraphNode n) { // TODO (for training): add gold-standard POS tag, if it is not in the beta-best // list. Or should we use beta-best POS tags in training at all? if(n.getPOSList() == null) { FeatureList feats = postagger.getFeatures(n); ArrayList> pos = null; //String pos = new String(postagger.getBestPOS(feats)); pos = postagger.getBetaBestPOS(feats); n.setPOSList(pos); } return n.getPOSList(); } // computes all features for the graph node, returns a feature list protected FeatureList getFeatures(LfGraphNode n) { FeatureList feats; feats = n.getFeatures(); if(feats != null) { return feats; } feats = new FeatureList(); fillFeatures(n, feats); return feats; } /** Get all features for the current LF as a single multi-line string. * Used during extraction of training data. * @return All features for the current LF. */ public String getAllFeaturesAndAnswer() { StringBuilder output = new StringBuilder(); for(LfGraphNode n : nomTable.values()) { FeatureList feats = getFeatures(n); String context = feats.getAllFeatures(); //String tag = n.getPred().getOrigin().getSupertag(); String tag = goldPred2Tag.get(n.getPred().getNominal().getName()); output = output.append(tag + " " + context); output = output.append("\n"); } return output.toString(); } // for every node in the tree, compute its features, and store them inside the nodes private void assignAllFeatures() { for(LfGraphNode n : nomTable.values()) { FeatureList feats = getFeatures(n); n.setFeatures(feats); // eval the proto-model, set the node's STList to the returned outcomes/probs if(protoHTModel != null) { n.setSTList(getProtoSupertagsAndProbs(n, protoHTBeta)); } } // now traverse the graph again, calculating the additional STS features, // then unioning that feature list with the cached one if(protoHTModel != null) { for(LfGraphNode n : nomTable.values()) { FeatureList feats = getFeatures(n); // feats guaranteed cached by first pass fillTwoPassFeatures(n, feats); n.setFeatures(feats); // necessary? 
} } } public ZLMaxentModel getPosModel() { return posModel; } public File getPosModelFilename() { return posModelFilename; } public void setLF(List preds) throws FeatureExtractionException { super.setLF(preds); if(postagger != null) { postagger.setLF(preds); } assignAllFeatures(); } public String getLFNum() { return this.LFNum; } public void setLFNum(String s) { this.LFNum = s; } /** Set the POS tagging model to use. * @param posModelFilename The filename containing the LBFGS model to use for * computing simple POS tags as part of the featureset. * @throws IOException when the model cannot be loaded for some reason. */ public void setPOSModelFilename(File posModelFilename) throws IOException { this.posModelFilename = posModelFilename; // load the model this.posModel = new ZLMaxentModel(); this.posModel.load(posModelFilename); this.postagger = new ZLPOSTagger(posModel); } public File getHypertagModelFilename() { return hypertagModelFilename; } /** Set the POS tagging model to use. * @param posModelFilename The filename containing the LBFGS model to use for * computing simple POS tags as part of the featureset. * @throws IOException when the model cannot be loaded for some reason. */ public void setHypertagModelFilename(File hypertagModelFilename) throws IOException { this.hypertagModelFilename = hypertagModelFilename; // load the model this.hypertagModel = new ZLMaxentModel(); this.hypertagModel.load(hypertagModelFilename); } /* methods from Hypertagger interface */ // should probably just rename setLF to mapPreds in the first place public void mapPreds(List preds) { try { setLF(preds); } catch(FeatureExtractionException e) { throw (RuntimeException) new RuntimeException().initCause(e); } } public void setPred(int index) { currentPred = findNode(index); } public void resetBeta() { currentBeta = 0; } public void resetBetaToMax() { currentBeta = betas.length - 1; } public void nextBeta() { if(currentBeta < betas.length -1) { currentBeta++; } } public void previousBeta() { if(currentBeta > 0) { currentBeta--; } } public boolean hasMoreBetas() { if(currentBeta < betas.length - 1) { return true; } return false; } public boolean hasLessBetas() { if(currentBeta > 0 && this.hasMoreBetas()) { return true; } return false; } public Map getSupertags() { Set> tp = this.getSupertagsAndProbs(this.currentPred, this.betas[this.currentBeta]); if(tp == null) { return null; } HashMap tagMap = new HashMap(tp.size()); for(Pair p : tp) { tagMap.put(p.a, p.b); } return tagMap; } /* this method is for use with the ResultSink class, during testing. That code requires the elements of the pairs to be swapped. 
*/ private List> getSupertagsAsList() { Set> tp = this.getSupertagsAndProbs(this.currentPred, this.betas[this.currentBeta]); ArrayList> ret = new ArrayList>(); for(Pair p : tp) { ret.add(new Pair(p.b,p.a)); } return ret; } // cannot be made static due to ProbIndexPair private ArrayList getModelOutcomes(FeatureList f, ZLMaxentModel m) { ArrayList probList = new ArrayList(); double[] probs = m.eval(f.getAllFeaturesForMaxent(),true); for(int i = 0; i < probs.length; i++) { probList.add(new ProbIndexPair(probs[i], i)); } Collections.sort(probList); Collections.reverse(probList); return probList; } private HashSet> betaSearch(ArrayList probList, double beta) { double maxProb = probList.get(0).prob; HashSet> names = new HashSet>(); for(int i = 0; i < probList.size(); i++) { if(probList.get(i).prob >= beta * maxProb) { names.add(new Pair(protoHTModel.getOutcome(probList.get(i).index), probList.get(i).prob)); } else { break; } } return names; } public Set> getProtoSupertagsAndProbs(LfGraphNode n, double beta) { HashSet> names; ArrayList probList = new ArrayList(); probList = getModelOutcomes(n.getFeatures(), protoHTModel); names = betaSearch(probList, beta); return names; } /* 'MAIN' hypertagging entry point */ @SuppressWarnings("boxing") public Set> getSupertagsAndProbs(LfGraphNode n, double beta) { if(currentPred == null) { return null; // not handling EPs for rels or feats, so return null } double bestOCProb = 0.0, curOCProb = 0.0; HashSet> names = new HashSet>(); HashSet tagList = new HashSet(); ArrayList probList = new ArrayList(); probList = getModelOutcomes(n.getFeatures(), hypertagModel); /* check tagdict */ if(wdict != null) { // get pred name, get tags from dict at K, if none, try pos dict String predName = n.getPredicateName(); Collection permittedOutcomes = wdict.getEntry(predName, this.dictK); if(permittedOutcomes == null) { // back off to POS dict String pos = getPOS(n); permittedOutcomes = posdict.getEntry(pos); // sanity check if(permittedOutcomes == null) { System.err.println("!! No pos dict entries for " + pos); } } ArrayList po = new ArrayList(); for(String s : permittedOutcomes) { po.add(DefaultTokenizer.unescape(s)); } permittedOutcomes = (Collection)po; if(permittedOutcomes != null) { String oc = ""; for(ProbIndexPair p : probList) { oc = hypertagModel.getOutcome(p.index); curOCProb = p.prob; if(permittedOutcomes.contains(oc)) { if (bestOCProb == 0) { bestOCProb = curOCProb; } if (curOCProb >= (bestOCProb * beta)) { // Beta constraint. // The cut-off was met, add the outcome. names.add(new Pair(oc, p.prob)); tagList.add(oc); // update max, for first selected outcome if (curOCProb > bestOCProb) { bestOCProb = curOCProb; } } else { // Else, since our ProbIndexPair[] is sorted by probablity, there will be no more // outcomes that make the (beta) cut. break; } } else { // the outcome was ruled out by the tagdict. make a note of it. 
String nomId = currentPred.pred.getNominal().getName(); String gsTag = ""; String gsPos = ""; if(goldPred2Tag.containsKey(nomId)) { gsTag = goldPred2Tag.get(nomId); gsPos = goldPredPOS.get(nomId); } if(gsTag.equals(oc)) { try { tdErr.write(predName + "\t" + currentPred.index + "\t" + LFNum + "\t" + gsTag + "\t" + gsPos + "\n"); tdErr.flush(); } catch(Exception e) { throw new RuntimeException(e); } } } } } } else { // not using dicts double maxProb = probList.get(0).prob; for(int i = 0; i < probList.size(); i++) { if(probList.get(i).prob >= beta * maxProb) { names.add(new Pair(hypertagModel.getOutcome(probList.get(i).index), probList.get(i).prob)); tagList.add(hypertagModel.getOutcome(probList.get(i).index)); } else { break; } } } String nomId=currentPred.pred.getNominal().getName(); if(goldPred2Tag.containsKey(nomId)){ String goldStdTag=goldPred2Tag.get(nomId); if(goldStdTagInsert && !tagList.contains(goldStdTag)) names.add(new Pair(goldStdTag,1.0)); } return names; } public double getCurrentBetaValue() { if(currentBeta "); for(LfGraphNode n : nomTable.values()) { if(goldPred2Tag.get(n.getPred().getNominal().getName()) == null) { continue; // skip has-rel for now } out.append(DefaultTokenizer.escape(n.getPredicateName())); out.append(":S-"); out.append(DefaultTokenizer.escape(n.getPredicateName())); out.append(":P-"); out.append(DefaultTokenizer.escape(getPOS(n))); out.append(":T-"); out.append(DefaultTokenizer.escape(goldPred2Tag.get(n.getPred().getNominal().getName()))); out.append(" "); } out.append("\n"); return out.toString(); } public Word getPredAsWord(int idx) { LfGraphNode n = findNode(idx); Word w = Word.createWord(n.getPredicateName(), null, null, n.getPredicateName(), getPOS(n), getGoldSupertag(n), null); return w; } private Word getPredAsWord() { Word w = Word.createWord(currentPred.getPredicateName(), null, null, currentPred.getPredicateName(), getPOS(currentPred), getGoldSupertag(currentPred), null); return w; } @SuppressWarnings("boxing") public static void main(String[] args) throws IOException { String usage = "\nhypertagger (-i ) (-o [defaults to ]) (-c )\n"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } OptionParser o = new OptionParser(); o.acceptsAll(asList("help", "h"), "this message"); o.acceptsAll(asList("quiet", "q"), "print no status messages"); OptionSpec b_s = o.acceptsAll(asList("beta", "b"), "ignore betas in config file and use this value").withRequiredArg().ofType(Double.class); OptionSpec gr_s = o.acceptsAll(asList("g", "grammar")).withRequiredArg().ofType(File.class).describedAs("grammar filename"); OptionSpec corpusDir_s = o.acceptsAll(asList("d", "lf-dir")).withRequiredArg().ofType(File.class).describedAs("Directory to change to before searching for XML files"); OptionSpec configFile_s = o.acceptsAll(asList("c", "config")).withRequiredArg().ofType(File.class).describedAs("configfilename"); OptionSpec output_s = o.acceptsAll(asList("o", "output")).withRequiredArg().ofType(File.class).describedAs("output filename"); OptionSpec dump_s = o.acceptsAll(asList("dump-tags", "T")).withRequiredArg().ofType(File.class).describedAs("dump predicted tags to file"); o.acceptsAll(asList("goldstd", "G"), "include gold-standard supertags in tag dump"); OptionSet options = o.parse(args); File outputF = options.valueOf(output_s); File dumpF = options.valueOf(dump_s); File configFile = options.valueOf(configFile_s); BufferedWriter out = null; BufferedWriter dump = null; ArrayList resBetas; boolean quiet = options.has("q"); int 
lfcount = 0; try { out = (output_s.equals("stdout")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(outputF)); } catch (IOException ex) { System.err.print("Output file " + outputF + " could not be opened. Exiting..."); Logger.getLogger(STPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit(1); } ZLMaxentHypertagger ht = ZLMaxentHypertagger.ZLMaxentHypertaggerFactory(configFile.getAbsolutePath()); if(options.has("T")) { try { dump = new BufferedWriter(new FileWriter(dumpF)); } catch(IOException e) { System.err.print("Output file " + dumpF + " could not be opened. Exiting..."); System.exit(1); } } if(options.has("b")) { double beta = options.valueOf(b_s); ht.betas = new double[1]; ht.betas[0] = beta; } resBetas = new ArrayList(ht.betas.length); for(int i = 0; i < ht.betas.length; i++) { ResultSink r = new ResultSink(); resBetas.add(r); } ArrayList errFiles = new ArrayList(); for(int i = 0; i < ht.betas.length; i++) { File logdir = new File("logs"); if (!logdir.exists()) logdir.mkdirs(); BufferedWriter b = new BufferedWriter(new FileWriter(new File("logs/tagdict.err.out." + i))); b.write("### beta = " + ht.betas[i] + "\n"); errFiles.add(b); } LFLoader lfs = new LFLoader(options.valueOf(gr_s), options.valueOf(corpusDir_s), options.nonOptionArguments()); while(lfs.hasNext()) { lfcount++; LFInfo lfi = lfs.next(); try { ht.setLF(lfi.getLF()); ht.storeGoldStdPredInfo(lfi.getFullWords()); ht.setLFNum(lfi.getLFNum()); } catch (FeatureExtractionException e) { e.printStackTrace(); } List>> lfTagging = new ArrayList>>(); List gsTagging = new ArrayList(); List> tags; Word w; for(int bi = 0; bi < ht.betas.length; bi++) { gsTagging = new ArrayList(); lfTagging = new ArrayList>>(); ht.setBetaIndex(bi); ht.tdErr = errFiles.get(bi); for(int i = 0; i < ht.maxIndex(); i++) { ht.setPred(i); if(ht.currentPred == null) { //System.err.println("Skipping null pred " + i); continue; } w = ht.getPredAsWord(); tags = ht.getSupertagsAsList(); lfTagging.add(tags); gsTagging.add(w); if(dump != null) { if(options.has("G") && w.getSupertag() != null) { dump.write(w.getSupertag() + " "); } dump.write(w.getForm() + " "); for(int j = 0; j < tags.size(); j++) { dump.write(tags.get(j).a + " " + tags.get(j).b + " "); } dump.write("\n"); } } resBetas.get(bi).addSent(lfTagging, gsTagging); if(!quiet) { System.err.println("LFs processed: " + lfcount + "\r"); } ht.tdErr.flush(); } } if(dump != null) { dump.flush(); dump.close(); } for(int i = 0; i < ht.betas.length; i++) { errFiles.get(i).close(); } for(int i = 0; i < ht.betas.length; i++) { out.write("---------------\n"); out.write("BETA: " + ht.betas[i] + "\n"); out.write(resBetas.get(i).report()); } out.flush(); out.close(); } private void setBetaIndex(int bi) { this.currentBeta = bi; } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/ZLMaxentModel.java ================================================ package opennlp.ccg.realize.hypertagger; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; public class ZLMaxentModel { private MyIntegerPool intPool = new MyIntegerPool(50000); private double doubMax = 1.79769E+308; private ItemMap predMap = new ItemMap(this.intPool); private ItemMap outComeMap = new ItemMap(this.intPool); private Map>> params = new HashMap>>(); private boolean loaded = false; private double[] 
probs; private int n_outcome; public ZLMaxentModel() { } public ZLMaxentModel(String modelFilename) { this.load(new File(modelFilename)); } public void load(File modelFile) { if(!this.loaded) { this.loaded = true; BufferedReader br = null; try { br = new BufferedReader(new FileReader(modelFile)); String line = br.readLine(); if(line.contains("#")) { //DEBUG: outf.write(line+"\n"); line = br.readLine(); } // Read in contextual predicates. int numPreds = Integer.parseInt(line.trim()); for(int i=0; i>> tempParamsMap = new ArrayList>>(); int numParameters = this.predMap.size(); ArrayList> prms; int fid = 0; String ln = ""; for(int q=0; q>(); Integer oid; for(int p=1; p(oid, this.intPool.getInt(fid))); fid++; } tempParamsMap.add(prms); } // Load theta. int nTheta = Integer.parseInt(br.readLine().trim()); double[] theta = new double[nTheta]; for(int z=0; z> tmpParamsList; int index = 0; for(ArrayList> param : tempParamsMap) { tmpParamsList = new ArrayList>(); for(ZPair mapping : param) { //System.out.print(mapping.b.intValue()+" "); tmpParamsList.add(new ZPair(mapping.a, new Double(theta[mapping.b.intValue()]))); } this.params.put(this.intPool.getInt(index), tmpParamsList); index++; } this.n_outcome = this.outComeMap.size(); this.probs = new double[this.n_outcome]; // Initialise the array for computing distribution over all labels. } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { try { br.close(); } catch(IOException ioe) { ioe.printStackTrace(); } } } } public String getBestOutcome(double[] probs) { double maxprob = 0.0; int maxidx = -1; for(int i = 0; i < probs.length; i++) { if(probs[i] > maxprob) { maxidx = i; maxprob = probs[i]; } } return getOutcome(maxidx); } @SuppressWarnings("boxing") public double[] eval(String[] context, boolean realValued) { // Zero out prob distribution over labels. for(int i=0; i> pred_value = new ArrayList>(context.length); Double one = new Double(1.0); // Split up the strings into (pred,double) pairs. 
int splitPoint; for(int m=0; m(context[m].substring(0, splitPoint), Double.parseDouble(context[m].substring(splitPoint+1, context[m].length()))) ); } else { pred_value.add( new ZPair(context[m], one) ); } } ArrayList> featureWeights; //for(int j=0; j pv : pred_value) { Integer predID = this.predMap.id(pv.a); if(predID!=null) { featureWeights = this.params.get(predID); for(ZPair fw : featureWeights) { this.probs[fw.a.intValue()] += (fw.b.doubleValue() * pv.b.doubleValue()); } } } double sum = 0.0; for(int p=0; p dict = new HashMap(); private Map reverseDict = new HashMap(); private MyIntegerPool intPool; public ItemMap(MyIntegerPool intPool) { this.intPool = intPool; this.index = intPool.getInt(0); } @SuppressWarnings("boxing") public int add(String item) { if(this.dict.containsKey(item)) { return this.dict.get(item); } else { this.dict.put(item, index); this.reverseDict.put(index, item); this.index = this.intPool.getInt(index.intValue() + 1); return index.intValue()-1; } } public Integer id(String item) { if(this.dict.containsKey(item)) { return this.dict.get(item); } else { return null; } } public int size() { return this.dict.size(); } public String getItem(int i) { return this.reverseDict.get(this.intPool.getInt(i)); } } class ZPair { public A a; public B b; public ZPair(A a, B b) { this.a = a; this.b = b; } } class MyIntegerPool { private Integer[] _table; public MyIntegerPool(int size) { this._table = new Integer[size]; for(int i=0; i= 0) { return this._table[i]; } else { return new Integer(i); } } } ================================================ FILE: src/opennlp/ccg/realize/hypertagger/ZLPOSTagger.java ================================================ package opennlp.ccg.realize.hypertagger; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.lexicon.DefaultTokenizer; import opennlp.ccg.parse.postagger.ml.POSPriorModel; import opennlp.ccg.parse.tagger.util.ConfigFileProcessor; import opennlp.ccg.parse.tagger.util.ResultSink; import opennlp.ccg.util.Pair; public class ZLPOSTagger extends TagExtractor { ZLMaxentModel model; POSPriorModel priorModel; int prefixLength = 4; int suffixLength = 4; double beta = 0.35; private HashMap goldPred2Tag=new HashMap(); public class ProbIndexPair implements Comparable { public double prob; public int index; public ProbIndexPair(double prob, int index) { this.prob = prob; this.index = index; } public int compareTo(ProbIndexPair o) { if(prob < o.prob) { return -1; } else if(prob == o.prob) { return 0; } else { return 1; } } } public ZLPOSTagger() { super(); } public ZLPOSTagger(ZLMaxentModel model) { this(); this.model = model; this.prefixLength = 4; this.suffixLength = 4; this.beta = 0.35; // 0.4 delivers 1.08 POStags/pred } public ZLPOSTagger(ZLMaxentModel model, POSPriorModel priorModel) { this(); this.model = model; this.prefixLength = 4; this.suffixLength = 4; this.beta = 0.35; this.priorModel = priorModel; } public POSPriorModel getPriorModel() { return priorModel; } public void setPriorModel(POSPriorModel priorModel) { this.priorModel = priorModel; } /** 
get the features for one node * * FO -- fan-out, i.e. number of children * PN -- predicate name * RN -- parent name * RT -- parent relation * CT -- type of child * CN -- name of child * NA -- number of Argument children * A0N, A1N, ... -- names of argument children (by default) * PX -- prefix (N characters) * SX -- suffix (M characters) * HD -- has a digit * UH -- has an uppercase character or a hyphen * * @param n The graph node to extract features from * @return An array of strings representing the features */ // mww: switched to configurable arg names @SuppressWarnings("boxing") protected FeatureList getFeatures(LfGraphNode n) { FeatureList feats = new FeatureList(); int argchildren = 0; feats.addFeature("PN", n.getPredicateName()); feats.addFeature("FO", Integer.toString(n.getNumChildren())); // add name of parent, if any parent, and parent relation if(n.getMultiParents().size() > 0) { for(LfGraphNode parent : n.getMultiParents() ) { feats.addFeature("RN", parent.getPredicateName()); // add class name, if available String cls = parent.getPred().getNominal().toString(); // string is in X:Y:Z format. Remove 'X:' leaving 'Y:Z'. if(cls != null && cls.indexOf(':') > 0) { String cfeat = cls.substring(cls.indexOf(':') + 1); feats.addFeature("XP", cfeat); } } } else { feats.addFeature("RN", "0"); } // add types of children, count up argument children for(LfGraphLink lnk : n.getChildren()) { feats.addFeature("CT", lnk.getLabel()); if(lnk.getTarget() != null) { feats.addFeature("CN", lnk.getTarget().getPredicateName()); // mww: use short arg name String shortArgName = argNameMap.get(lnk.getLabel()); if (shortArgName != null) { // increment argchild count argchildren++; feats.addFeature(shortArgName + "N", lnk.getTarget().getPredicateName()); // add class info for arg child, if applicable String cls = lnk.getTarget().getPred().getNominal().toString(); // string is in X:Y:Z format. Remove 'X:' leaving 'Y:Z'. if(cls.indexOf(':') > 0) { String cfeat = cls.substring(cls.indexOf(':') + 1); // mww: for backwards compatibility String argNumOrName = (shortArgName.startsWith("A")) ? shortArgName.substring(1) : shortArgName; feats.addFeature("X" + argNumOrName + "D", cfeat); } } } } feats.addFeature("NA", Integer.toString(argchildren)); // class name feature, if available Nominal idx = n.getPred().getNominal(); String cls = idx.toString(); // string is in X:Y:Z format. Remove 'X:' leaving 'Y:Z'. 
if(cls.indexOf(':') > 0) { String cfeat = cls.substring(cls.indexOf(':') + 1); feats.addFeature("XC", cfeat); } // compute prefix and suffix String predname = n.getPredicateName(); // chop .04 part, if any Pattern pat = Pattern.compile("(\\w+)\\.\\d+$"); String basePredName = new String(predname); Matcher mat = pat.matcher(basePredName); if(mat.matches()) { basePredName = mat.group(1); } if(basePredName.length() > prefixLength + 1) { String prefix = predname.substring(0, prefixLength); feats.addFeature("PX", prefix); } if(basePredName.length() > suffixLength + 1) { String suffix = basePredName.substring(basePredName.length() - suffixLength, basePredName.length()); feats.addFeature("SX", suffix); } // check for digit and/or (upcase char and hyphen) if(basePredName.matches("\\d+")) { feats.addFeature("HD", "1"); } else { feats.addFeature("HD", "0"); } if(predname.matches("[A-Z]+") || predname.matches("-+")) { // XXX hack, because I don't think these PASS nodes should have a positive val // for this feature if(predname != "PASS") { feats.addFeature("UH", "1"); } else { feats.addFeature("UH", "0"); } } else { feats.addFeature("UH", "0"); } // features from prior model, PPOS, if applicable if(priorModel != null) { List> priors = priorModel.getPriors(predname); double beta = 0.1; double best = priors.get(0).a; for(Pair prior : priors) { if(prior.a > (beta * best)) { // add the feature PPOS=: feats.addFeatureWithProb("PPOS", prior.b, prior.a); } else { break; } } } return feats; } /** Get the features for the index'th node * * @param index The index into the LF * @return An array of strings representing the features */ public FeatureList getFeatures(int index) { for(LfGraphNode n : nomTable.values()) { if(n.getIndex() == index) { return getFeatures(n); } } return null; // bad index } public int getPrefixLength() { return prefixLength; } public void setPrefixLength(int prefixLength) { this.prefixLength = prefixLength; } public int getSuffixLength() { return suffixLength; } public void setSuffixLength(int suffixLength) { this.suffixLength = suffixLength; } /** Get all features for the current LF as a single multi-line string. * Used during extraction of training data. * @return All features for the current LF. 
*/ // formerly known as getAllFeaturesAndPOS() public String getAllFeaturesAndAnswer() { StringBuilder output = new StringBuilder(); for(LfGraphNode n : nomTable.values()) { String feats = getFeatures(n).getAllFeatures(); String postag = goldPred2Tag.get(n.getPred().getNominal().getName()); output = output.append(postag + " "); output = output.append(feats); output = output.append("\n"); } return output.toString(); } public String getAllFeaturesAndPOSWithID() { String output = ""; for(LfGraphNode n : nomTable.values()) { String feats = getFeatures(n).getAllFeatures(); String postag = n.getPred().getOrigin().getPOS(); output = output.concat("<" + Integer.toString(LFID) + "> " + postag + " " + feats); output = output.concat(postag + " "); output = output.concat(feats); output = output.concat("\n"); } return output; } /* Returns an arraylist of tuples (POS, probability) */ @SuppressWarnings({ "boxing" }) public ArrayList> getBetaBestPOS(FeatureList feats) { ArrayList> poss = new ArrayList>(); ArrayList probList = new ArrayList(); double[] probs = model.eval(feats.getAllFeaturesForMaxent(),true); for(int i = 0; i < probs.length; i++) { probList.add(new ProbIndexPair(probs[i], i)); } Collections.sort(probList); Collections.reverse(probList); double maxProb = probList.get(0).prob; for(int i = 0; i < probList.size(); i++) { if(probList.get(i).prob >= this.beta * maxProb) { poss.add(new Pair(model.getOutcome(probList.get(i).index), probList.get(i).prob)); } } return poss; } @SuppressWarnings("boxing") public ArrayList> getBetaBestPOS(String[] feats) { ArrayList> poss = new ArrayList>(); ArrayList probList = new ArrayList(); double[] probs = model.eval(feats,true); for(int i = 0; i < probs.length; i++) { probList.add(new ProbIndexPair(probs[i], i)); } Collections.sort(probList); Collections.reverse(probList); double maxProb = probList.get(0).prob; for(int i = 0; i < probList.size(); i++) { if(probList.get(i).prob >= this.beta * maxProb) { poss.add(new Pair(model.getOutcome(probList.get(i).index), probList.get(i).prob)); } } return poss; } // for tagging (i.e. evaluating feature list against model) public String getBestPOS(FeatureList feats) { // feed the features to the model, and get the best guess // at the POS tag given those features. 
double[] probs = model.eval(feats.getAllFeaturesForMaxent(),false); return new String(model.getBestOutcome(probs)); } public double getBeta() { return beta; } public void setBeta(double beta) { this.beta = beta; } /* (non-Javadoc) * @see opennlp.ccg.realize.hypertagger.TagExtractor#storeGoldStdPredInfo(java.lang.String) */ public void storeGoldStdPredInfo(String predInfo) { String[] preds = predInfo.split("\\s+"); if(preds != null) { for(int i = 1; i < preds.length; i++) { String[] info = preds[i].split(":"); if(info.length != 4) { System.err.println("Malformed pred-info field, skipping (value was \"" + preds[i] + "\")"); continue; } goldPred2Tag.put(info[0], DefaultTokenizer.unescape(info[2])); } } } public void loadPriorModel(File priorModelFile, File vocabFile) { try { priorModel = new POSPriorModel(priorModelFile.getAbsolutePath(), vocabFile.getAbsolutePath()); } catch (IOException e) { System.err.println("Unable to load prior model or vocab file"); e.printStackTrace(); } } @SuppressWarnings({"unused" }) public static void main(String[] args) throws IOException { String usage = "\nBasicPOSTagger -c (-i ) (-o [defaults to ])\n"+ " (-b beta value) (-m model file)\n"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } //SRILMFactoredBundleCorpusIterator inp = null; // input is just a file full of maxent events // i.e., we are not really tagging an lf, just testing a model by feeding it pre-extracted // events and comparing with the GS tags BufferedReader inp = null; BufferedWriter out = null; try { String inputCorp = null; String configFile = null; String output = null; String modelFile = null; double beta = 0; boolean test = false; for (int i = 0; i < args.length; i++) { if (args[i].equals("-i")) { inputCorp = args[++i]; continue; } if (args[i].equals("-o")) { output = args[++i]; continue; } if (args[i].equals("-c")) { configFile = args[++i]; continue; } if (args[i].equals("-m")) { modelFile = args[++i]; continue; } if (args[i].equals("-b")) { beta = Double.parseDouble(args[++i]); continue; } System.out.println("Unrecognized option: " + args[i]); } ResultSink rs = new ResultSink(ResultSink.ResultSinkType.POSTAG); try { inp = new BufferedReader(new FileReader(new File(inputCorp))); } catch (FileNotFoundException ex) { System.err.print("Input corpus " + inputCorp + " not found. Exiting..."); Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit(-1); } try { out = (output.equals("")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output))); } catch (IOException ex) { System.err.print("Output file " + output + " not found. Exiting..."); Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); System.exit(-1); } //String[] pathKeys = { "maxentmodel", "priormodel", "priormodelvocab", "sequencemodel" }; //Map opts = ConfigFileProcessor.readInConfig(configFile, pathKeys); //POSPriorModel posPrior = new POSPriorModel(opts.get("priormodel"), opts.get("priormodelvocab")); ZLPOSTagger post = new ZLPOSTagger(new ZLMaxentModel(modelFile)); if(beta > 0) { post.setBeta(beta); } //post.setPriorModel(posPrior); // unneeded? 
String line; int count = 0; int wins = 0; int tagsPerPred = 0; while(true) { line = inp.readLine(); if(line == null) { break; } count++; int pos = line.indexOf(' '); String gs = line.substring(0,pos); //System.err.println("GS: " + gs); //System.err.println("Featline: " + line.substring(pos + 1)); String[] feats = line.substring(pos+1).split("\\s+"); ArrayList> ptags = post.getBetaBestPOS(feats); tagsPerPred += ptags.size(); // now check for the win... for(Pair p : ptags) { if(p.a.equals(gs)) { wins++; break; } } } // for now, just print overall accuracy out.write("Beta: " + post.getBeta() + "\n"); out.write("Acc: " + (double)wins / (double)count * 100.0 + "\n"); out.write("Tags/Pred: " + (double)tagsPerPred / (double)count + "\n"); out.flush(); } catch(Throwable t) { t.printStackTrace(); } finally { try { inp.close(); out.close(); } catch (IOException ex) { Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex); } } } public static ZLPOSTagger ZLPOSTaggerFactory(String configFile) throws IOException { ZLPOSTagger postagger = new ZLPOSTagger(); String[] pathKeys = { "priormodel", "priormodelvocab", "maxentmodel"}; Map opts = ConfigFileProcessor.readInConfig(configFile, pathKeys); String priorModelFile = opts.get("priormodel"); if(priorModelFile != null) { String vocabFile = opts.get("priormodelvocab"); if(vocabFile == null) { throw new IOException("A vocab file must be specified."); } postagger.loadPriorModel(new File(priorModelFile), new File(vocabFile)); } String modelFile = opts.get("maxentmodel"); if(modelFile == null) { throw new IOException("You must specify the maxent model to use."); } postagger.model = new ZLMaxentModel(modelFile); String betaString = opts.get("beta"); if(betaString != null) { double beta = Double.parseDouble(betaString); postagger.beta = beta; } return postagger; } } ================================================ FILE: src/opennlp/ccg/synsem/AbstractCat.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.io.IOException; import java.io.Serializable; import gnu.trove.*; import org.jdom.*; import opennlp.ccg.hylo.*; import opennlp.ccg.unify.*; /** * Abstract category adapter for CCG categories. * * @author Gann Bierner * @author Jason Baldridge * @author Michael White * @version $Revision: 1.22 $, $Date: 2011/05/22 03:40:55 $ */ public abstract class AbstractCat implements Category, Serializable { private static final long serialVersionUID = 1L; /** The feature structure, which should only be used with atomic categories. 
*/ protected FeatureStructure _featStruc; /** The logical form, which should be used only with the outermost category. */ protected LF _lf; /** The hash code, if already computed. */ private transient int _hashCode = -1; /** The hash code for the category without its LF, if already computed. */ private transient int _hashCodeNoLF = -1; /** The mapping from vars to ints, if already computed. */ private transient TObjectIntHashMap _varMap = null; /** The supertag, if already computed. */ protected String _supertag = null; /** Default constructor. */ public AbstractCat() {} /** Constructor which sets the LF. */ public AbstractCat(LF lf) { _lf = lf; } /** * Constructor which retrieves the LF from the XML element * and flattens it to a conjunction of elementary predications * (or a single one). */ public AbstractCat(Element elt) { Element lfElt = elt.getChild("lf"); if (lfElt != null) { _lf = HyloHelper.flattenLF(HyloHelper.getLF(lfElt)); } } /** * Adds an XML element for the LF, if any, to the given catElt. * Uses {@link HyloHelper#toXml(LF)}. */ public void toXml(Element catElt) { if (_lf != null) catElt.addContent(HyloHelper.toXml(_lf)); } // during deserialization, intern computed supertag, and ensure varmap recomputed private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); if (_supertag != null) _supertag = _supertag.intern(); _varMap = null; } /** Gets the feature structure. */ public FeatureStructure getFeatureStructure() { return _featStruc; } /** Sets the feature structure. */ public void setFeatureStructure(FeatureStructure fs) { _featStruc = fs; } /** Gets the LF. */ public LF getLF() { return _lf; } /** Sets the LF. */ public void setLF(LF lf) { _lf = lf; } //----------------------------------------------------------------- // methods from Category which should be implemented by subclasses of // AbstractCat public abstract String toString(); /** * Returns the supertag for the category. */ public abstract String getSupertag(); /** * This will return a TeX formatted representation for a category. * If toTeX() is not implemented for this category, the normal * toString() method is invoked instead. */ public String toTeX() { return toString(); } public abstract Category copy(); public abstract Category shallowCopy(); public abstract Object fill (Substitution s) throws UnifyFailure; public abstract void unifyCheck (Object u) throws UnifyFailure; /** NB: The LF does not participate in unification. */ public abstract Object unify (Object u, Substitution sub) throws UnifyFailure; /** * Returns the target category of this category. 
*/ public abstract TargetCat getTarget(); //----------------------------------------------------------------- // implemented methods from Category public boolean shallowEquals(Object o) { return equals(o); } public void deepMap(ModFcn mf) { if (_lf != null) _lf.deepMap(mf); mf.modify(this); } public void forall(CategoryFcn f) { f.forall(this); } public boolean occurs(Variable v) { if (_lf == null) return false; return _lf.occurs(v); } // methods to support printing of Categories public String prettyPrint() { return prettyPrint(""); } protected String prettyPrint(String pad) { return pad+toString(); } protected int prettyLength(String s) { int max=0, cur=0; for(int i=0; i info) { List args = new ArrayList(); for (Iterator infoIt = info.iterator(); infoIt.hasNext();) { Element el = infoIt.next(); String elName = el.getName(); if (elName.equals("setarg")) { args.add(new SetArg(el)); _hasSet = true; } else if (elName.equals("dollar")) { String name = el.getAttributeValue("name"); if (name == null) name = el.getAttributeValue("n"); args.add(new Dollar(name)); _hasDollar = true; } else if (elName.equals("slash") || elName.equals("sl")) { Slash s = new Slash(el); Element argEl = infoIt.next(); if (argEl.getName().equals("dollar")) { String name = argEl.getAttributeValue("name"); if (name == null) name = argEl.getAttributeValue("n"); args.add(new Dollar(s, name)); _hasDollar = true; } else { args.add(new BasicArg(s, CatReader.getCat(argEl))); } } else { System.out.println("Invalid element for creating ArgStack: " + elName); } } _list = new Arg[args.size()]; args.toArray(_list); } public void toXml(Element catElt) { for (Arg arg: _list) { if (arg instanceof SetArg) catElt.addContent(((SetArg) arg).toXml()); else if (arg instanceof Dollar) { Dollar dollar = (Dollar) arg; if (!dollar.getSlash().toString().equals("|.")) catElt.addContent(dollar.getSlash().toXml()); Element dollarElt = new Element("dollar"); dollarElt.setAttribute("name", dollar.name()); catElt.addContent(dollarElt); } else if (arg instanceof BasicArg) { BasicArg barg = (BasicArg) arg; catElt.addContent(barg.getSlash().toXml()); catElt.addContent(barg.getCat().toXml()); } } } public void addAt(Arg c, int index) { Arg[] $list = new Arg[_list.length + 1]; insert(subList(0, index)._list, $list, 0); $list[index] = c; insert(subList(index)._list, $list, index + 1); _list = $list; if (c instanceof Dollar) { _hasDollar = true; } else if (c instanceof SetArg) { _hasSet = true; } } public void add(Arg c) { Arg[] $list = new Arg[_list.length + 1]; int last = insert(_list, $list, 0); $list[last] = c; _list = $list; if (c instanceof Dollar) { _hasDollar = true; } else if (c instanceof SetArg) { _hasSet = true; } } public void addAt(ArgStack cl, int index) { Arg[] $list = new Arg[_list.length + cl._list.length]; int last = insert(subList(0, index)._list, $list, 0); last = insert(cl._list, $list, last); insert(subList(index)._list, $list, last); _list = $list; if (cl.containsDollarArg()) { _hasDollar = true; } else if (cl.containsSetArg()) { _hasSet = true; } } public void add(ArgStack cl) { Arg[] $list = new Arg[_list.length + cl._list.length]; int last = insert(_list, $list, 0); insert(cl._list, $list, last); _list = $list; if (cl.containsDollarArg()) { _hasDollar = true; } else if (cl.containsSetArg()) { _hasSet = true; } } public void addFront(Arg c) { Arg[] $list = new Arg[_list.length + 1]; $list[0] = c; insert(_list, $list, 1); _list = $list; if (c instanceof Dollar) { _hasDollar = true; } else if (c instanceof SetArg) { _hasSet = true; } } 
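// Note: an ArgStack is the flattened argument sequence of a complex category under the
// non-recursive representation used by ComplexCat; e.g. for a category like s\np/np the
// target is s and the stack holds the two BasicArgs \np and /np, with the outermost
// argument (/np here) stored last (cf. getLast/setLast below).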
public void addFront(ArgStack cl) { Arg[] $list = new Arg[_list.length + cl._list.length]; int last = insert(cl._list, $list, 0); insert(_list, $list, last); _list = $list; if (cl.containsDollarArg()) { _hasDollar = true; } else if (cl.containsSetArg()) { _hasSet = true; } } public void insertFront(ArgStack cl) { insertAt(cl, 0); } public void insertEnd(ArgStack cl) { insertAt(cl, _list.length - 1); } public void insertAt(ArgStack cl, int index) { Arg insertInto = _list[index]; if (insertInto instanceof BasicArg) { cl.add(insertInto); _list[index] = new SetArg(cl); _hasSet = true; } else if (insertInto instanceof SetArg) { ((SetArg) insertInto).add(cl); } else { System.out.println("Problem inserting arg stack: " + cl); } } public int size() { return _list.length; } public boolean containsDollarArg() { return _hasDollar; } public boolean containsSetArg() { return _hasSet; } public Arg get(int i) { return _list[i]; } public void set(int i, Arg c) { _list[i] = c; if (c instanceof Dollar) { _hasDollar = true; } else if (c instanceof SetArg) { _hasSet = true; } } public Arg getLast() { return _list[_list.length - 1]; } public void setLast(Arg c) { set(_list.length - 1, c); } /** Sets the harmonic composition result of each arg's slash. */ public void setSlashHarmonicCompositionResult(boolean harmonicResult) { for (int i=0; i < _list.length; i++) { _list[i].setSlashHarmonicCompositionResult(harmonicResult); } } public ArgStack copy() { Arg[] $list = new Arg[_list.length]; for (int i = 0; i < $list.length; i++) { $list[i] = _list[i].copy(); } return new ArgStack($list); } public ArgStack copyWithout(int indexToRemove) { Arg[] $list = new Arg[_list.length - 1]; if ($list.length < 1) { System.out.println("Removing last item from an argument stack!"); } int index = 0; for (int i = 0; i < _list.length; i++) { if (i != indexToRemove) { $list[index++] = _list[i].copy(); } } return new ArgStack($list); } public ArgStack subList(int from) { return subList(from, _list.length); } public ArgStack subList(int from, int upto) { Arg[] $list; if (upto > from) { $list = new Arg[upto - from]; int index = 0; for (int i = from; i < upto; i++) { $list[index++] = _list[i]; } } else { $list = new Arg[0]; } return new ArgStack($list); } public ArgStack shallowCopy() { return new ArgStack(_list); } public boolean occurs(Variable v) { for (int i = 0; i < _list.length; i++) { if (_list[i].occurs(v)) { return true; } } return false; } public ArgStack fill(Substitution s) throws UnifyFailure { ArgStack args = new ArgStack(); for (int i = 0; i < _list.length; i++) { Object value = _list[i].fill(s); if (value instanceof ArgStack) { args.add((ArgStack) value); } else { args.add((Arg) value); } } return args; } public void deepMap(ModFcn mf) { for (int i = 0; i < _list.length; i++) { _list[i].deepMap(mf); } } public boolean containsContrarySlash() { for (int i = 0; i < _list.length; i++) { if (_list[i] instanceof BasicArg && !((BasicArg) _list[i]).getSlash().sameDirAsModality()) { return true; } else if (_list[i] instanceof SetArg && ((SetArg) _list[i]).containsContrarySlash()) { return true; } } return false; } public void slashesUnify(Slash s) throws UnifyFailure { for (int i = 0; i < _list.length; i++) { _list[i].unifySlash(s); } } public int unifySuffix(ArgStack as, Substitution sub) throws UnifyFailure { int asIndex = as.size(); for (int i = _list.length - 1; i >= 0; i--) { asIndex--; get(i).unify(as.get(asIndex), sub); } return asIndex; } public ArgStack unify(ArgStack as, Substitution sub) throws UnifyFailure { return 
unifyPrefix(as, as.size(), sub); } public ArgStack unifyPrefix(ArgStack as, int upto, Substitution sub) throws UnifyFailure { ArgStack $args; if (containsDollarArg()) { if (as.containsDollarArg()) { $args = unifyDollarWithDollar(as, upto, sub); } else { $args = unifyDollarWithNoDollar(size(), as, upto, sub); } } else if (as.containsDollarArg()) { $args = as.unifyDollarWithNoDollar(upto, this, size(), sub); } else if (size() == upto) { $args = unifySimple(as, upto, sub); } else { $args = unifyComplex(as, upto, sub); // throw new UnifyFailure(); } return $args; } private ArgStack unifySimple(ArgStack as, int upto, Substitution sub) throws UnifyFailure { ArgStack $args = new ArgStack(); for (int i = upto - 1; i >= 0; i--) { $args.addFront((Arg) _list[i].unify(as.get(i), sub)); } return $args; } private ArgStack unifyComplex(ArgStack as, int upto, Substitution sub) throws UnifyFailure { ArgStack $args = new ArgStack(); int aIndex = size() - 1; int bIndex = upto - 1; while (aIndex >= 0 && bIndex >= 0) { // while (null != aArg && null != bArg) { Arg aArg = get(aIndex); Arg bArg = as.get(bIndex); if ((aArg instanceof BasicArg && bArg instanceof BasicArg) || (aArg instanceof SetArg && bArg instanceof SetArg)) { $args.addFront((Arg) aArg.unify(bArg, sub)); aIndex--; bIndex--; } else if (aArg instanceof BasicArg && bArg instanceof SetArg) { int setsize = ((SetArg) bArg).size(); if (setsize <= aIndex + 1) { int stop = aIndex - setsize; for (; aIndex > stop;) { aIndex--; if (bArg instanceof BasicArg) { $args.addFront((Arg) aArg.unify(bArg, sub)); } else { int idInSet = ((SetArg) bArg) .indexOf((BasicArg) aArg); if (idInSet == -1) throw new UnifyFailure(); $args.addFront((Arg) aArg.unify(((SetArg) bArg) .get(idInSet), sub)); aArg = get(aIndex); bArg = ((SetArg) bArg).copyWithout(idInSet); } } bIndex--; } else { throw new UnifyFailure(); } } else if (aArg instanceof SetArg && bArg instanceof BasicArg) { throw new UnifyFailure(); } else { throw new UnifyFailure(); } } if (aIndex > -1 || bIndex > -1) { throw new UnifyFailure(); } return $args; } private ArgStack unifyDollarWithNoDollar(int uptoThis, ArgStack otherStack, int uptoOther, Substitution sub) throws UnifyFailure { if ((!(_hasSet || otherStack._hasSet) && uptoThis > uptoOther + 1) || (uptoThis > 1 && uptoOther < 1)) { throw new UnifyFailure(); } ArgStack $args = new ArgStack(); otherStack = otherStack.subList(0, uptoOther); int otherIndex = uptoOther - 1; for (int i = uptoThis - 1; i >= 0; i--) { Arg argi = get(i); if (argi instanceof Dollar) { if (i > 0) { throw new UnifyFailure(); } else { ArgStack $subArgs = otherStack.subList(0, otherIndex + 1); // Slash dsl = ((Dollar) argi).getSlash(); ((Dollar) argi).unify($subArgs.copy(), sub); otherIndex = 0; $args.addFront($subArgs); } } else if (argi instanceof BasicArg) { if (otherIndex < 0) { throw new UnifyFailure(); } Arg otherArg = otherStack.get(otherIndex); if (otherArg instanceof BasicArg) { $args.addFront((Arg) argi.unify(otherArg, sub)); otherIndex--; } else if (otherArg instanceof SetArg) { SetArg sa = (SetArg) otherArg; int id = sa.indexOf((BasicArg) argi); if (id == -1) throw new UnifyFailure(); $args.addFront((Arg) argi.unify(sa.get(id), sub)); otherStack.set(otherIndex, sa.copyWithout(id)); } } else { throw new UnifyFailure(); } } if (otherIndex > 0) { throw new UnifyFailure(); } return $args; } private ArgStack unifyDollarWithDollar(ArgStack as, int upto, Substitution sub) throws UnifyFailure { ArgStack $args; if (size() == 1) { $args = as.subList(0, upto); ((Dollar) 
get(0)).unify($args.copy(), sub); } else if (upto == 1) { $args = subList(0, size()); ((Dollar) as.get(0)).unify($args.copy(), sub); } else if (upto == size()) { $args = unifySimple(as, upto, sub); } else { throw new UnifyFailure(); } return $args; } public void forall(CategoryFcn fcn) { for (int i = 0; i < _list.length; i++) { _list[i].forall(fcn); } } private void checkForDollar() { for (int i = 0; i < _list.length; i++) { if (_list[i] instanceof Dollar) { _hasDollar = true; return; } } } private void checkForSet() { for (int i = 0; i < _list.length; i++) { if (_list[i] instanceof SetArg) { _hasSet = true; return; } } } public String toString() { StringBuffer sb = new StringBuffer(); for (int i = 0; i < _list.length; i++) { sb.append(_list[i].toString()); } return sb.toString(); } // private boolean methodExists(Object o, String methodName) { // java.lang.reflect.Method[] m = o.getClass().getMethods(); // for (int i = 0; i < m.length; i++) // if (m[i].getName() == methodName) { // if (m[i].getDeclaringClass().toString().startsWith("class")) // return true; // else // return false; // } // return false; // } /** * Returns the supertag for this arg stack. */ public String getSupertag() { StringBuffer sb = new StringBuffer(); for (int i = 0; i < _list.length; i++) { Arg arg = (Arg) _list[i]; sb.append(arg.getSupertag()); } return sb.toString(); } /** * Returns a TeX-formatted string representation for this arg stack. */ public String toTeX() { StringBuffer sb = new StringBuffer(); for (int i = 0; i < _list.length; i++) { sb.append(_list[i].toTeX()); } return sb.toString(); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { int retval = 0; for (int i = 0; i < _list.length; i++) { retval += _list[i].hashCode(varMap); } return retval; } /** * Returns whether this arg stack equals the given object up to variable * names, using the given maps from vars to ints. */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } ArgStack as = (ArgStack) obj; if (_list.length != as._list.length) { return false; } for (int i = 0; i < _list.length; i++) { if (!_list[i].equals(as._list[i], varMap, varMap2)) { return false; } } return true; } protected static int insert(Arg[] a, Arg[] b, int pos) { for (int i = 0; i < a.length; i++) { b[pos++] = a[i]; } return pos; } } ================================================ FILE: src/opennlp/ccg/synsem/AtomCat.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-11 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.unify.*; import opennlp.ccg.util.DisplayPrefs; import opennlp.ccg.grammar.Grammar; import org.jdom.*; import gnu.trove.*; /** * The most basic CG category. This corresponds to a category like 'np[acc]', * i.e. a category name and associated features. * * @author Gann Bierner * @author Jason Baldridge * @author Michael White * @version $Revision: 1.18 $, $Date: 2011/07/15 03:02:53 $ */ public final class AtomCat extends AbstractCat implements TargetCat { private static final long serialVersionUID = 1L; private String type; /** Flag indicating whether this cat is a fragment with completion status true (defaults to false). */ public boolean fragCompletion = false; /** Constructor which creates an atomic category with the given type. */ public AtomCat(String t) { this(t, new GFeatStruc()); } /** Constructor which creates an atomic category with the given type and feature structure. */ public AtomCat(String t, FeatureStructure fs) { this(t, fs, null); } /** Constructor which creates an atomic category with the given type, feature structure and LF. */ public AtomCat(String t, FeatureStructure fs, LF lf) { super(lf); type = t; _featStruc = fs; } /** Constructor which retrieves the atomic category from the XML element. */ public AtomCat(Element acel) { // call super to get LF if present super(acel); // get type type = acel.getAttributeValue("type"); if (type == null) type = acel.getAttributeValue("t"); // get feature structure Element fsEl = acel.getChild("fs"); if (fsEl != null) { _featStruc = new GFeatStruc(fsEl); } // or create empty one else { _featStruc = new GFeatStruc(); } } /**Returns an XML element representing the category. */ public Element toXml() { Element retval = new Element("atomcat"); retval.setAttribute("type", type); if (!_featStruc.isEmpty() || _featStruc.getIndex() > 0) { if (_featStruc instanceof GFeatStruc) { // only supporting GFeatStruc per xml construction retval.addContent(((GFeatStruc) _featStruc).toXml()); } } // call super to add LF if present super.toXml(retval); return retval; } /** * Returns this category as the target category. */ public TargetCat getTarget() { return this; } public String getType() { return type; } public Category copy() { AtomCat retval = new AtomCat(type, _featStruc.copy(), (_lf == null) ? null : (LF) _lf.copy()); retval.fragCompletion = fragCompletion; return retval; } public Category shallowCopy() { AtomCat retval = new AtomCat(type, _featStruc, _lf); retval.fragCompletion = fragCompletion; return retval; } public void deepMap(ModFcn mf) { super.deepMap(mf); _featStruc.deepMap(mf); } public void unifyCheck (Object u) throws UnifyFailure { if (u instanceof AtomCat) { AtomCat u_ac = (AtomCat)u; if (!(type.equals(u_ac.type))) { throw new UnifyFailure(); } if (_featStruc != null && u_ac._featStruc != null) { _featStruc.unifyCheck(u_ac._featStruc); } } else if (!(u instanceof Variable)) { throw new UnifyFailure(); } } /** NB: The LF does not participate in unification. 
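 * (Illustrative example: np unifies with np[acc], since the type names match and the
 * feature structures unify, but np can never unify with s.)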
*/ public Object unify (Object u, Substitution sub) throws UnifyFailure { if (u instanceof AtomCat && type.equals(((AtomCat)u).type)) { AtomCat u_ac = (AtomCat)u; FeatureStructure $fs; if (_featStruc == null) { $fs = u_ac._featStruc; } else if (u_ac._featStruc == null) { $fs = _featStruc; } else { $fs = (FeatureStructure)_featStruc.unify(u_ac._featStruc, sub); } return new AtomCat(type, $fs); } else { throw new UnifyFailure(); } } public Object fill (Substitution s) throws UnifyFailure { AtomCat $ac = new AtomCat(type, (FeatureStructure)_featStruc.fill(s), (_lf == null) ? null : (LF) _lf.fill(s)); return $ac; } public boolean shallowEquals(Object c) { if (c instanceof AtomCat) { AtomCat ac = (AtomCat)c; return type.equals(ac.type); } return false; } public String toString() { DisplayPrefs prefs = Grammar.theGrammar.prefs; StringBuffer sb = new StringBuffer(); sb.append(type); if (fragCompletion) sb.append("_c"); if(_featStruc != null && prefs.showFeats) sb.append(_featStruc.toString()); if (_lf != null && prefs.showSem) { sb.append(" : ").append(_lf.toString()); } if (sb.length() == 0) return "UnknownCat"; return sb.toString(); } /** * Returns the interned supertag for the category. */ public String getSupertag() { if (_supertag != null) return _supertag; StringBuffer sb = new StringBuffer(); sb.append(type); if(_featStruc != null) sb.append(_featStruc.getSupertagInfo()); if (sb.length() == 0) _supertag = "UnknownCat"; else _supertag = sb.toString().intern(); return _supertag; } public String toTeX() { DisplayPrefs prefs = Grammar.theGrammar.prefs; StringBuffer sb = new StringBuffer(); sb.append(type); if(_featStruc != null && prefs.showFeats) sb.append(_featStruc.toTeX()); if (sb.length() == 0) return "UnknownCat"; return sb.toString(); } /** * Returns a hash code for this category ignoring the LF, * using the given map from vars to ints. */ public int hashCodeNoLF(TObjectIntHashMap varMap) { int retval = type.hashCode(); if (_featStruc != null) { if (_featStruc instanceof GFeatStruc) { retval += ((GFeatStruc)_featStruc).hashCode(varMap); } else { // nb: would be nice to get rid of this case retval += _featStruc.hashCode(); } } return retval; } /** * Returns whether this category equals the given object * up to variable names, using the given maps from vars to ints, * ignoring the LFs (if any). */ public boolean equalsNoLF(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } AtomCat ac = (AtomCat) obj; if (_featStruc != null && ac._featStruc == null) { return false; } if (_featStruc == null && ac._featStruc != null) { return false; } if (!type.equals(ac.type)) { return false; } if (_featStruc != null) { if (_featStruc instanceof GFeatStruc) { if (!((GFeatStruc)_featStruc).equals(ac._featStruc, varMap, varMap2)) { return false; } } else { // nb: would be nice to get rid of this case if (!_featStruc.equals(ac._featStruc)) { return false; } } } return true; } /** * Returns whether this category is a fragment category; * returns true iff the type of this cat is "frag". 
*/ public boolean isFragment() { return type.equals("frag"); } } ================================================ FILE: src/opennlp/ccg/synsem/BasicArg.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.io.Serializable; import gnu.trove.*; import opennlp.ccg.unify.*; /** * A basic argument that contains a slash and a category. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.9 $, $Date: 2009/11/28 03:39:27 $ */ public final class BasicArg implements Arg, Serializable { private static final long serialVersionUID = -4244825501682166456L; private final Slash _slash; private final Category _cat; public BasicArg(Slash s, Category c) { _slash = s; _cat = c; } public Arg copy() { return new BasicArg(_slash.copy(), _cat.copy()); } public Slash getSlash() { return _slash; } public void setSlashModifier(boolean modifier) { _slash.setModifier(modifier); } public void setSlashHarmonicCompositionResult(boolean harmonicResult) { _slash.setHarmonicCompositionResult(harmonicResult); } public Category getCat() { return _cat; } public boolean occurs(Variable v) { return _cat.occurs(v); } public Object fill(Substitution sub) throws UnifyFailure { return new BasicArg((Slash) _slash.fill(sub), (Category) _cat.fill(sub)); } public void forall(CategoryFcn fcn) { _cat.forall(fcn); } public void unifySlash(Slash s) throws UnifyFailure { _slash.unifyCheck(s); } public void unifyCheck(Object u) throws UnifyFailure { } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof BasicArg) { return new BasicArg((Slash) _slash .unify(((BasicArg) u)._slash, sub), (Category) _cat.unify( ((BasicArg) u)._cat, sub)); } else { throw new UnifyFailure(); } } public void deepMap(ModFcn mf) { _slash.deepMap(mf); _cat.deepMap(mf); } public String toString() { StringBuffer sb = new StringBuffer(); sb.append(_slash.toString()); if (_cat instanceof ComplexCat) { sb.append('(').append(_cat).append(')'); } else { sb.append(_cat); } return sb.toString(); } /** * Returns the supertag for this arg. */ public String getSupertag() { StringBuffer sb = new StringBuffer(); sb.append(_slash.getSupertag()); if (_cat instanceof ComplexCat) { sb.append('(').append(_cat.getSupertag()).append(')'); } else { sb.append(_cat.getSupertag()); } return sb.toString(); } /** * Returns a TeX-formatted string representation for this arg. 
*/ public String toTeX() { StringBuffer sb = new StringBuffer(); sb.append(_slash.toTeX()); if (_cat instanceof ComplexCat) { sb.append('(').append(_cat.toTeX()).append(')'); } else { sb.append(_cat.toTeX()); } return sb.toString(); } /** * Returns a hash code for this, using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { return _slash.hashCode(varMap) + _cat.hashCodeNoLF(varMap); } /** * Returns whether this arg equals the given object up to variable names, * using the given maps from vars to ints. */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } BasicArg ba = (BasicArg) obj; return _slash.equals(ba._slash, varMap, varMap2) && _cat.equalsNoLF(ba._cat, varMap, varMap2); } } ================================================ FILE: src/opennlp/ccg/synsem/CatReader.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-4 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import org.jdom.*; /** * Utility class to build categories. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.4 $, $Date: 2005/10/18 22:20:15 $ */ public class CatReader { public static Category getCat(Element catel) { Category cat = null; String catType = catel.getName(); if (catType.equals("atomcat") || catType.equals("ac")) { cat = new AtomCat(catel); } else if (catType.equals("complexcat") || catType.equals("cc")) { cat = new ComplexCat(catel); } return cat; } } ================================================ FILE: src/opennlp/ccg/synsem/Category.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-5 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.unify.*; import opennlp.ccg.hylo.*; import gnu.trove.*; import org.jdom.Element; /** * A CCG category. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.11 $, $Date: 2011/05/22 03:40:55 $ */ public interface Category extends Unifiable, Mutable, java.io.Serializable { /** * Accessor function for the feature structure associated with this category. * * @return the feature structure for this category */ public FeatureStructure getFeatureStructure(); /** * Gives this category a new feature structure. * * @param fs the new feature structure */ public void setFeatureStructure(FeatureStructure fs); /** Gets the LF. */ public LF getLF(); /** Sets the LF. */ public void setLF(LF lf); /** * Determines if this category is equal to another on the top level. * It does not check sub categories. * * @param o object to check for equality * @return whether or not this is shallowly equal to object */ public boolean shallowEquals(Object o); /** * Deep copies this category. * * @return a deep copy of this category */ public Category copy(); /** Shallow copies this category. */ public Category shallowCopy(); /** * Iterates through this Category applying a function to this category * and every subcategory. * * @param f a function to be applied */ public void forall(CategoryFcn f); //to ls /** * Returns a hash code for this category. * The hash code handles equivalence up to variable names * as long as features and predicates are in the same order. */ public int hashCode(); /** * Returns a hash code for this category ignoring the LF. */ public int hashCodeNoLF(); /** * Returns a hash code for this category ignoring the LF, * using the given map from vars to ints, * to allow for equivalence up to variable names. */ public int hashCodeNoLF(TObjectIntHashMap varMap); /** * Returns whether this category equals the given object. * Equivalence up to variable names is handled * as long as features and predicates are in the same order. */ public boolean equals(Object obj); /** * Returns whether this category equals the given object, * ignoring the LFs (if any). */ public boolean equalsNoLF(Object obj); /** * Returns whether this category equals the given object * up to variable names, using the given maps from vars to ints, * ignoring the LFs (if any). */ public boolean equalsNoLF(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2); /** * Returns the target category of this category. */ public TargetCat getTarget(); /** * Returns the nominal which is the value of the index feature on the * target cat, or null if none. */ public Nominal getIndexNominal(); /** * Returns the interned supertag for the category. */ public String getSupertag(); /** * Returns whether this category is a fragment category. */ public boolean isFragment(); /** * Returns a TeX-formatted string representation for the category. */ public String toTeX(); /** * Returns an XML element representing the category.
*/ public Element toXml(); } ================================================ FILE: src/opennlp/ccg/synsem/CategoryFcn.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge and Gann Bierner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; /** * A set of functions that can be applied to Categories. This is a way of * getting around the fact that Java doesn't have 1st class functions. * Thus, if a method is needed that takes a function that works on Categories, * pass one of these instead. * * @author Gann Bierner * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:12 $ */ public interface CategoryFcn { /** * Converts a category to a different category * * @param c the category to change * @return the transformed category */ public Category fcn(Category c); /** * Converts a category to a different category with some additional * information about its context. * * @param a The logical form in which the category appears * @param c The category to convert * @param i The position of the category in the logical form * @return the transformed category */ public void fcn(Category a, Category c, int i); /** * Performs some destructive operation given a category * * @param c The category used for whatever purpose */ public void forall(Category c); } ================================================ FILE: src/opennlp/ccg/synsem/CategoryFcnAdapter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge and Gann Bierner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; /** * An adapter for Category Functions so that you only have to implement the * methods you want. 
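 * For example (an illustrative sketch only; <code>cat</code> and <code>atoms</code> are
 * made-up names), all atomic categories in a category can be collected with an anonymous
 * subclass that overrides just <code>forall</code>:
 * <pre>
 *   final java.util.List atoms = new java.util.ArrayList();
 *   cat.forall(new CategoryFcnAdapter() {
 *       public void forall(Category c) {
 *           if (c instanceof AtomCat) atoms.add(c);
 *       }
 *   });
 * </pre>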
* * @author Gann Bierner * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:12 $ */ public class CategoryFcnAdapter implements CategoryFcn{ public Category fcn(Category c) {return c;} public void fcn(Category a, Category c, int i) {} public void forall(Category c) {}; } ================================================ FILE: src/opennlp/ccg/synsem/ComplexCat.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.unify.*; import opennlp.ccg.util.DisplayPrefs; import opennlp.ccg.grammar.Grammar; import gnu.trove.*; import org.jdom.*; import java.util.*; /** * A non-recursive representation of complex categories. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.15 $, $Date: 2009/06/18 23:38:57 $ */ public final class ComplexCat extends AbstractCat { private static final long serialVersionUID = 1L; private TargetCat _target; private ArgStack _args; /** Constructor with target and single arg. */ public ComplexCat(TargetCat target, Arg arg) { this(target, new ArgStack(arg)); } /** Constructor with target and arg stack. */ public ComplexCat(TargetCat target, ArgStack args) { this(target, args, null); } /** Constructor with target, arg stack and LF. */ public ComplexCat(TargetCat target, ArgStack args, LF lf) { super(lf); _target = target; if (args.size() < 1) { System.out.println("WARNING!!! Creating a ComplexCat with" + " empty argument stack!"); } _args = args; } /** Constructor which retrieves the complex category from the XML element. */ // also determines modifier slashes @SuppressWarnings("unchecked") public ComplexCat(Element el) { // call super to get LF if present super(el); // get children minus LF elt List info = el.getChildren(); Element lfElt = el.getChild("lf"); if (lfElt != null) { info.remove(lfElt); } // get target and args from first and rest of remaining children _target = (TargetCat) CatReader.getCat(info.get(0)); _args = new ArgStack(info.subList(1, info.size())); // set modifier slashes setModifierSlashes(); } /**Returns an XML element representing the category. 
*/ public Element toXml() { Element retval = new Element("complexcat"); retval.addContent(_target.toXml()); _args.toXml(retval); // call super to add LF if present super.toXml(retval); return retval; } // sets modifier slashes based on feat struc ids private void setModifierSlashes() { FeatureStructure targetFS = _target.getFeatureStructure(); int targetIndex = targetFS.getIndex(); if (targetIndex == 0) targetIndex = targetFS.getInheritsFrom(); if (targetIndex == 0) return; for (int i=0; i < _args.size(); i++) { Arg arg = _args.get(i); if (arg instanceof BasicArg) { setModifierSlash(targetIndex, (BasicArg)arg); } else if (arg instanceof SetArg) { SetArg sArg = (SetArg) arg; for (int j=0; j < sArg.size(); j++) { setModifierSlash(targetIndex, sArg.get(j)); } } } } // sets modifier slash based on the target index private void setModifierSlash(int targetIndex, BasicArg arg) { FeatureStructure argFS = arg.getCat().getTarget().getFeatureStructure(); // check for matching ids if (targetIndex == argFS.getIndex() || targetIndex == argFS.getInheritsFrom()) arg.getSlash().setModifier(true); } /** * Returns the target category of this category. */ public TargetCat getTarget() { return _target; } public Arg getArg(int pos) { return _args.get(pos); } public Arg getOuterArg() { return _args.getLast(); } public Category getResult() { return getSubResult(arity() - 1); } public Category getSubResult(int upto) { if (upto == 0) { return _target; } else { return new ComplexCat(_target, _args.subList(0, upto)); } } public ArgStack getArgStack() { return _args; } public ArgStack getArgStack(int from) { return _args.subList(from); } public boolean containsDollarArg() { return _args.containsDollarArg(); } public boolean containsSetArg() { return _args.containsSetArg(); } public void add(Arg a) { _args.add(a); } public void add(ArgStack as) { _args.add(as); } public void addBeforeEnd(ArgStack as) { int size = _args.size(); if (size < 1) { add(as); } else { _args.addAt(as, size - 1); } } public void addFront(ArgStack as) { _args.addFront(as); } public void insertFront(ArgStack as) { _args.insertFront(as); } public void insertEnd(ArgStack as) { _args.insertEnd(as); } public void set(int index, Arg c) { _args.set(index, c); } public void setOuterArgument(Arg c) { _args.setLast(c); } public int arity() { return _args.size(); } public Category copy() { return new ComplexCat((TargetCat) _target.copy(), _args.copy(), (_lf == null) ? null : (LF) _lf.copy()); } public Category shallowCopy() { return new ComplexCat(_target, _args, _lf); } public void deepMap(ModFcn mf) { super.deepMap(mf); _target.deepMap(mf); _args.deepMap(mf); } public void forall(CategoryFcn f) { f.forall(this); _target.forall(f); _args.forall(f); } public void unifyCheck(Object u) throws UnifyFailure { if (u instanceof ComplexCat) { ComplexCat cc = (ComplexCat) u; _target.unifyCheck(cc._target); } } /** NB: The LF does not participate in unification. 
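 * (Illustrative example: a complex category whose only argument is a dollar variable,
 * such as s$, unifies with the bare atomic category s by binding the dollar to an empty
 * argument stack; see the AtomCat branch below.)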
*/ public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof AtomCat && arity() == 1 & containsDollarArg()) { sub.makeSubstitution((Dollar) _args.get(0), new ArgStack()); return GUnifier.unify(_target, (AtomCat) u, sub); } else if (u instanceof ComplexCat) { ComplexCat cc = (ComplexCat) u; ArgStack $args = _args.unify(cc._args, sub); Category $target = GUnifier.unify(_target, cc._target, sub); if ($args.size() == 0) { return $target; } else { return new ComplexCat((TargetCat) $target, $args); } } else { throw new UnifyFailure(); } } public boolean occurs(Variable v) { return super.occurs(v) || _target.occurs(v) || _args.occurs(v); } // nb: not yet sure about calling setLF methods public Object fill(Substitution s) throws UnifyFailure { Category $target = (Category) _target.fill(s); ArgStack $args = _args.fill(s); LF $lf = (_lf == null) ? null : (LF) _lf.fill(s); if ($args.size() == 0) { $target.setLF($lf); return $target; } if ($target instanceof TargetCat) { return new ComplexCat((TargetCat) $target, $args, $lf); } else if ($target instanceof ComplexCat) { ((ComplexCat) $target).add($args); $target.setLF($lf); return $target; } else { throw new UnifyFailure(); } } public String toString() { DisplayPrefs prefs = Grammar.theGrammar.prefs; StringBuffer sb = new StringBuffer(); sb.append(_target.toString()).append(_args.toString()); if (_lf != null && prefs.showSem) { sb.append(" : ").append(_lf.toString()); } return sb.toString(); } /** * Returns the interned supertag for the category. */ public String getSupertag() { if (_supertag != null) return _supertag; StringBuffer sb = new StringBuffer(); sb.append(_target.getSupertag()).append(_args.getSupertag()); _supertag = sb.toString().intern(); return _supertag; } public String toTeX() { // Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); // boolean showSem = prefs.getBoolean(SHOW_SEMANTICS, false); StringBuffer sb = new StringBuffer(); sb.append(_target.toTeX()).append(_args.toTeX()); return sb.toString(); } /** * Returns a hash code for this category ignoring the LF, using the given * map from vars to ints. */ public int hashCodeNoLF(TObjectIntHashMap varMap) { int retval = _target.hashCodeNoLF(varMap); retval += _args.hashCode(varMap); return retval; } /** * Returns whether this category equals the given object up to variable * names, using the given maps from vars to ints, ignoring the LFs (if any). */ public boolean equalsNoLF(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } ComplexCat cc = (ComplexCat) obj; if (!_target.equalsNoLF(cc._target, varMap, varMap2)) { return false; } if (!_args.equals(cc._args, varMap, varMap2)) { return false; } return true; } } ================================================ FILE: src/opennlp/ccg/synsem/DerivationHandler.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.parse.DerivationHistory; /** * A class to simplify implementing a recursive procedure on a derivation. * Recursion is handled within the implementation of the abstract methods * by calls to handleDerivation, so that order of traversal * can be flexibly specified. Results may be optionally cached. * Note that the top step is only used with complete derivations. * * @author Michael White * @version $Revision: 1.1 $, $Date: 2010/02/21 16:44:59 $ */ abstract public class DerivationHandler<T> { /** Top step. */ abstract public T topStep(Sign sign); /** Lexical step. */ abstract public T lexStep(Sign sign); /** Unary step. */ abstract public T unaryStep(Sign sign, Sign headChild); /** Binary step. */ abstract public T binaryStep(Sign sign, boolean left, Sign headChild, Sign siblingChild); /** Checks for cached value, returning null if none. Defaults to null. */ public T checkCache(Sign sign) { return null; } /** Caches the result. Default no-op. */ public void cache(Sign sign, T result) {} /** Handles a complete derivation, invoking the top step. */ public T handleCompleteDerivation(Sign sign) { return topStep(sign); } /** Handles a sub-derivation, checking and updating cache. */ public T handleDerivation(Sign sign) { // check cache T retval = checkCache(sign); if (retval != null) return retval; // lexical case if (sign.isLexical()) { retval = lexStep(sign); cache(sign, retval); return retval; } // recursive case DerivationHistory dh = sign.getDerivationHistory(); Sign[] inputs = dh.getInputs(); // unary case if (inputs.length == 1) { Sign headChild = inputs[0]; retval = unaryStep(sign, headChild); cache(sign, retval); return retval; } // binary case else { boolean left; Sign headChild, siblingChild; if (sign.getLexHead() == inputs[0].getLexHead()) { left = true; headChild = inputs[0]; siblingChild = inputs[1]; } else { left = false; headChild = inputs[1]; siblingChild = inputs[0]; } retval = binaryStep(sign, left, headChild, siblingChild); cache(sign, retval); return retval; } } } ================================================ FILE: src/opennlp/ccg/synsem/Dollar.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-7 Jason Baldridge and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.io.Serializable; import gnu.trove.*; import opennlp.ccg.unify.*; /** * A variable representing a stack of arguments * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.10 $, $Date: 2009/11/28 03:39:27 $ */ public final class Dollar implements Arg, Variable, Mutable, Indexed, Serializable { private static final long serialVersionUID = -5936227018184772678L; private final Slash _slash; private final String _name; private int _index = 0; private boolean _hasMostGeneralSlash = false; public Dollar(String name) { this(new Slash(), name); } public Dollar(Slash s, String name) { this(s, name, 0); } public Dollar(Slash s, String name, int id) { _slash = s; _name = name; _index = id; if (s.toString().equals("|.")) { _hasMostGeneralSlash = true; } } public String name() { return _name; } public int getIndex() { return _index; } public void setIndex(int uniqueIndex) { _index = uniqueIndex; } public Arg copy() { return new Dollar(_slash.copy(), _name, _index); } public void forall(CategoryFcn fcn) { } public Slash getSlash() { return _slash; } public void setSlashModifier(boolean modifier) { _slash.setModifier(modifier); } public void setSlashHarmonicCompositionResult(boolean harmonicResult) { _slash.setHarmonicCompositionResult(harmonicResult); } public boolean equals(Object o) { return (o instanceof Dollar && _index == ((Dollar) o).getIndex() && _slash .equals(((Dollar) o).getSlash())); } public int hashCode() { return 31 * _index + _slash.hashCode() ; } public boolean occurs(Variable v) { return (v instanceof Dollar && equals(v)); } public Object fill(Substitution sub) throws UnifyFailure { Object value = sub.getValue(this); if (value == null) { return this; } if (value instanceof Dollar) { return value; } // nb: must do occurs check here, at least in part b/c ArgStack doesn't // quite implement Unifiable if (value instanceof Arg && !((Arg) value).occurs(this)) { return ((Arg) value).fill(sub); } else if (value instanceof ArgStack && !((ArgStack) value).occurs(this)) { return ((ArgStack) value).fill(sub); } else { // System.out.println("Error in value for dollar: " + this +" = " + // value); throw new UnifyFailure(); } } public void unifySlash(Slash s) throws UnifyFailure { _slash.unifyCheck(s); } public void unifyCheck(Object u) throws UnifyFailure { } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof ArgStack && !((ArgStack) u).occurs(this)) { ((ArgStack) u).slashesUnify(_slash); } else if (u instanceof Arg && !((Arg) u).occurs(this)) { ((Arg) u).unifySlash(_slash); } else { throw new UnifyFailure(); } sub.makeSubstitution(this, u); return u; } public void deepMap(ModFcn mf) { mf.modify(this); } public String toString() { StringBuffer sb = new StringBuffer(); if (!_hasMostGeneralSlash) sb.append(_slash.toString()); sb.append('$').append(_name);// .append(_index); return sb.toString(); } /** * Returns the supertag for this dollar arg. */ public String getSupertag() { StringBuffer sb = new StringBuffer(); if (!_hasMostGeneralSlash) sb.append(_slash.getSupertag()); sb.append('$'); return sb.toString(); } /** * Returns a TeX-formatted string representation for this dollar arg. 
*/ public String toTeX() { StringBuffer sb = new StringBuffer(); if (!_hasMostGeneralSlash) sb.append(_slash.toTeX()); sb.append("\\$ \\subs{").append(_name).append("}");// .append(_index); return sb.toString(); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { int retval = _slash.hashCode(varMap); // see if this already in map if (varMap.containsKey(this)) { retval += varMap.get(this); } // otherwise add it else { int next = varMap.size() + 1; varMap.put(this, next); retval += next; } return retval; } /** * Returns whether this dollar equals the given object up to variable names, * using the given maps from vars to ints. */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (this == obj) return true; if (obj.getClass() != this.getClass()) return false; Dollar d = (Dollar) obj; if (varMap.get(this) != varMap2.get(d)) return false; if (!_slash.equals(d._slash, varMap, varMap2)) return false; return true; } } ================================================ FILE: src/opennlp/ccg/synsem/GenerativeSyntacticModel.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2010 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.io.*; import java.net.URL; import java.util.*; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.lexicon.*; import opennlp.ccg.ngrams.ConditionalProbabilityTable; import opennlp.ccg.ngrams.NgramScorer; import opennlp.ccg.perceptron.*; import opennlp.ccg.test.Regression; import opennlp.ccg.test.RegressionInfo; import opennlp.ccg.test.RegressionInfo.TestItem; import opennlp.ccg.util.Pair; /** * A class implementing a variant of Hockenmaier's HWDep generative syntactic model, * with additional postag variables. * Note that the top step is only used with complete derivations. * Also note that for simplicity there is no special treatment of rare words, and thus * a single unknown word is used in the model, rather than one for each POS tag. * * @author Michael White * @version $Revision: 1.12 $, $Date: 2010/03/07 03:23:01 $ */ public class GenerativeSyntacticModel implements FeatureExtractor, SignScorer { /** Feature key. */ public static String genlogprobkey = "genlogprob"; /** Expansion string constant. */ public static final String EXPANSION = "E"; /** Left expansion string constant. */ public static final String LEFT = "left"; /** Right expansion string constant. */ public static final String RIGHT = "right"; /** Unary expansion string constant. */ public static final String UNARY = "unary"; /** Leaf expansion string constant. 
*/ public static final String LEAF = "leaf"; /** Category of parent string constant. */ public static final String PARENT = "P"; /** Category of head string constant. */ public static final String HEAD = "H"; /** Category of sibling string constant. */ public static final String SIBLING = "S"; /** Lexical head category of parent string constant. */ public static final String LEXCAT_PARENT = "CP"; /** Head postag of parent string constant. */ public static final String POS_PARENT = "T"; /** Head word of parent string constant. */ public static final String WORD_PARENT = "W"; /** Lexical head category of sibling string constant. */ public static final String LEXCAT_SIBLING = "CS"; /** Head postag of sibling string constant. */ public static final String POS_SIBLING = "TS"; /** Head word of sibling string constant. */ public static final String WORD_SIBLING = "WS"; /** Lexical head category of top string constant. */ public static final String LEXCAT_TOP = "CT"; /** Head postag top string constant. */ public static final String POS_TOP = "TT"; /** Head word of top string constant. */ public static final String WORD_TOP = "WT"; /** Derivation top string constant. */ public static final String TOP = ""; /** * Class for caching the model's log prob in a sign. */ public static class GenLogProb { /** The log prob. */ public final double logprob; /** Constructor. */ public GenLogProb(double logprob) { this.logprob = logprob; } } /** The top step model. */ protected ConditionalProbabilityTable topModel; /** The lexical step model. */ protected ConditionalProbabilityTable leafModel; /** The unary step model. */ protected ConditionalProbabilityTable unaryModel; /** The binary step model. */ protected ConditionalProbabilityTable binaryModel; /** Constructor with file names. */ public GenerativeSyntacticModel(String topModelFN, String leafModelFN, String unaryModelFN, String binaryModelFN) throws IOException { topModel = new ConditionalProbabilityTable(topModelFN); leafModel = new ConditionalProbabilityTable(leafModelFN); unaryModel = new ConditionalProbabilityTable(unaryModelFN); binaryModel = new ConditionalProbabilityTable(binaryModelFN); } /** Flag for whether to show scoring breakdown. */ protected boolean debugScore = false; /** Sets the debug score flag, and propagates to component models. */ public void setDebug(boolean debugScore) { this.debugScore = debugScore; topModel.setDebug(debugScore); leafModel.setDebug(debugScore); unaryModel.setDebug(debugScore); binaryModel.setDebug(debugScore); } /** The alphabet. */ protected Alphabet alphabet = null; /** Generative logprob feature. */ protected Alphabet.Feature genlogprobFeature = null; /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet) { this.alphabet = alphabet; List keys = new ArrayList(1); keys.add(genlogprobkey); genlogprobFeature = alphabet.closed() ? alphabet.index(keys) : alphabet.add(keys); } /** Returns the features for the given sign and completeness flag. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { return genLogProbVector((float) logprob(sign, complete)); } /** Returns a feature vector with the given gen log prob. */ protected FeatureVector genLogProbVector(float logprob) { FeatureList retval = new FeatureList(1); if (genlogprobFeature != null) retval.add(genlogprobFeature, logprob); return retval; } /** Derivation handler for getting the log prob for each derivation step as a sum. 
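 * (Per-sign totals are cached via GenLogProb through checkCache/cache, so a sub-derivation
 * that is reached more than once is scored only once.)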
*/ public class LogProbGetter extends DerivationHandler { // reusable list of pairs private List> pairs = new ArrayList>(); private String listPairs() { StringBuffer sb = new StringBuffer(); for (Pair pair : pairs) sb.append(pair.a).append('-').append(pair.b).append(' '); return sb.toString(); } /** Checks for cached value. */ public Double checkCache(Sign sign) { GenLogProb glp = (GenLogProb) sign.getData(GenLogProb.class); return (glp == null) ? null : glp.logprob; } /** Caches the total. */ public void cache(Sign sign, Double total) { sign.addData(new GenLogProb(total)); } /** Top step. */ public Double topStep(Sign sign) { pairs.clear(); addTopFactors(sign, pairs); if (debugScore) System.out.println("[topStep] " + listPairs()); return topModel.logprob(pairs) + handleDerivation(sign); } /** Lexical step. */ public Double lexStep(Sign sign) { pairs.clear(); addLexFactors(sign, pairs); if (debugScore) System.out.println("[lexStep] " + listPairs()); return leafModel.logprob(pairs); } /** Unary step. */ public Double unaryStep(Sign sign, Sign headChild) { pairs.clear(); addUnaryFactors(sign, pairs, headChild); if (debugScore) System.out.println("[unaryStep] " + listPairs()); return unaryModel.logprob(pairs) + handleDerivation(headChild); } /** Binary step. */ public Double binaryStep(Sign sign, boolean left, Sign headChild, Sign siblingChild) { pairs.clear(); addBinaryFactors(sign, pairs, left, headChild, siblingChild); if (debugScore) System.out.println("[binaryStep] " + listPairs()); return binaryModel.logprob(pairs) + handleDerivation(headChild) + handleDerivation(siblingChild); } } /** Derivation handler for getting the factors for each derivation step as a list of words. */ public static class FactorsGetter extends DerivationHandler { /** The factors. */ public List factors = new ArrayList(); // reusable list of pairs private List> pairs = null; // new pairs private void newPairs() { pairs = new ArrayList>(); } // adds new word for pairs to result private void addPairs() { factors.add(new ListPairWord(pairs)); } /** Top step. */ public Void topStep(Sign sign) { newPairs(); addTopFactors(sign, pairs); addPairs(); handleDerivation(sign); return null; } /** Lexical step. */ public Void lexStep(Sign sign) { newPairs(); addLexFactors(sign, pairs); addPairs(); return null; } /** Unary step. */ public Void unaryStep(Sign sign, Sign headChild) { newPairs(); addUnaryFactors(sign, pairs, headChild); addPairs(); handleDerivation(headChild); return null; } /** Binary step. */ public Void binaryStep(Sign sign, boolean left, Sign headChild, Sign siblingChild) { newPairs(); addBinaryFactors(sign, pairs, left, headChild, siblingChild); addPairs(); handleDerivation(headChild); handleDerivation(siblingChild); return null; } } /** Returns the probability of the derivation according to the models. */ public double score(Sign sign, boolean complete) { return NgramScorer.convertToProb(logprob(sign, complete)); } /** Returns the log probability of the derivation according to the models. */ public double logprob(Sign sign, boolean complete) { LogProbGetter lpgetter = new LogProbGetter(); if (complete) return lpgetter.handleCompleteDerivation(sign); else return lpgetter.handleDerivation(sign); } /** Returns the factors from the derivation of the given sign (assumed to be complete). */ public static List getFactors(Sign sign) { FactorsGetter fgetter = new FactorsGetter(); fgetter.handleCompleteDerivation(sign); return fgetter.factors; } /** Adds the factors for the top step in the derivation of the given sign. 
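// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source: dumping the factor
// bundles produced by FactorsGetter for a gold derivation, one ListPairWord
// per derivation step. The element type of the returned list is assumed to be
// Word, matching the "list of words" description above.
// ---------------------------------------------------------------------------
import java.util.List;
import opennlp.ccg.lexicon.Word;
import opennlp.ccg.synsem.GenerativeSyntacticModel;
import opennlp.ccg.synsem.Sign;

class FactorDumpSketch {
    static void dumpFactors(Sign goldDerivation) {
        List<Word> factors = GenerativeSyntacticModel.getFactors(goldDerivation);
        for (Word stepFactors : factors)
            System.out.println(stepFactors); // one attribute-value bundle per step
    }
}
// ---------------------------------------------------------------------------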
*/ public static void addTopFactors(Sign sign, List> pairs) { pairs.add(new Pair(EXPANSION, TOP)); pairs.add(new Pair(PARENT, TOP)); pairs.add(new Pair(LEXCAT_PARENT, TOP)); pairs.add(new Pair(WORD_PARENT, TOP)); pairs.add(new Pair(HEAD, sign.getSupertag())); Sign lexHead = sign.getLexHead(); pairs.add(new Pair(LEXCAT_TOP, lexHead.getSupertag())); pairs.add(new Pair(POS_TOP, lexHead.getPOS())); pairs.add(new Pair(WORD_TOP, lexHead.getWordForm())); } /** Adds the factors for a lexical step in the derivation of the given sign. */ public static void addLexFactors(Sign sign, List> pairs) { pairs.add(new Pair(EXPANSION, LEAF)); addParentFactors(sign, pairs); } /** Adds the parent factors for a step in the derivation of the given sign. */ public static void addParentFactors(Sign sign, List> pairs) { pairs.add(new Pair(PARENT, sign.getSupertag())); Sign lexHead = sign.getLexHead(); pairs.add(new Pair(LEXCAT_PARENT, lexHead.getSupertag())); pairs.add(new Pair(POS_PARENT, lexHead.getPOS())); pairs.add(new Pair(WORD_PARENT, lexHead.getWordForm())); } /** Returns the factors for a unary step in the derivation of the given sign. */ public static void addUnaryFactors(Sign sign, List> pairs, Sign headChild) { pairs.add(new Pair(EXPANSION, UNARY)); addParentFactors(sign, pairs); pairs.add(new Pair(HEAD, headChild.getSupertag())); } /** Returns the factors for a binary step in the derivation of the given sign. */ public static void addBinaryFactors(Sign sign, List> pairs, boolean left, Sign headChild, Sign siblingChild) { pairs.add(new Pair(EXPANSION, (left) ? LEFT : RIGHT)); addParentFactors(sign, pairs); pairs.add(new Pair(HEAD, headChild.getSupertag())); pairs.add(new Pair(SIBLING, siblingChild.getSupertag())); Sign siblingLexHead = siblingChild.getLexHead(); pairs.add(new Pair(LEXCAT_SIBLING, siblingLexHead.getSupertag())); pairs.add(new Pair(POS_SIBLING, siblingLexHead.getPOS())); pairs.add(new Pair(WORD_SIBLING, siblingLexHead.getWordForm())); } /** Tests loading and scoring. 
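// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source: the attribute-value
// pairs that addBinaryFactors assembles for a right expansion. The category,
// POS, and word values below are invented for illustration, and Pair's type
// parameters are assumed to be <String,String>; a real call would instead use
// addBinaryFactors(sign, pairs, left, headChild, siblingChild).
// ---------------------------------------------------------------------------
import java.util.ArrayList;
import java.util.List;
import opennlp.ccg.util.Pair;
import static opennlp.ccg.synsem.GenerativeSyntacticModel.*;

class BinaryFactorsSketch {
    static List<Pair<String,String>> exampleRightExpansion() {
        List<Pair<String,String>> pairs = new ArrayList<Pair<String,String>>();
        pairs.add(new Pair<String,String>(EXPANSION, RIGHT));          // E: expansion type
        pairs.add(new Pair<String,String>(PARENT, "s\\np"));           // P: parent supertag
        pairs.add(new Pair<String,String>(LEXCAT_PARENT, "s\\np/np")); // CP: parent lex head cat
        pairs.add(new Pair<String,String>(POS_PARENT, "VBD"));         // T: parent head POS
        pairs.add(new Pair<String,String>(WORD_PARENT, "bought"));     // W: parent head word
        pairs.add(new Pair<String,String>(HEAD, "s\\np/np"));          // H: head child supertag
        pairs.add(new Pair<String,String>(SIBLING, "np"));             // S: sibling supertag
        pairs.add(new Pair<String,String>(LEXCAT_SIBLING, "np"));      // CS: sibling lex head cat
        pairs.add(new Pair<String,String>(POS_SIBLING, "NNS"));        // TS: sibling head POS
        pairs.add(new Pair<String,String>(WORD_SIBLING, "shares"));    // WS: sibling head word
        return pairs;
    }
}
// ---------------------------------------------------------------------------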
*/ public static void main(String[] args) throws IOException { String argstr = "(-dir ) (-g ) (-t ) (-verbose)"; String usage = "Usage: java opennlp.ccg.synsem.GenerativeSyntacticModel " + argstr; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } String dir = ".", topfn = "top.flm", leaffn = "leaf.flm", unaryfn = "unary.flm", binaryfn = "binary.flm"; String grammarfn = "grammar.xml", tbfn = "testbed.xml"; boolean verbose = false; for (int i=0; i < args.length; i++) { if (args[i].equals("-dir")) { dir = args[++i]; continue; } if (args[i].equals("-g")) { grammarfn = args[++i]; continue; } if (args[i].equals("-t")) { tbfn = args[++i]; continue; } if (args[i].equals("-v") || args[i].equals("-verbose")) { verbose = true; continue; } System.out.println("Unrecognized option: " + args[i]); } // load grammar URL grammarURL = new File(grammarfn).toURI().toURL(); System.out.println("Loading grammar from URL: " + grammarURL); Grammar grammar = new Grammar(grammarURL); // load model System.out.println("Loading syntactic model from: " + dir); topfn = dir + "/" + topfn; leaffn = dir + "/" + leaffn; unaryfn = dir + "/" + unaryfn; binaryfn = dir + "/" + binaryfn; GenerativeSyntacticModel model = new GenerativeSyntacticModel(topfn, leaffn, unaryfn, binaryfn); if (verbose) model.setDebug(true); // score saved signs double logprobttotal = 0.0; int numsents = 0; for (File f : Regression.getXMLFiles(new File(tbfn))) { // load testfile System.out.println("Loading: " + f.getName()); RegressionInfo rinfo = new RegressionInfo(grammar, f); // do each item for (int i=0; i < rinfo.numberOfItems(); i++) { TestItem item = rinfo.getItem(i); if (item.numOfParses == 0) continue; numsents++; if (verbose) System.out.println("scoring: " + item.sentence); else System.out.print("."); Sign sign = item.sign; double logprob = model.logprob(sign, true); logprobttotal += logprob; if (verbose) { System.out.println(sign.getDerivationHistory().toString()); System.out.println("logprob: " + logprob); } } System.out.println(); } // totals System.out.println("total logprob: " + logprobttotal); System.out.println("logprob per sentence: " + (logprobttotal / numsents)); } } ================================================ FILE: src/opennlp/ccg/synsem/LF.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.unify.*; import org.jdom.*; import gnu.trove.*; /** * An interface for objects which represent Logical Forms. 
* * @author Jason Baldridge * @author Michael White * @version $Revision: 1.10 $, $Date: 2005/11/01 22:35:35 $ */ public interface LF extends Unifiable, Mutable { /** * Sets the LF chunks to which this LF belongs. * LF chunks are used during realization to ensure * that certain edges are semantically complete * before combination is attempted with edges * with semantics outside the chunk. * The chunks are numbered starting with 0, * and null represents no chunks. */ public void setChunks(TIntArrayList chunks); /** * Gets the LF chunks to which this LF belongs. */ public TIntArrayList getChunks(); /** Returns the simple type of this LF, or null if none. */ public SimpleType getType(); /** * Returns a copy of this LF. * (LF chunks are not copied.) */ public LF copy(); /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap); /** * Returns whether this LF equals the given object * up to variable names, using the given maps from vars to ints. */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2); /** * Returns an XML representation of this LF. */ public Element toXml(); /** * Returns a pretty-printed string of this LF, with the given indent. */ public String prettyPrint(String indent); } ================================================ FILE: src/opennlp/ccg/synsem/LexLogProbFeatureExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.util.*; import opennlp.ccg.perceptron.*; import opennlp.ccg.lexicon.SupertaggerAdapter; /** * A class for extracting total lexical log probabilities from a supertagger * as a feature. The class may also be used as a sign scorer. * * @author Michael White * @version $Revision: 1.3 $, $Date: 2009/11/01 22:26:29 $ */ public class LexLogProbFeatureExtractor implements FeatureExtractor, SignScorer { /** Feature key. */ public static String lexlogprobkey = "lexlogprob"; /** The alphabet. */ protected Alphabet alphabet = null; /** Lexical logprob feature. */ protected Alphabet.Feature lexlogprobFeature = null; /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet) { this.alphabet = alphabet; List keys = new ArrayList(1); keys.add(lexlogprobkey); lexlogprobFeature = alphabet.closed() ? alphabet.index(keys) : alphabet.add(keys); } /** Returns the features for the given sign and completeness flag. 
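// ---------------------------------------------------------------------------
// Minimal wiring sketch, not part of the original source: registering the
// extractor's single "lexlogprob" feature with a perceptron Alphabet and then
// pulling a feature vector for a sign. The Alphabet instance is assumed to be
// supplied by the surrounding perceptron model setup.
// ---------------------------------------------------------------------------
import opennlp.ccg.perceptron.Alphabet;
import opennlp.ccg.perceptron.FeatureVector;
import opennlp.ccg.synsem.LexLogProbFeatureExtractor;
import opennlp.ccg.synsem.Sign;

class LexLogProbWiringSketch {
    static FeatureVector lexLogProbFeature(Alphabet alphabet, Sign sign) {
        LexLogProbFeatureExtractor extractor = new LexLogProbFeatureExtractor();
        extractor.setAlphabet(alphabet);              // indexes or adds the "lexlogprob" key
        return extractor.extractFeatures(sign, true); // single-feature vector with the total
    }
}
// ---------------------------------------------------------------------------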
*/ public FeatureVector extractFeatures(Sign sign, boolean complete) { return lexLogProbVector(getLexLogProb(sign, complete)); } /** Recursively gets lex log prob total for the given sign, if not already present. */ protected float getLexLogProb(Sign sign, boolean complete) { // check for stored log prob SupertaggerAdapter.LexLogProb lexlogprob = (SupertaggerAdapter.LexLogProb) sign.getData(SupertaggerAdapter.LexLogProb.class); if (lexlogprob != null) return lexlogprob.logprob; // otherwise calculate and store one float logprob = 0; // lex case if (sign.isLexical()) { // just use zero if not already there } // non-terminal else { // use input totals to calculate current one Sign[] inputs = sign.getDerivationHistory().getInputs(); if (inputs.length == 1) logprob = getLexLogProb(inputs[0], false); else if (inputs.length == 2) logprob = getLexLogProb(inputs[0], false) + getLexLogProb(inputs[1], false); } // store it and return sign.addData(new SupertaggerAdapter.LexLogProb(logprob)); return logprob; } /** Returns a feature vector with the given lex log prob total. */ protected FeatureVector lexLogProbVector(float logprob) { FeatureList retval = new FeatureList(1); if (lexlogprobFeature != null) retval.add(lexlogprobFeature, logprob); return retval; } /** * Returns a score for the given sign and completeness flag; * specifically, returns the lex log prob total for the sign. */ public double score(Sign sign, boolean complete) { return getLexLogProb(sign, complete); } } ================================================ FILE: src/opennlp/ccg/synsem/LexSemOrigin.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2007 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; /** * An interface for items which introduce lexical semantics, covering * (lexical) signs and unary type changing rules. * * @author Michael White * @version $Revision: 1.2 $, $Date: 2008/01/03 21:30:12 $ */ public interface LexSemOrigin { /** * Returns the supertag. */ public String getSupertag(); /** * Returns the POS tag. * For unary type changing rules, the constant TypeChangingRule.POS_STRING is * always returned. */ public String getPOS(); /** * Sets the origin of the elementary predications. 
*/ public void setOrigin(); } ================================================ FILE: src/opennlp/ccg/synsem/Modality.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.unify.*; /** * A modality that can decorate a categorial slash. * * @author Jason Baldridge * @version $Revision: 1.2 $, $Date: 2004/05/01 10:40:04 $ */ public interface Modality extends Unifiable { public Object copy(); public byte getDirection(); public String toString(byte dir); public String toTeX(byte dir); public String toTeX(); } ================================================ FILE: src/opennlp/ccg/synsem/ReRankingScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; /** * Interface for signs scorers with both a base model and a full (reranking) model. * The implementation of the score(Sign, boolean) method should vary according to the * full model flag. The base model should be the default. */ public interface ReRankingScorer extends SignScorer { /** Sets flag for using full (vs. base) model. The base model should be the default. 
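// ---------------------------------------------------------------------------
// Minimal sketch, not part of the original source: one way to implement
// ReRankingScorer by delegating to a base SignScorer unless the full
// (reranking) model is switched on. The two component scorers are hypothetical
// and would be supplied by the caller.
// ---------------------------------------------------------------------------
import opennlp.ccg.synsem.ReRankingScorer;
import opennlp.ccg.synsem.Sign;
import opennlp.ccg.synsem.SignScorer;

class SimpleReRankingScorer implements ReRankingScorer {
    private final SignScorer baseModel, fullModel;
    private boolean useFull = false; // base model is the default
    SimpleReRankingScorer(SignScorer baseModel, SignScorer fullModel) {
        this.baseModel = baseModel;
        this.fullModel = fullModel;
    }
    public void setFullModel(boolean on) { useFull = on; }
    public double score(Sign sign, boolean complete) {
        return (useFull ? fullModel : baseModel).score(sign, complete);
    }
}
// ---------------------------------------------------------------------------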
*/ public void setFullModel(boolean on); } ================================================ FILE: src/opennlp/ccg/synsem/SetArg.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.unify.*; import org.jdom.*; import java.io.Serializable; import java.util.*; import gnu.trove.*; /** * A category which contains an unordered set of categories. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.11 $, $Date: 2009/12/21 02:15:44 $ */ public final class SetArg implements Arg, Serializable { private static final long serialVersionUID = -7067480310511294657L; private ArgStack _args; @SuppressWarnings("unchecked") public SetArg(Element el) { List info = el.getChildren(); List args = new ArrayList(); for (Iterator infoIt = info.iterator(); infoIt.hasNext();) { Slash s = new Slash(infoIt.next()); Category c = CatReader.getCat(infoIt.next()); args.add(new BasicArg(s, c)); } Arg[] list = new Arg[args.size()]; args.toArray(list); _args = new ArgStack(list); } public SetArg(Arg[] args) { _args = new ArgStack(args); } public SetArg(ArgStack args) { _args = args; } public Element toXml() { Element retval = new Element("setarg"); for (Arg arg : _args._list) { if (arg instanceof BasicArg) { // only supporting basic args per xml construction BasicArg barg = (BasicArg) arg; retval.addContent(barg.getSlash().toXml()); retval.addContent(barg.getCat().toXml()); } } return retval; } public Arg copy() { return new SetArg(_args.copy()); } public void add(ArgStack as) { _args.add(as); } public void forall(CategoryFcn fcn) { _args.forall(fcn); } public Arg copyWithout(int pos) { if (_args.size() == 2) { if (pos == 0) { return _args.get(1); } else { return _args.get(0); } } else { return new SetArg(_args.copyWithout(pos)); } } public int size() { return _args.size(); } public BasicArg get(int pos) { return (BasicArg) _args.get(pos); } public Category getCat(int pos) { return ((BasicArg) _args.get(pos)).getCat(); } public int indexOf(BasicArg a) { int index = -1; for (int i = 0; i < _args.size() && index < 0; i++) { try { a.unifySlash(((BasicArg) _args.get(i)).getSlash()); GUnifier.unify(getCat(i), a.getCat()); index = i; } catch (UnifyFailure uf) { } } // if (index<0) { // throw new UnifyFailure(); // } else { // return index; // } return index; } public int indexOf(Category cat) { int index = -1; for (int i = 0; i < _args.size() && index < 0; i++) { try { GUnifier.unify(getCat(i), cat); index = i; } catch (UnifyFailure uf) { } } return index; // if (index<0) { // throw new 
UnifyFailure(); // } else { // return index; // } } public void setSlashModifier(boolean modifier) { for (int i = 0; i < _args.size(); i++) { BasicArg arg = get(i); arg.setSlashModifier(modifier); } } public void setSlashHarmonicCompositionResult(boolean harmonicResult) { for (int i = 0; i < _args.size(); i++) { BasicArg arg = get(i); arg.setSlashHarmonicCompositionResult(harmonicResult); } } public boolean containsContrarySlash() { for (int i = 0; i < _args.size(); i++) { if (!((BasicArg) _args.get(i)).getSlash().sameDirAsModality()) { return true; } } return false; } public void unifySlash(Slash s) throws UnifyFailure { for (int i = 0; i < _args.size(); i++) { _args.get(i).unifySlash(s); } } public void unifyCheck(Object u) throws UnifyFailure { } // nb: direct unification not implemented ... public Object unify(Object u, Substitution sub) throws UnifyFailure { throw new UnifyFailure(); } public Object fill(Substitution s) throws UnifyFailure { return new SetArg(_args.fill(s)); } public void deepMap(ModFcn mf) { _args.deepMap(mf); } public boolean occurs(Variable v) { return _args.occurs(v); } public boolean equals(Object c) { return false; } public String toString() { StringBuffer sb = new StringBuffer(10); sb.append('{').append(_args.toString()).append('}'); return sb.toString(); } /** * Returns the supertag for this arg. */ public String getSupertag() { StringBuffer sb = new StringBuffer(); sb.append("{").append(_args.getSupertag()).append("}"); return sb.toString(); } /** * Returns a TeX-formatted string representation for this arg. */ public String toTeX() { StringBuffer sb = new StringBuffer(10); sb.append("\\{").append(_args.toTeX()).append("\\}"); return sb.toString(); } /** * Returns a hash code for this arg, using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { return _args.hashCode(varMap); } /** * Returns whether this arg equals the given object up to variable names, * using the given maps from vars to ints. */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (obj.getClass() != this.getClass()) { return false; } SetArg sa = (SetArg) obj; return _args.equals(sa._args, varMap, varMap2); } } ================================================ FILE: src/opennlp/ccg/synsem/Sign.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-9 Jason Baldridge, University of Edinburgh and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import opennlp.ccg.parse.*; import opennlp.ccg.util.*; import opennlp.ccg.lexicon.*; import opennlp.ccg.grammar.*; import opennlp.ccg.hylo.*; import org.jdom.*; import gnu.trove.*; import java.io.*; import java.util.*; /** * A CCG sign, consisting of a list of words paired with a category. * Signs may contain arbitrary data objects which are ignored in equality checking. * Non-serializable data objects are filtered during serialization. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.44 $, $Date: 2011/08/27 19:27:01 $ */ public class Sign implements LexSemOrigin, Serializable { private static final long serialVersionUID = 1072712272514007274L; /** The words. */ protected List _words; /** The category. */ protected Category _cat; /** The derivation history. */ protected DerivationHistory _history; /** The lexical head. */ protected Sign _lexHead; /** List of transient data objects, for retrieval by class. */ protected LinkedList data = null; /** Constructor for subclasses. */ protected Sign() {} /** Constructor with derivation history. */ @SuppressWarnings("unchecked") protected Sign(List words, Category cat, DerivationHistory dh, Sign lexHead) { _words = (List) Interner.globalIntern(words); _cat = cat; _history = dh; _lexHead = lexHead; } /** Constructor with no additional derivation history. */ public Sign(List words, Category cat) { this(words, cat, null, null); _history = new DerivationHistory(this); _lexHead = this; } /** Constructor with no additional derivation history. */ public Sign(Word word, Category cat) { this(new SingletonList(word), cat); } // during deserialization, interns words @SuppressWarnings("unchecked") private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); _words = (List) Interner.globalIntern(_words); } // during serialization, skips non-serializable data objects private void writeObject(java.io.ObjectOutputStream stream) throws IOException { // save old data objects LinkedList tmp = data; // filter non-serializable ones if (tmp != null) { data = new LinkedList(); for (Object obj : tmp) { if (obj instanceof Serializable) data.add(obj); } if (data.isEmpty()) data = null; } // serialize stream.defaultWriteObject(); // restore old data objects data = tmp; } /** Factory method for creating a sign from a lexical sign plus a coarticulation one. 
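// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source: because writeObject
// above filters non-Serializable data objects, transient caches attached via
// addData are silently dropped on serialization, while Serializable ones are
// kept. The two wrapper classes here are hypothetical.
// ---------------------------------------------------------------------------
import java.io.Serializable;
import opennlp.ccg.synsem.Sign;

class SignDataSerializationSketch {
    static final class TransientCache { }                       // dropped on write
    static final class PersistentNote implements Serializable { // kept on write
        private static final long serialVersionUID = 1L;
        final String note;
        PersistentNote(String note) { this.note = note; }
    }
    static void attach(Sign sign) {
        sign.addData(new TransientCache());
        sign.addData(new PersistentNote("kept across serialization"));
    }
}
// ---------------------------------------------------------------------------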
*/ public static Sign createCoartSign(Category cat, Sign lexSign, Sign coartSign) { List words = lexSign.getWords(); if (words.size() > 1) throw new RuntimeException("Can't create coarticulation sign from multiple words."); Word word = words.get(0); Word coartWord = coartSign.getWords().get(0); Word wordPlus = Word.createWordWithAttrs(word, coartWord); Sign retval = new Sign(new SingletonList(wordPlus), cat, null, null); retval._lexHead = retval; Rule coartRule = new Rule() { public String name() { return "coart"; } public int arity() { return 1; } public List applyRule(Category[] inputs) { throw new RuntimeException("Not supported."); } public RuleGroup getRuleGroup() { throw new RuntimeException("Not supported."); } public void setRuleGroup(RuleGroup ruleGroup) { throw new RuntimeException("Not supported."); } public Element toXml() { throw new RuntimeException("Not supported."); } }; retval._history = new DerivationHistory(new Sign[]{lexSign,coartSign}, retval, coartRule); return retval; } /** Factory method for creating derived signs with the given cat from the given inputs, rule and lex head. */ public static Sign createDerivedSign(Category cat, Sign[] inputs, Rule rule, Sign lexHead) { return new Sign(cat, inputs, rule, lexHead); } /** Factory method for creating derived signs from the given result cat, inputs, rule and lex head, with a new LF constructed from the inputs. Note that unlike with rule applications, the result LF is constructed with no var substitutions, so it is useful only for creating alternative signs during realization. */ public static Sign createDerivedSignWithNewLF(Category cat, Sign[] inputs, Rule rule, Sign lexHead) { Category copyCat = cat.shallowCopy(); LF lf = null; for (int i = 0; i < inputs.length; i++) { lf = HyloHelper.append(lf, inputs[i].getCategory().getLF()); } if (rule instanceof TypeChangingRule) { TypeChangingRule tcr = (TypeChangingRule) rule; lf = HyloHelper.append(lf, tcr.getResult().getLF()); } if (lf != null) { HyloHelper.sort(lf); } copyCat.setLF(lf); return new Sign(copyCat, inputs, rule, lexHead); } /** Constructor with words and derivation history formed from the given inputs, rule and lex head. */ protected Sign(Category cat, Sign[] inputs, Rule rule, Sign lexHead) { this(getRemainingWords(inputs, 0), cat, null, lexHead); _history = new DerivationHistory(inputs, this, rule); } // returns the remaining words in a structure sharing way private static List getRemainingWords(Sign[] inputs, int index) { // if (inputs.length == 0) throw new RuntimeException("Error: can't make sign from zero inputs"); if (index == (inputs.length - 1)) return inputs[index]._words; return new StructureSharingList( inputs[index]._words, getRemainingWords(inputs, index+1) ); } /** Returns the words of the sign. */ public List getWords() { return _words; } /** Returns the words as a string. Delegates to the current tokenizer's getOrthography method. */ public String getOrthography() { return Grammar.theGrammar.lexicon.tokenizer.getOrthography(_words); } /** Returns the sign's category. */ public Category getCategory() { return _cat; } /** Returns whether the sign is lexical. */ public boolean isLexical() { return _history.isEmpty(); } /** Sets the derivation history. */ public void setDerivationHistory(DerivationHistory dh) { _history = dh; } /** Returns the derivation history. */ public DerivationHistory getDerivationHistory() { return _history; } /** Returns the lexical head. */ public Sign getLexHead() { return _lexHead; } /** Returns a hash code for this sign. 
*/ public int hashCode() { return System.identityHashCode(_words) + _cat.hashCode(); } /** Returns whether this sign equals the given object. */ public boolean equals(Object obj) { if (obj == this) return true; if (!(obj instanceof Sign)) return false; Sign sign = (Sign) obj; return _words == sign._words && _cat.equals(sign._cat); } /** * Returns a hash code for this sign with the words restricted to surface words; * with lexical signs, however, the original hash code is returned, so that * words with signs that differ just in their pos tags can be distinguished * (for robustness). */ public int surfaceWordHashCode() { return surfaceWordHashCode(false); } /** * Returns a hash code for this sign with the words restricted to surface words, * and with the LF ignored according to the given flag; * with lexical signs, however, the original hash code is returned, so that * words with signs that differ just in their pos tags can be distinguished * (for robustness). */ public int surfaceWordHashCode(boolean ignoreLF) { // original hash code for lex signs if (_history.getInputs() == null) return hashCode(); // otherwise use surface words int hc = 1; for (int i = 0; i < _words.size(); i++) { Word word = _words.get(i); hc = 31*hc + word.surfaceWordHashCode(); } hc += (ignoreLF) ? _cat.hashCodeNoLF() : _cat.hashCode(); return hc; } /** * Returns whether this sign and the given object have equal categories and * restrictions to surface words; * with lexical signs, however, the original equals result is returned, so that * words with signs that differ just in their pos tags can be distinguished * (for robustness). */ public boolean surfaceWordEquals(Object obj) { return surfaceWordEquals(obj, false); } /** * Returns whether this sign and the given object have equal categories and * restrictions to surface words, * with the LF ignored according to the given flag; * with lexical signs, however, the original equals result is returned, so that * words with signs that differ just in their pos tags can be distinguished * (for robustness). */ public boolean surfaceWordEquals(Object obj, boolean ignoreLF) { if (obj == this) return true; if (!(obj instanceof Sign)) return false; Sign sign = (Sign) obj; // original equals for lex signs if (_history.getInputs() == null || sign._history.getInputs() == null) return equals(sign); // otherwise use surface words if (_words.size() != sign._words.size()) return false; for (int i = 0; i < _words.size(); i++) { Word word = _words.get(i); Word signWord = (Word) sign._words.get(i); if (!word.surfaceWordEquals(signWord)) return false; } return (ignoreLF) ? _cat.equalsNoLF(sign._cat) : _cat.equals(sign._cat); } /** Returns 'orthography :- category'. */ public String toString() { return getOrthography() + " :- " + _cat.toString(); // for lex head: + " --> " + _lexHead.getWordForm(); } /** * Returns the words in an XML doc, with no labeled spans for nominals. */ public Document getWordsInXml() { Set emptySet = Collections.emptySet(); return getWordsInXml(emptySet); } /** * Returns the words in an XML doc, with labeled spans for the given nominals, * and with pitch accents and boundary tones converted to elements. * Each orthographic word appears in a separate element, * with multiwords grouped under a multiword element. * Attribute-value pairs for the word (if any) appear on the word * or multiword element. * Words are also expanded using the grammar's tokenizer. 
*/ public Document getWordsInXml(Set nominals) { TObjectIntHashMap nominalsMap = new TObjectIntHashMap(); setMaxOrthLengths(nominals, nominalsMap); Document doc = new Document(); Element root = new Element("seg"); doc.setRootElement(root); addWordsToXml(root, nominalsMap); return doc; } // finds the maximum orthography lengths for signs headed by the given nominals private void setMaxOrthLengths(Set nominals, TObjectIntHashMap nominalsMap) { // update map Nominal index = _cat.getIndexNominal(); if (index != null && nominals.contains(index)) { int orthLen = getOrthography().length(); if (!nominalsMap.containsKey(index) || orthLen > nominalsMap.get(index)) { nominalsMap.put(index, orthLen); } } // recurse Sign[] inputs = _history.getInputs(); if (inputs == null) return; for (int i = 0; i < inputs.length; i++) { inputs[i].setMaxOrthLengths(nominals, nominalsMap); } } // recursively adds orthographic words as XML to the given parent, // using the nominals map to determine labeled spans private void addWordsToXml(Element parent, TObjectIntHashMap nominalsMap) { // check for matching nominal as index of target cat; // if found, update parent to labeled span element Nominal index = _cat.getIndexNominal(); if (index != null && nominalsMap.containsKey(index) && nominalsMap.get(index) == getOrthography().length()) { // remove index key from map, to avoid duplicate spans with the same length nominalsMap.remove(index); // make span element, update parent Element span = new Element("span"); span.setAttribute("label", index.toString()); parent.addContent(span); parent = span; } // process inputs from derivation history Sign[] inputs = _history.getInputs(); if (inputs == null) { // in leaf case, word list must be a singleton Word word = _words.get(0); // check for boundary tone if (Grammar.isBoundaryTone(word.getForm())) { // add element for boundary tone Element boundary = new Element("boundary"); boundary.setAttribute("type", word.getForm()); parent.addContent(boundary); return; } // check for pitch accent if (word.getPitchAccent() != null) { // add pitchaccent element containing word(s) with corresponding accent Element pitchaccent = new Element("pitchaccent"); pitchaccent.setAttribute("type", word.getPitchAccent()); addWords(pitchaccent, word); parent.addContent(pitchaccent); return; } // otherwise add word(s) addWords(parent, word); return; } if (inputs.length == 1) { inputs[0].addWordsToXml(parent, nominalsMap); return; } for (int i = 0; i < inputs.length; i++) { inputs[i].addWordsToXml(parent, nominalsMap); } } // adds one or more word elements after expanding surface form; // multiwords are enclosed within a multiword element; // any attribute-value pairs are added to the word or multiword element private void addWords(Element parent, Word word) { List orthWords = Grammar.theGrammar.lexicon.tokenizer.expandWord(word); Element child; if (orthWords.size() == 1) { Element wordElt = new Element("word"); wordElt.addContent(orthWords.get(0)); child = wordElt; } else { Element multiwordElt = new Element("multiword"); for (int i = 0; i < orthWords.size(); i++) { Element wordElt = new Element("word"); wordElt.addContent(orthWords.get(i)); multiwordElt.addContent(wordElt); } child = multiwordElt; } for (Iterator> it = word.getAttrValPairs(); it.hasNext(); ) { Pair p = it.next(); String attr = p.a; String val = p.b; child.setAttribute(attr, val); } parent.addContent(child); } /** * Returns a string showing the bracketings implied by the derivation. 
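// ---------------------------------------------------------------------------
// Minimal usage sketch, not part of the original source: rendering the XML
// word structure for a sign, labeling the span headed by a chosen nominal.
// The set element type is assumed to be Nominal, and JDOM 1.x's XMLOutputter
// is used for serialization, matching the org.jdom imports above.
// ---------------------------------------------------------------------------
import java.util.Collections;
import java.util.Set;
import org.jdom.Document;
import org.jdom.output.XMLOutputter;
import opennlp.ccg.hylo.Nominal;
import opennlp.ccg.synsem.Sign;

class WordsInXmlSketch {
    static String render(Sign sign, Nominal head) {
        Set<Nominal> nominals = Collections.singleton(head); // label this span
        Document doc = sign.getWordsInXml(nominals);
        return new XMLOutputter().outputString(doc);
    }
}
// ---------------------------------------------------------------------------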
* See DerivationHistory.toString to see the complete derivation in * vertical list form. */ public String getBracketedString() { Sign[] inputs = _history.getInputs(); if (inputs == null) return getOrthography(); if (inputs.length == 1) return inputs[0].getBracketedString(); StringBuffer sb = new StringBuffer(); sb.append("("); for (int i = 0; i < inputs.length; i++) { sb.append(inputs[i].getBracketedString()); if (i < (inputs.length - 1)) sb.append(" "); } sb.append(")"); return sb.toString(); } /** * Returns the category's supertag. */ public String getSupertag() { return _cat.getSupertag(); } /** * Returns the word form of the first word. */ public String getWordForm() { return _words.get(0).getForm(); } /** * Returns the POS tag of the first word. */ public String getPOS() { return _words.get(0).getPOS(); } /** * Sets the origin of the elementary predications. */ public void setOrigin() { HyloHelper.setOrigin(_cat.getLF(), this); } /** * Returns the index of the first word of the given lex sign in this sign's * list of words, or -1 if the given lex sign is not in this sign's derivation * history. */ public int wordIndex(Sign lexSign) { return wordIndex(lexSign, new int[]{0}); } // returns word index relative to input offset private int wordIndex(Sign lexSign, int[] offset) { if (this == lexSign) return offset[0]; if (isLexical()) { offset[0] += _words.size(); return -1; } Sign[] inputs = _history.getInputs(); for (int i = 0; i < inputs.length; i++) { int retval = inputs[i].wordIndex(lexSign, offset); if (retval >= 0) return retval; } return -1; } /** Adds a data object to the front of the list of data objects. */ public void addData(Object obj) { if (data == null) data = new LinkedList(); data.addFirst(obj); } /** Returns the first data object with the given class, or null if none. */ public Object getData(Class objClass) { if (data == null) return null; for (Object obj : data) { if (obj.getClass() == objClass) return obj; } return null; } /** Unfilled dependencies wrapper, for unique retrieval from data objects. */ public static class UnfilledDeps { public List unfilledDeps; public UnfilledDeps(List unfilledDeps) { this.unfilledDeps = unfilledDeps; } } /** Filled dependencies wrapper, for unique retrieval from data objects. */ public static class FilledDeps { public List filledDeps; public FilledDeps(List filledDeps) { this.filledDeps = filledDeps; } } /** Returns the unfilled dependencies for this sign, with caching. */ public List getUnfilledDeps() { // check cache UnfilledDeps udeps = (UnfilledDeps) getData(UnfilledDeps.class); if (udeps != null) return udeps.unfilledDeps; // lex case: calculate, store and return if (isLexical()) { List unfilledDeps = HyloHelper.getUnfilledLexDeps(_cat.getLF()); addData(new UnfilledDeps(unfilledDeps)); return unfilledDeps; } // otherwise compute filled deps, with unfilled determined as a side effect, and return cached result getFilledDeps(); udeps = (UnfilledDeps) getData(UnfilledDeps.class); return udeps.unfilledDeps; } /** Returns the filled dependencies for this sign, with caching. 
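// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source: collecting the filled
// lexical dependencies for a derivation and printing head and dependent word
// forms. LexDependency is assumed to live in opennlp.ccg.hylo with public
// Sign-valued lexHead/lexDep fields, as suggested by getSignHeadedByDep below.
// ---------------------------------------------------------------------------
import java.util.List;
import opennlp.ccg.hylo.LexDependency;
import opennlp.ccg.synsem.Sign;

class FilledDepsSketch {
    static void printDeps(Sign derivation) {
        List<LexDependency> deps = derivation.getFilledDeps(); // cached per sign
        for (LexDependency dep : deps)
            System.out.println(dep.lexHead.getWordForm() + " -> " + dep.lexDep.getWordForm());
    }
}
// ---------------------------------------------------------------------------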
*/ public List getFilledDeps() { // skip lex case if (isLexical()) return Collections.emptyList(); // check cache FilledDeps fdeps = (FilledDeps) getData(FilledDeps.class); if (fdeps != null) return fdeps.filledDeps; // otherwise get unfilled deps from children recursively List unfilledDeps = new ArrayList(5); Sign[] inputs = _history.getInputs(); for (int i = 0; i < inputs.length; i++) { unfilledDeps.addAll(inputs[i].getUnfilledDeps()); } // calculate filled deps List filledDeps = HyloHelper.getFilledLexDeps(unfilledDeps, _cat.getLF()); // store filled and unfilled, returning filled addData(new UnfilledDeps(unfilledDeps)); addData(new FilledDeps(filledDeps)); return filledDeps; } /** * Returns the sibling filled dependencies for this sign by recursively * filtering the filled dependencies from the input signs for those with * the same head. */ public List getSiblingFilledDeps() { List filledDeps = getFilledDeps(); if (filledDeps.isEmpty()) return Collections.emptyList(); List retval = new ArrayList(5); Sign[] inputs = _history.getInputs(); for (int i = 0; i < inputs.length; i++) { inputs[i].addSiblingFilledDeps(retval, filledDeps); } return retval; } // recursively adds sibling filled deps until lex items reached or // sibs with different heads found private void addSiblingFilledDeps(List retval, List filledDeps) { if (isLexical()) return; List candDeps = getFilledDeps(); if (!candDeps.isEmpty()) { List sibs = LexDependency.filterSameHead(candDeps, filledDeps); if (sibs.isEmpty()) return; retval.addAll(sibs); } Sign[] inputs = _history.getInputs(); for (int i = 0; i < inputs.length; i++) { inputs[i].addSiblingFilledDeps(retval, filledDeps); } } /** * Returns the descendant sign headed by the given dependent * by recursing through the input signs as long as the head remains * the same as the given head; otherwise returns null. */ public Sign getSignHeadedByDep(LexDependency lexdep) { // check same head if (!isLexical() && _lexHead == lexdep.lexHead) { Sign[] inputs = _history.getInputs(); for (int i = 0; i < inputs.length; i++) { // check for match if (inputs[i]._lexHead == lexdep.lexDep) return inputs[i]; // found it // otherwise recurse Sign retval = inputs[i].getSignHeadedByDep(lexdep); if (retval != null) return retval; } } // otherwise not found return null; } /** Tests serialization of simple types, including resolution. 
*/ public void debugSerialization() throws IOException, ClassNotFoundException { // test serialization String filename = "tmp.ser"; ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(filename)); System.out.println("Writing this: " + this); System.out.println(this.getDerivationHistory()); out.writeObject(this); out.close(); ObjectInputStream in = new ObjectInputStream(new FileInputStream(filename)); System.out.print("Reading sign: "); Sign sign = (Sign) in.readObject(); System.out.println(sign); System.out.println(sign.getDerivationHistory()); in.close(); // test identity and equality System.out.println("this == sign?: " + (this == sign)); System.out.println("this.equals(sign)?: " + (this.equals(sign))); } } ================================================ FILE: src/opennlp/ccg/synsem/SignHash.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-5 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import gnu.trove.*; import java.util.*; import opennlp.ccg.lexicon.Word; /** * A set of signs, unique up to surface words. * Signs with lower derivational complexity are kept during insertion. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.13 $, $Date: 2009/12/21 02:15:44 $ */ public class SignHash extends THashSet { private static final long serialVersionUID = 1L; /** Hashing strategy that uses Sign's surfaceWordHashCode and surfaceWordEquals methods. */ protected static TObjectHashingStrategy surfaceWordHashingStrategy = new TObjectHashingStrategy() { private static final long serialVersionUID = 1L; public int computeHashCode(java.lang.Object o) { return ((Sign)o).surfaceWordHashCode(); } public boolean equals(java.lang.Object o1, java.lang.Object o2) { return ((Sign)o1).surfaceWordEquals((Sign)o2); } }; /** Default constructor. */ public SignHash() { super(surfaceWordHashingStrategy); } /** * Constructor which adds one sign. */ public SignHash(Sign sign) { this(); insert(sign); } /** * Constructor which adds a collection of signs. */ public SignHash(Collection c) { this(); for (Sign s : c) insert(s); } /** * Returns this as a set of signs. */ @SuppressWarnings("unchecked") public Set asSignSet() { return (Set) this; } /** * Adds a sign, keeping the one with lower derivational complexity * if there is an equivalent one there already; returns the old * sign if it was displaced, the new sign if there was no equivalent * old sign, or null if the sign was not actually added. 
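// ---------------------------------------------------------------------------
// Minimal sketch, not part of the original source: SignHash keeps one sign per
// surface-word/category equivalence class, preferring lower derivational
// complexity on insertion; here we simply collect candidate signs and read
// them back in the stable sorted order. Collection/list element types are
// assumed to be Sign.
// ---------------------------------------------------------------------------
import java.util.Collection;
import java.util.List;
import opennlp.ccg.synsem.Sign;
import opennlp.ccg.synsem.SignHash;

class SignHashSketch {
    static List<Sign> dedupe(Collection<Sign> candidates) {
        SignHash hash = new SignHash(candidates); // inserts each, keeping the simpler ones
        return hash.getSignsSorted();             // persistent ordering for output
    }
}
// ---------------------------------------------------------------------------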
*/ public Sign insert(Sign sign) { int pos = index(sign); if (pos >= 0) { Sign oldSign = (Sign) _set[pos]; if (oldSign == sign) return null; if (sign.getDerivationHistory().compareTo(oldSign.getDerivationHistory()) < 0) { _set[pos] = sign; return oldSign; } else return null; } else { add(sign); return sign; } } /** Returns the signs sorted by their words lexicographically. */ public List getSignsSorted() { ArrayList retval = new ArrayList(asSignSet()); Collections.sort(retval, signComparator); return retval; } /** Comparator for signs to provide a persistent ordering. */ public static final Comparator signComparator = new Comparator() { public int compare(Sign sign1, Sign sign2) { return compareTo(sign1, sign2); } }; /** Compares signs by their derivation complexity, lists of words, then (somewhat desperately) cat hash codes. */ public static int compareTo(Sign sign1, Sign sign2) { int cmp = 0; cmp = sign1.getDerivationHistory().compareTo(sign2.getDerivationHistory()); if (cmp != 0) return cmp; List words1 = sign1.getWords(); List words2 = sign2.getWords(); cmp = compareTo(words1, words2); if (cmp != 0) return cmp; // TODO: implement compareTo method on categories int h1 = sign1.getCategory().hashCode(); int h2 = sign2.getCategory().hashCode(); if (h1 < h2) return -1; if (h1 > h2) return 1; return 0; } /** Compares lists of words lexicographically. */ public static int compareTo(List words1, List words2) { int i=0; while (i < words1.size() || i < words2.size()) { if (i == words1.size()) return -1; if (i == words2.size()) return 1; Word w1 = words1.get(i); Word w2 = words2.get(i); int cmp = w1.compareTo(w2); if (cmp != 0) return cmp; i++; } return 0; } } ================================================ FILE: src/opennlp/ccg/synsem/SignScorer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.util.Random; /** * Interface for sign scoring models. * * @author Michael White * @version $Revision: 1.2 $, $Date: 2008/11/09 02:59:49 $ */ public interface SignScorer { /** * Returns a score for the given sign and completeness flag, where higher * numbers are better than lower numbers. * When normalized, returns a score between 0 (worst) and 1 (best). */ public double score(Sign sign, boolean complete); /** A scorer that returns 0 for all signs. */ public static SignScorer nullScorer = new SignScorer() { public double score(Sign sign, boolean complete) { return 0; } }; /** A scorer that returns a random number in [0,1] for all signs. 
*/ public static SignScorer randomScorer = new SignScorer() { Random random = new Random(); public double score(Sign sign, boolean complete) { return random.nextDouble(); } }; } ================================================ FILE: src/opennlp/ccg/synsem/Slash.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-7 Jason Baldridge, Gann Bierner and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.io.Serializable; import gnu.trove.*; import org.jdom.Element; import opennlp.ccg.unify.*; /** * A categorial slash which has an optional mode associated with it. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.10 $, $Date: 2009/11/28 03:39:27 $ */ public final class Slash implements Unifiable, Mutable, Serializable { private static final long serialVersionUID = -1935688863458012637L; public static final byte L = 0; public static final byte B = 1; public static final byte R = 2; public static final byte INERT_OR_ACTIVE = 0; public static final byte ACTIVE = 1; public static final byte INERT = 2; private final byte _dir; private final Modality _modality; private byte _ability = INERT_OR_ACTIVE; private boolean _modifier = false; private boolean _harmonicCompositionResult = false; public Slash(Element el) { String d = el.getAttributeValue("dir"); if (d == null) d = el.getAttributeValue("d"); if (d == null) { d = "|"; } _dir = encode(d.charAt(0)); String m = el.getAttributeValue("mode"); if (m == null) m = el.getAttributeValue("m"); if (m != null) { _modality = new SlashMode(m); } else { String vm = el.getAttributeValue("varmodality"); if (vm == null) vm = el.getAttributeValue("varModality"); if (vm != null) { _modality = new VarModality(vm); } else { _modality = new SlashMode(); } } String ability = el.getAttributeValue("ability"); if (null != ability) { setAbility(ability); } } public Slash() { this('|'); } public Slash(char sd) { _dir = encode(sd); _modality = new SlashMode(); } public Slash(char sd, String md) { _dir = encode(sd); _modality = new SlashMode(md); } public Slash(char sd, Modality md) { _dir = encode(sd); _modality = md; } private Slash(byte d, Modality m, byte a) { _dir = d; _modality = m; _ability = a; } public Element toXml() { Element retval = new Element("slash"); retval.setAttribute("dir", encode()); String ability = decodeAbility(); if (_modality instanceof SlashMode) { String mode = _modality.toString(); if (!mode.equals(".")) retval.setAttribute("mode", mode); } else if (_modality instanceof VarModality) retval.setAttribute("varmodality", ((VarModality) _modality).name()); if (ability != null) 
retval.setAttribute("ability", ability); return retval; } public Slash copy() { Slash retval = new Slash(_dir, (Modality) _modality.copy(), _ability); retval._modifier = _modifier; retval._harmonicCompositionResult = _harmonicCompositionResult; return retval; } public boolean occurs(Variable v) { return _modality.occurs(v); } public void deepMap(ModFcn mf) { mf.modify(this); } public boolean isActive() { return _ability == ACTIVE || _ability == INERT_OR_ACTIVE; } public boolean setAbility(String ability) { byte newAbility; if (ability.equals("inert")) { newAbility = INERT; } else if (ability.equals("active")) { newAbility = ACTIVE; } else { newAbility = INERT_OR_ACTIVE; } if (abilitiesMatch(_ability, newAbility)) { _ability = newAbility; return true; } else { return false; } } /** Returns a string for the ability or null if not set. */ public String decodeAbility() { if (_ability == INERT) return "inert"; else if (_ability == ACTIVE) return "active"; else return null; } /** Returns whether this cat is a modifier cat (defaults to false). */ public boolean isModifier() { return _modifier; } /** Sets whether this cat is a modifier cat. */ // NB: Might want to change this allow lex overrides of defaults public void setModifier(boolean modifier) { _modifier = modifier; } /** Returns whether this arg has resulted from harmonic composition. */ public boolean isHarmonicCompositionResult() { return _harmonicCompositionResult; } /** Sets whether this arg has resulted from harmonic composition. */ public void setHarmonicCompositionResult(boolean harmonicResult) { _harmonicCompositionResult = harmonicResult; } public void unifyCheck(Object u) throws UnifyFailure { if (u instanceof Slash) { if (!abilitiesMatch(_ability, ((Slash) u)._ability)) { throw new UnifyFailure(); } if (!directionsMatch(_dir, ((Slash) u)._dir)) { throw new UnifyFailure(); } _modality.unifyCheck(((Slash) u)._modality); } else { throw new UnifyFailure(); } } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof Slash) { Slash s2 = (Slash) u; byte newAbility = _ability; if (_ability == INERT_OR_ACTIVE) { newAbility = s2._ability; } else if (s2._ability == INERT_OR_ACTIVE) { newAbility = _ability; } else if (_ability != s2._ability) { throw new UnifyFailure(); } byte newDir = _dir; if (_dir == B) { newDir = s2._dir; } else if (s2._dir == B) { newDir = _dir; } else if (_dir != s2._dir) { throw new UnifyFailure(); } Modality newModality = (Modality) _modality.unify(((Slash) u)._modality, sub); Slash retval = new Slash(newDir, newModality, newAbility); retval._modifier = _modifier; return retval; } else { throw new UnifyFailure(); } } public Object fill(Substitution sub) throws UnifyFailure { Slash retval = new Slash(_dir, (Modality) _modality.fill(sub), _ability); retval._modifier = _modifier; return retval; } // public boolean equals(Slash s) { // return directionsMatch(_dir, s._dir); // } public boolean sameDirAsModality() { return directionsMatch(_dir, _modality.getDirection()); } private static byte encode(char sd) { switch (sd) { case '/': return R; case '\\': return L; default: return B; } } public static boolean directionsMatch(byte s1, byte s2) { if (s1 == B || s2 == B) { return true; } else { return s1 == s2; } } private static boolean abilitiesMatch(byte ab1, byte ab2) { if (ab1 == INERT_OR_ACTIVE || ab2 == INERT_OR_ACTIVE) { return true; } else { return ab1 == ab2; } } /** * Returns a hash code based on the direction, ability and modality. 
*/ public int hashCode() { return 31 * _dir + 7 * _ability + _modality.hashCode(); } /** * Returns whether this slash equals the given object * based on the direction, ability and modality. */ public boolean equals(Object obj) { if (this == obj) return true; if (obj.getClass() != this.getClass()) return false; Slash s = (Slash) obj; if (_dir != s._dir ||_ability != s._ability) return false; return _modality.equals(s._modality); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { int retval = 31 * _dir + 7 * _ability; if (_modality instanceof Variable) retval += ((Variable)_modality).hashCode(varMap); else retval += _modality.hashCode(); return retval; } /** * Returns whether this slash equals the given object up to variable names, * using the given maps from vars to ints. */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (this == obj) return true; if (obj.getClass() != this.getClass()) return false; Slash s = (Slash) obj; if (_dir != s._dir || _ability != s._ability) return false; if (_modality instanceof Variable) return ((Variable)_modality).equals(s._modality, varMap, varMap2); else return _modality.equals(s._modality); } // string for showing ability private String abilityStr() { if (_ability == ACTIVE) return "@"; else if (_ability == INERT) return "!"; else return ""; } public String toString() { switch (_dir) { case R: return "/" + abilityStr() + _modality.toString(R); case L: return "\\" + abilityStr() + _modality.toString(L); default: return "|" + abilityStr() + _modality; } } /** * Returns the direction for this slash as a string. */ public String encode() { switch (_dir) { case R: return "/"; case L: return "\\"; default: return "|"; } } /** * Returns the supertag for this slash. */ public String getSupertag() { return encode(); } /** * Returns a TeX-formatted string representation for this slash. */ public String toTeX() { StringBuffer sb = new StringBuffer(); String sup = "\\sups"; String sub = "\\subs"; String modTeX = null; switch (_dir) { case R: sb.append("/ "); sup = "\\supsb"; sub = "\\subsa"; modTeX = _modality.toTeX(R); break; case L: sb.append("\\bs "); sub = "\\subsb"; sup = "\\supsa"; modTeX = _modality.toTeX(L); break; default: sb.append("| "); sub = "\\subs"; sup = "\\sups"; modTeX = _modality.toTeX(); break; } if (_ability == ACTIVE) sb.append(sup).append("{").append("+").append("} "); if (_ability == INERT) sb.append(sup).append("{").append("-").append("} "); if ((modTeX != "") && (_ability == ACTIVE)) { if ((_dir == R)) sb.append("\\hspace{-1.45mm} "); else if (_dir == L) sb.append("\\hspace{-0.50mm} "); } if (modTeX != "") sb.append(sub).append("{").append(modTeX).append("} "); return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/synsem/SlashMode.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.io.Serializable; import org.jdom.Element; import opennlp.ccg.unify.*; /** * A mode that can decorate a categorial slash. * * @author Jason Baldridge * @version $Revision: 1.5 $, $Date: 2009/07/17 04:23:30 $ */ public final class SlashMode implements Modality, Serializable { private static final long serialVersionUID = -2387797559890373347L; public static final byte All = 0; public static final byte ApplicationOnly = 1; public static final byte Associative = 2; public static final byte Permutative = 3; public static final byte PermutativeRight = 4; public static final byte PermutativeLeft = 5; public static final byte APRight = 6; public static final byte APLeft = 7; private byte _mode; public SlashMode(Element el) { String m = el.getAttributeValue("mode"); if (m == null) m = el.getAttributeValue("m"); if (m == null) { m = "."; } _mode = byteVal(m); } public SlashMode() { this("."); } public SlashMode(String m) { _mode = byteVal(m); } private SlashMode(byte m) { _mode = m; } public Object copy() { return new SlashMode(_mode); } /** Returns a hash code based on the mode. */ public int hashCode() { return 31 * _mode; } /** Returns whether this slash mode equals the given object based on the mode. */ public boolean equals(Object obj) { if (this == obj) return true; if (obj.getClass() != this.getClass()) return false; SlashMode m = (SlashMode) obj; return _mode == m._mode; } // public boolean equals(SlashMode m) { // return _mode == m._mode; // } public boolean occurs(Variable var) { return false; } public void unifyCheck(Object o) throws UnifyFailure { if (!(o instanceof VarModality || (o instanceof SlashMode && modesMatch( _mode, ((SlashMode) o)._mode)))) { throw new UnifyFailure(); } } public Object unify(Object o, Substitution sub) throws UnifyFailure { if (o instanceof VarModality) { return ((VarModality) o).unify(this, sub); } else if (o instanceof SlashMode) { if (modesMatch(_mode, ((SlashMode) o)._mode)) { return copy(); } else { throw new UnifyFailure(); } } else { throw new UnifyFailure(); } } public Object fill(Substitution sub) throws UnifyFailure { return copy(); } public String toString(byte slashDir) { if (slashDir == getDirection()) { switch (_mode) { case PermutativeRight: return "x"; case PermutativeLeft: return "x"; case APRight: return ""; case APLeft: return ""; default: return toString(); } } else { return toString(); } } public String toString() { switch (_mode) { case All: return "."; case ApplicationOnly: return "*"; case Associative: return "^"; case Permutative: return "x"; case PermutativeRight: return "x>"; case PermutativeLeft: return "<x"; case APRight: return ">"; case APLeft: return "<"; default: return "."; } } public String toTeX(byte slashDir) { if (slashDir == getDirection()) { switch (_mode) { case PermutativeRight: return "x"; case PermutativeLeft: return "x"; case APRight: return ""; case APLeft: return ""; default: return toTeX(); } } else { return toTeX(); } } public String toTeX() { switch (_mode) { case All: return "."; case ApplicationOnly: return "*"; case Associative: return "\\diamond"; case Permutative: return "x"; case PermutativeRight: return "x>"; case PermutativeLeft: return
""; case APLeft: return "<"; default: return "."; } } private static byte byteVal(String m) { if (m.equals(".")) { return All; } else if (m.equals(">")) { return APRight; } else if (m.equals("<")) { return APLeft; } else if (m.equals("*")) { return ApplicationOnly; } else if (m.equals("^")) { return Associative; } else if (m.equals("x")) { return Permutative; } else if (m.equals("x>")) { return PermutativeRight; } else if (m.equals(" " + doMatch); return doMatch; } } ================================================ FILE: src/opennlp/ccg/synsem/SyntacticFeatureExtractor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2009 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.util.*; import opennlp.ccg.perceptron.*; import opennlp.ccg.util.TrieMap; import opennlp.ccg.lexicon.Word; /** * A class for extracting generic features from derivations, * inspired by those in the C&C-07 normal form model. * * Features are extracted lazily for efficiency. * * @author Michael White * @version $Revision: 1.10 $, $Date: 2011/11/08 14:58:15 $ */ public class SyntacticFeatureExtractor implements FeatureExtractor { /** Feature prefix constant: "syn". */ public static final String PREFIX = "syn"; /** Feature map wrapper, for unique retrieval from a sign's data objects. */ public static class FeatureMapWrapper { public FeatureMap featureMap; public FeatureMapWrapper(FeatureMap featureMap) { this.featureMap = featureMap; } } /** Flag for whether to include distance features (defaults to true). */ public boolean includeDistFeats = true; /** The alphabet. */ protected Alphabet alphabet = null; /** Current feature map. */ protected FeatureMap currentMap = null; /** Current sign (for extracting features). */ protected Sign currentSign = null; /** Current input signs (for extracting features). */ protected Sign[] currentInputs = null; /** Current sibling (for extracting features). */ protected Sign currentSibling = null; /** Current words (for extracting features). */ protected List currentWords = null; /** Current head index (for extracting features). */ protected int currentHeadIndex = -1; /** Current sibling head index (for extracting features). */ protected int currentSibHeadIndex = -1; /** Current distance in words (for extracting features). */ protected String currentDistW = null; /** Current distance in puncts (for extracting features). */ protected String currentDistP = null; /** Current distance in verbs (for extracting features). */ protected String currentDistV = null; /** Lexical feature extractors. */ protected List>> lexExtractors = new ArrayList>>(); /** Rule feature extractors. 
*/ protected List>> unaryRuleExtractors = new ArrayList>>(); /** Binary rule feature extractors. */ protected List>> binaryRuleExtractors = new ArrayList>>(); /** Distance feature extractors. */ protected List>> distExtractors = new ArrayList>>(); /** Constructor. */ public SyntacticFeatureExtractor() { // init lazy feature extractors lexExtractors.add(lexcat_word()); lexExtractors.add(lexcat_pos()); unaryRuleExtractors.add(unary_rule()); unaryRuleExtractors.add(unary_rule_word()); unaryRuleExtractors.add(unary_rule_pos()); binaryRuleExtractors.add(binary_rule()); binaryRuleExtractors.add(binary_rule_word()); binaryRuleExtractors.add(binary_rule_pos()); binaryRuleExtractors.add(rule_word_word()); binaryRuleExtractors.add(rule_word_pos()); binaryRuleExtractors.add(rule_pos_word()); binaryRuleExtractors.add(rule_pos_pos()); distExtractors.add(rule_word_dist()); distExtractors.add(rule_pos_dist()); distExtractors.add(rule_word_dist_puncts()); distExtractors.add(rule_pos_dist_puncts()); distExtractors.add(rule_word_dist_verbs()); distExtractors.add(rule_pos_dist_verbs()); } /** Sets the alphabet. */ public void setAlphabet(Alphabet alphabet) { this.alphabet = alphabet; } /** Returns the features for the given sign and completeness flag. */ public FeatureVector extractFeatures(Sign sign, boolean complete) { addFeatures(sign, complete); return getFeatureMap(sign); } /** Recursively adds features to the feature map for the given sign, if not already present. */ protected void addFeatures(Sign sign, boolean complete) { // check for existing map, otherwise make one if (getFeatureMap(sign) != null) return; // lex case if (sign.isLexical()) { currentSign = sign; currentMap = new FeatureMap(); inc(lexExtractors); } // non-terminal else { Sign[] inputs = sign.getDerivationHistory().getInputs(); // first recurse for (Sign child : inputs) addFeatures(child, false); // use input maps in making current map currentSign = sign; currentInputs = inputs; if (inputs.length == 1) { currentMap = new FeatureMap(getFeatureMap(inputs[0])); inc(unaryRuleExtractors); } else if (inputs.length == 2) { currentMap = new FeatureMap(getFeatureMap(inputs[0]), getFeatureMap(inputs[1])); currentSibling = sibling(sign, inputs); inc(binaryRuleExtractors); // dist feats if (includeDistFeats) { currentWords = null; // get words and head indices lazily currentDistW = null; currentDistP = null; currentDistV = null; // also reset current distances inc(distExtractors); } } } // store it storeFeatureMap(sign); } /** Stores the current feature map as a data object in the given sign. */ protected void storeFeatureMap(Sign sign) { sign.addData(new FeatureMapWrapper(currentMap)); } /** Returns the feature map for this extractor from the given sign (null if none). */ protected FeatureMap getFeatureMap(Sign sign) { FeatureMapWrapper fmw = (FeatureMapWrapper)sign.getData(FeatureMapWrapper.class); return (fmw != null) ? fmw.featureMap : null; } /** * Increments the count of the given features, if relevant. */ protected void inc(List>> extractors) { for (List> lazyExtractor : extractors) { Alphabet.Feature f = alphabet.indexLazy(lazyExtractor); if (f != null) currentMap.inc(f); } } /** Returns the sibling sign from among the two inputs. */ protected Sign sibling(Sign sign, Sign[] inputs) { if (sign.getLexHead() == inputs[0].getLexHead()) return inputs[1]; else return inputs[0]; } /** Sets the current words, if null, along with head indices. 
*/ protected void setCurrentWords() { if (currentWords != null) return; currentWords = currentSign.getWords(); Word head = currentSign.getLexHead().getWords().get(0); Word sibHead = currentSibling.getLexHead().getWords().get(0); currentHeadIndex = find(currentWords, head); currentSibHeadIndex = find(currentWords, sibHead); } /** Returns the index of the given word in the list, or -1 if not found. */ protected int find(List words, Word word) { int len = words.size(); for (int i=0; i < len; i++) { if (words.get(i) == word) return i; } return -1; } /** Returns the distance in intervening words as 0w, 1w, 2w or 3w (for 3 or more). */ protected String distWords() { if (currentDistW != null) return currentDistW; setCurrentWords(); int dist = Math.abs(currentHeadIndex - currentSibHeadIndex) - 1; switch (dist) { case 0: return currentDistW = "0w"; case 1: return currentDistW = "1w"; case 2: return currentDistW = "2w"; default: return currentDistW = "3w"; } } /** Returns the distance in intervening punctuation marks as 0p, 1p, 2p or 3p (for 3 or more). */ protected String distPuncts() { if (currentDistP != null) return currentDistP; setCurrentWords(); int min = Math.min(currentHeadIndex, currentSibHeadIndex); int max = Math.max(currentHeadIndex, currentSibHeadIndex); int count = 0; for (int i=min+1; i < max; i++) { Word w = currentWords.get(i); if (isPunct(w)) count++; } switch (count) { case 0: return currentDistP = "0p"; case 1: return currentDistP = "1p"; case 2: return currentDistP = "2p"; default: return currentDistP = "3p"; } } /** * Returns whether a word is a punctuation mark that typically signals sentence-internal complexity. * The default implementation tests for commas, dashes (--), semi-colons and colons. */ protected boolean isPunct(Word word) { // NB: in principle could use POS, but sometimes punctuation marks seem to end up with IN as the POS tag String form = word.getForm(); return (form == "," || form == "--" || form == ";" || form == ":"); } /** Returns the distance in intervening verbs as 0v, 1v, or 2v (for 2 or more). */ protected String distVerbs() { if (currentDistV != null) return currentDistV; setCurrentWords(); int min = Math.min(currentHeadIndex, currentSibHeadIndex); int max = Math.max(currentHeadIndex, currentSibHeadIndex); int count = 0; for (int i=min+1; i < max; i++) { Word w = currentWords.get(i); if (isVerb(w)) count++; } switch (count) { case 0: return currentDistV = "0v"; case 1: return currentDistV = "1v"; default: return currentDistV = "2v"; } } /** * Returns whether a word is a verb. * The default implementation tests for a POS tag beginning with V. 
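* Like isPunct, this can be overridden in a subclass to suit a different tagset;
* for instance, a hypothetical extractor that also counts modals as verbs might
* look like this (sketch only, assuming an MD tag for modals):
* <pre>
* public class MyTagsetFeatureExtractor extends SyntacticFeatureExtractor {
*     protected boolean isVerb(Word word) {
*         String pos = word.getPOS();
*         return pos.startsWith("V") || pos.equals("MD");
*     }
* }
* </pre>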
*/ protected boolean isVerb(Word word) { String pos = word.getPOS(); return (pos.startsWith("V")); } // lex cat + word private List> lexcat_word() { List> retval = new ArrayList>(2); add_supertag(retval); retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSign.getWordForm(); }}); return retval; } // add prefix + supertag private void add_supertag(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return PREFIX; }}); retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSign.getSupertag(); }}); } // lex cat + pos private List> lexcat_pos() { List> retval = new ArrayList>(2); add_supertag(retval); retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSign.getPOS(); }}); return retval; } // rule private List> unary_rule() { List> retval = new ArrayList>(2); add_unary_rule(retval); return retval; } private void add_unary_rule(List> retval) { add_supertag(retval); retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentInputs[0].getSupertag(); }}); } private List> binary_rule() { List> retval = new ArrayList>(3); add_binary_rule(retval); return retval; } private void add_binary_rule(List> retval) { add_unary_rule(retval); retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentInputs[1].getSupertag(); }}); } // rule + head word private List> unary_rule_word() { List> retval = new ArrayList>(3); add_unary_rule_word(retval); return retval; } private void add_unary_rule_word(List> retval) { add_unary_rule(retval); add_lex_word(retval); } private void add_lex_word(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSign.getLexHead().getWordForm(); }}); } private List> binary_rule_word() { List> retval = new ArrayList>(4); add_binary_rule_word(retval); return retval; } private void add_binary_rule_word(List> retval) { add_binary_rule(retval); add_lex_word(retval); } // rule + head pos private List> unary_rule_pos() { List> retval = new ArrayList>(3); add_unary_rule_pos(retval); return retval; } private void add_unary_rule_pos(List> retval) { add_unary_rule(retval); add_lex_pos(retval); } private void add_lex_pos(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSign.getLexHead().getPOS(); }}); } private List> binary_rule_pos() { List> retval = new ArrayList>(4); add_binary_rule_pos(retval); return retval; } private void add_binary_rule_pos(List> retval) { add_binary_rule(retval); add_lex_pos(retval); } // rule + head word + sibling word private List> rule_word_word() { List> retval = new ArrayList>(5); add_binary_rule_word(retval); add_sibling_word(retval); return retval; } private void add_sibling_word(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSibling.getLexHead().getWordForm(); }}); } // rule + head word + sibling pos private List> rule_word_pos() { List> retval = new ArrayList>(5); add_binary_rule_word(retval); add_sibling_pos(retval); return retval; } private void add_sibling_pos(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return currentSibling.getLexHead().getPOS(); }}); } // rule + head pos + sibling word private List> rule_pos_word() { List> retval = new ArrayList>(5); add_binary_rule_pos(retval); add_sibling_word(retval); return retval; } // rule + head pos + sibling pos private List> rule_pos_pos() { List> retval = new ArrayList>(5); add_binary_rule_pos(retval); add_sibling_pos(retval); return 
retval; } // rule + head word + dist private List> rule_word_dist() { List> retval = new ArrayList>(5); add_binary_rule_word(retval); add_dist_words(retval); return retval; } private void add_dist_words(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return distWords(); }}); } // rule + head pos + dist private List> rule_pos_dist() { List> retval = new ArrayList>(5); add_binary_rule_pos(retval); add_dist_words(retval); return retval; } // rule + head word + dist in puncts private List> rule_word_dist_puncts() { List> retval = new ArrayList>(5); add_binary_rule_word(retval); add_dist_puncts(retval); return retval; } private void add_dist_puncts(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return distPuncts(); }}); } // rule + head pos + dist in puncts private List> rule_pos_dist_puncts() { List> retval = new ArrayList>(5); add_binary_rule_pos(retval); add_dist_puncts(retval); return retval; } // rule + head word + dist in verbs private List> rule_word_dist_verbs() { List> retval = new ArrayList>(5); add_binary_rule_word(retval); add_dist_verbs(retval); return retval; } private void add_dist_verbs(List> retval) { retval.add(new TrieMap.KeyExtractor(){public String getKey(){ return distVerbs(); }}); } // rule + head pos + dist in verbs private List> rule_pos_dist_verbs() { List> retval = new ArrayList>(5); add_binary_rule_pos(retval); add_dist_verbs(retval); return retval; } } ================================================ FILE: src/opennlp/ccg/synsem/TargetCat.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; /** * A category that can be a target of a curried cat. Basically, atomcats or * variable cats. * * @author Jason Baldridge * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:13 $ */ public interface TargetCat extends Category { } ================================================ FILE: src/opennlp/ccg/synsem/VarModality.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-7 Jason Baldridge and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.synsem; import java.io.Serializable; import gnu.trove.TObjectIntHashMap; import opennlp.ccg.unify.*; /** * A class for variables which can stand for slash modalities. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/07/17 04:23:30 $ **/ public class VarModality implements Variable, Indexed, Mutable, Modality, Serializable { private static final long serialVersionUID = 7465785777802095802L; protected final String _name; protected int _index; protected int _hashCode; private static int UNIQUE_STAMP = 0; public VarModality() { this("VM"+UNIQUE_STAMP++); } public VarModality(String name) { this(name, 0); } protected VarModality(String name, int index) { _name = name; _index = index; _hashCode = _name.hashCode() + _index; } public String name() { return _name; } public Object copy() { return new VarModality(_name, _index); } public void deepMap(ModFcn mf) { mf.modify(this); } public int getIndex() { return _index; } public void setIndex(int index) { _hashCode += index - _index; _index = index; } public boolean occurs(Variable var) { return equals(var); } public int hashCode() { return _hashCode; } public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof VarModality)) return false; VarModality vm = (VarModality) o; return _index == vm._index && _name.equals(vm._name); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { // see if this already in map if (varMap.containsKey(this)) return varMap.get(this); // otherwise add it int next = varMap.size() + 1; varMap.put(this, next); return next; } /** * Returns whether this var equals the given object up to variable names, * using the given maps from vars to ints. 
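* For instance, two variables with different names count as equal here as long as
* they have been assigned the same index in their respective maps (sketch):
* <pre>
* VarModality m1 = new VarModality("M1");
* VarModality m2 = new VarModality("M2");
* TObjectIntHashMap map1 = new TObjectIntHashMap();
* TObjectIntHashMap map2 = new TObjectIntHashMap();
* m1.hashCode(map1); // assigns m1 the next free index (1) in map1
* m2.hashCode(map2); // likewise assigns m2 index 1 in map2
* boolean equalUpToNames = m1.equals(m2, map1, map2); // true
* </pre>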
*/ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (this == obj) return true; if (obj.getClass() != this.getClass()) { return false; } VarModality vm = (VarModality) obj; if (varMap.get(this) != varMap2.get(vm)) return false; return true; } public void unifyCheck(Object o) throws UnifyFailure { if (!(o instanceof SlashMode || o instanceof VarModality)) { throw new UnifyFailure(); } } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (u instanceof SlashMode) { return sub.makeSubstitution(this, u); } else if (u instanceof VarModality) { VarModality var2 = (VarModality)u; Variable $var = new VarModality(_name+var2._name, UnifyControl.getUniqueVarIndex()); sub.makeSubstitution(this, $var); sub.makeSubstitution(var2, $var); return $var; } else { throw new UnifyFailure(); } } public Object fill(Substitution sub) throws UnifyFailure { Object val = sub.getValue(this); if (val != null) { return val; } else { return this; } } public byte getDirection() { return Slash.B; } public String toString(byte dir) { return toString(); } public String toString() { return _name; } public String toTeX(byte dir) { return toTeX(); } public String toTeX() { return _name; } } ================================================ FILE: src/opennlp/ccg/test/CrossValidateRealizer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; import opennlp.ccg.grammar.*; import opennlp.ccg.realize.*; import opennlp.ccg.synsem.SignScorer; import opennlp.ccg.ngrams.*; import java.io.*; import java.net.*; import java.util.*; //import java.util.prefs.*; //import java.text.NumberFormat; import org.jdom.*; import org.jdom.input.*; /** * Runs cross-validation tests with the realizer. * * @author Michael White * @version $Revision: 1.30 $, $Date: 2011/03/20 20:11:58 $ */ public class CrossValidateRealizer implements ScorerMaker { /** The tester to use. */ public Regression tester = null; /** The path to the tmp dir. */ public String tmpdir = "tmp"; // the actual tmp dir private File tmpDir = null; /** The number of cross-validation folds, either 1.x or an int of at least 2. */ public double numFolds = 10; /** The scorer maker, for preparing and loading scoring models. */ public ScorerMaker scorerMaker = this; /** The pruning strategy, if any. */ public PruningStrategy pruningStrategy = null; /** Sets up the tester with the given grammar. 
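* A programmatic run mirrors the command-line routine in main: construct the
* cross-validator from a grammar URL, set up the folds, prepare the scorers,
* then run the test (sketch; the file names are just main's defaults):
* <pre>
* URL grammarURL = new File("grammar.xml").toURI().toURL();
* URL testbedURL = new File("testbed.xml").toURI().toURL();
* CrossValidateRealizer cvr = new CrossValidateRealizer(grammarURL);
* cvr.numFolds = 10; // the default
* cvr.setupInputs(testbedURL);
* cvr.prepScorers();
* cvr.runTest();
* </pre>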
*/ public CrossValidateRealizer(URL grammarURL) throws IOException { // init tester tester = new Regression(); // load grammar System.out.println("Loading grammar from URL: " + grammarURL); tester.grammar = new Grammar(grammarURL); System.out.println(); } /** Sets up the folds in tmpdir. */ @SuppressWarnings({ "unchecked", "rawtypes" }) public void setupInputs(URL testbedURL) throws IOException { // ensure tmpdir exists tmpDir = new File(tmpdir); if (!tmpDir.exists()) { tmpDir.mkdirs(); } try { // load items System.out.println("Loading testbed from URL: " + testbedURL); SAXBuilder builder = new SAXBuilder(); Document inputDoc = builder.build(testbedURL.openStream()); System.out.println(); Element inputRoot = inputDoc.getRootElement(); List inputItems = inputRoot.getChildren("item"); System.out.println("Setting up inputs in tmpdir: " + tmpdir); // make, save shuffled doc Document shuffledDoc = new Document(); Element shuffledRoot = new Element("regression"); shuffledDoc.setRootElement(shuffledRoot); Random rand = new Random(); while (inputItems.size() > 0) { Element rItem = (Element) inputItems.remove(rand.nextInt(inputItems.size())); shuffledRoot.addContent(rItem); } FileOutputStream shuffledOut = new FileOutputStream(new File(tmpDir, "shuffled.xml")); tester.grammar.serializeXml(shuffledDoc, shuffledOut); shuffledOut.close(); List shuffledItems = shuffledRoot.getChildren("item"); int numItems = shuffledItems.size(); Element[] shuffledItemsArray = new Element[numItems]; shuffledItems.toArray(shuffledItemsArray); // need a non-live listing // get LF listing Element[] shuffledLFsArray = new Element[numItems]; for (int i = 0; i < numItems; i++) { Element item = shuffledItemsArray[i]; shuffledLFsArray[i] = item.getChild("lf"); // reduce content to just full-words (if present) Element fullWords = item.getChild("full-words"); item.setContent((List)null); if (fullWords != null) item.addContent(fullWords); } // make folds docs, leaving LFs out of training, and // removing any exact duplicates with test items int itemsPerFold = (int) Math.floor(numItems / (numFolds * 1.0)); Set testStrings = new HashSet(); int dups = 0; for (int i = 0; i < numFolds; i++) { testStrings.clear(); dups = 0; int foldStart = i * itemsPerFold; int foldLimit = (i < numFolds - 1) ? 
foldStart + itemsPerFold : numItems; Document testDoc = new Document(); Element testRoot = new Element("regression"); testDoc.setRootElement(testRoot); Document trainDoc = new Document(); Element trainRoot = new Element("regression"); trainDoc.setRootElement(trainRoot); // split items into train/test for (int j = 0; j < numItems; j++) { Element item = shuffledItemsArray[j]; item.detach(); Element lf = shuffledLFsArray[j]; if (foldStart <= j && j < foldLimit) { testRoot.addContent(item); item.addContent(lf); testStrings.add(item.getAttributeValue("string")); } else { // special case for 1.x folds: limit training data // to first numItems - itemsPerFold items if (numFolds < 2 && i == 1 && j >= (numItems - itemsPerFold)) continue; trainRoot.addContent(item); } } // check for dups in training items List trainingItems = trainRoot.getChildren("item"); for (Iterator it = trainingItems.iterator(); it.hasNext(); ) { Element item = (Element) it.next(); if (testStrings.contains(item.getAttributeValue("string"))) { it.remove(); dups++; } } if (dups > 0) { System.out.println("Removing " + dups + " test item duplicate(s) from training, fold " + i); } FileOutputStream testOut = new FileOutputStream(new File(tmpDir, testFileName(i))); tester.grammar.serializeXml(testDoc, testOut); testOut.close(); FileOutputStream trainOut = new FileOutputStream(new File(tmpDir, trainingFileName(i))); tester.grammar.serializeXml(trainDoc, trainOut); trainOut.close(); } System.out.println(); } catch (JDOMException exc) { throw (IOException) new IOException().initCause(exc); } } // training/test file names private String trainingFileName(int foldNum) { return "fold" + foldNum + "-train.xml"; } private String testFileName(int foldNum) { return "fold" + foldNum + "-test.xml"; } /** Does scorer prep (if any) on the folds already set-up in tmpdir. */ public void prepScorers() throws IOException { // ensure tmpDir set if (tmpDir == null) tmpDir = new File(tmpdir); System.out.println("Preparing scorers in tmpdir: " + tmpdir); System.out.println(); // do each fold for (int i = 0; i < numFolds; i++) { // make training/test files for fold File trainFile = new File(tmpDir, trainingFileName(i)); File testFile = new File(tmpDir, testFileName(i)); // prep scorer scorerMaker.prepScorer(tmpDir, i, trainFile, testFile); } // summary scorerMaker.prepScorersSummary(tmpDir); } /** Default, do-nothing implementation of ScorerMaker.setCVR. */ public void setCVR(CrossValidateRealizer cvr) {} /** Default, do-nothing implementation of ScorerMaker.prepScorer. */ public void prepScorer(File tmpDir, int foldNum, File trainFile, File testFile) throws IOException {} /** Default, do-nothing implementation of ScorerMaker.prepScorersSummary. */ public void prepScorersSummary(File tmpDir) throws IOException {} /** * Default implementation of ScorerMaker.loadScorer. * Loads an n-gram precision model with semantic class replacement, * using targets from the training data. */ public SignScorer loadScorer(File tmpDir, int foldNum, File trainFile) throws IOException { RegressionInfo trainingItems = new RegressionInfo(tester.grammar, trainFile); String[] targets = new String[trainingItems.numberOfItems()]; for (int i=0; i < trainingItems.numberOfItems(); i++) { targets[i] = trainingItems.getItem(i).sentence; } NgramPrecisionModel retval = (tester.ngramOrder > 0) ? new NgramPrecisionModel(targets, tester.ngramOrder, true) : new NgramPrecisionModel(targets, true); return retval; } /** Run the cross-validation test on the folds already set-up in tmpdir. 
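* The same setup/prep/test cycle is normally driven from the command line via the
* main routine below, e.g. (illustrative invocation; the grammar, testbed and tmp
* directory names are placeholders, and the OpenCCG jars are assumed to be on the
* classpath):
* <pre>
* java opennlp.ccg.test.CrossValidateRealizer -folds 10 -tmp tmp -g grammar.xml testbed.xml
* </pre>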
*/ public void runTest() throws IOException { // ensure tmpDir set if (tmpDir == null) tmpDir = new File(tmpdir); // turn-off parsing, stats tester.doParsing = false; tester.showStats = false; // setup realizer tester.realizer = new Realizer(tester.grammar); if (pruningStrategy != null) { tester.realizer.pruningStrategy = pruningStrategy; } // show realizer settings Regression.showRealizerSettings(); // do each fold for (int i = 0; i < numFolds; i++) { // make files for fold File trainFile = new File(tmpDir, trainingFileName(i)); File testFile = new File(tmpDir, testFileName(i)); // load scorer tester.scorer = scorerMaker.loadScorer(tmpDir, i, trainFile); // run test tester.runTest(testFile); } // show stats tester.showStats(); } /** Command-line routine for cross-validating realizer. */ public static void main(String[] args) throws IOException { String usage = "java opennlp.ccg.test.CrossValidateRealizer " + "(-folds N) (-tmp ) " + "(-setuponly) (-skipsetup) " + "(-preponly) (-skipprep) " + "(-ngramorder N) " + "(-scorermaker ) " + "(-pruningstrategy ) " + "(-g ) (-s ) ()"; if (args.length > 0 && args[0].equals("-h")) { System.out.println("Usage: " + usage); System.exit(0); } // args double numFolds = 0; String tmpdir = null; boolean setupOnly = false; boolean skipSetup = false; boolean prepOnly = false; boolean skipPrep = false; int ngramOrder = 0; String scorerMakerClass = null; String pruningStrategyClass = null; String grammarfile = "grammar.xml"; String testbedfile = "testbed.xml"; String statsfile = null; for (int i = 0; i < args.length; i++) { if (args[i].equals("-folds")) { numFolds = Double.parseDouble(args[++i]); if (numFolds < 1 || (numFolds >= 2 && numFolds != Math.round(numFolds))) { System.out.println("Error, folds must be 1.x or an int of at least 2"); System.exit(-1); } continue; } if (args[i].equals("-tmp")) { tmpdir = args[++i]; continue; } if (args[i].equals("-setuponly")) { setupOnly = true; continue; } if (args[i].equals("-skipsetup")) { skipSetup = true; continue; } if (args[i].equals("-preponly")) { prepOnly = true; continue; } if (args[i].equals("-skipprep")) { skipPrep = true; continue; } if (args[i].equals("-ngramorder")) { ngramOrder = Integer.parseInt(args[++i]); continue; } if (args[i].equals("-scorermaker")) { scorerMakerClass = args[++i]; continue; } if (args[i].equals("-pruningstrategy")) { pruningStrategyClass = args[++i]; continue; } if (args[i].equals("-g")) { grammarfile = args[++i]; continue; } if (args[i].equals("-s")) { statsfile = args[++i]; continue; } testbedfile = args[i]; } // make cross-validator URL grammarURL = new File(grammarfile).toURI().toURL(); CrossValidateRealizer cvr = new CrossValidateRealizer(grammarURL); if (numFolds > 0) cvr.numFolds = numFolds; if (tmpdir != null) cvr.tmpdir = tmpdir; if (ngramOrder > 0) cvr.tester.ngramOrder = ngramOrder; if (scorerMakerClass != null) { try { cvr.scorerMaker = (ScorerMaker) Class.forName(scorerMakerClass).newInstance(); cvr.scorerMaker.setCVR(cvr); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } if (pruningStrategyClass != null) { try { cvr.pruningStrategy = (PruningStrategy) Class.forName(pruningStrategyClass).newInstance(); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } if (statsfile != null) cvr.tester.statsfile = statsfile; // set-up inputs URL testbedURL = new File(testbedfile).toURI().toURL(); if (!skipSetup) { cvr.setupInputs(testbedURL); } if (setupOnly) { System.exit(0); } // prep scorers if 
(!skipPrep) { cvr.prepScorers(); } if (prepOnly) { System.exit(0); } // run test System.gc(); cvr.runTest(); } } ================================================ FILE: src/opennlp/ccg/test/DerivMaker.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2016 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; //import java.util.*; import org.jdom.*; import opennlp.ccg.synsem.*; /** * Utility class for exporting derivations in xml. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/12/21 04:18:31 $ */ public class DerivMaker { /** * Returns a derivation in the same format as the converted CCGbank, * but just with the lexemes, POS tags and supertags. */ public static Element makeDeriv(Sign sign) { Element retval; if (sign.isLexical()) { retval = new Element("Leafnode"); retval.setAttribute("lexeme",sign.getOrthography()); retval.setAttribute("pos",sign.getPOS()); } else { retval = new Element("Treenode"); for (Sign child: sign.getDerivationHistory().getInputs()) retval.addContent(makeDeriv(child)); } retval.setAttribute("stag",sign.getSupertag()); return retval; } } ================================================ FILE: src/opennlp/ccg/test/GenTargets.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; import opennlp.ccg.realize.*; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import opennlp.ccg.ngrams.*; import org.jdom.*; import java.io.*; import java.net.*; import java.util.*; /** * Generates initial target sentences from a list of logical forms. * The input is an XML file with a list of top-level LF elements. 
* The output is a testbed file, with the number of parses just * set to 1 (rather than actually being computed). * The best realization is determined using n-grams from an * existing testbed file. * * @author Michael White * @version $Revision: 1.9 $, $Date: 2009/12/21 04:18:31 $ */ public class GenTargets { /** The grammar. */ private Grammar grammar; /** The realizer instance. */ private Realizer realizer; /** The n-gram scorer. */ private NgramScorer ngramScorer; /** The unique target strings. */ private Set uniqueTargets = new HashSet(); /** * Constructor: loads grammar, instantiates realizer, and sets up n-gram scorer * using targets with sem class replacement from the testbed. */ private GenTargets(URL grammarURL, File regressionFile) throws IOException { // load grammar System.out.println("Loading grammar from: " + grammarURL); grammar = new Grammar(grammarURL); // set up n-gram scorer System.out.println("Loading target phrases from: " + regressionFile); RegressionInfo rinfo = new RegressionInfo(grammar, regressionFile); String[] targets = new String[rinfo.numberOfItems()]; for (int i=0; i < targets.length; i++) { String target = rinfo.getItem(i).sentence; targets[i] = target; } // use targets with sem class replacement ngramScorer = new NgramPrecisionModel(targets, true); // instantiate realizer realizer = new Realizer(grammar); } // does realization, adds test case private void realize(Element lfElt, Element outRoot) throws IOException { // get LF LF lf = Realizer.getLfFromElt(lfElt); // run request realizer.realize(lf, ngramScorer); Chart chart = realizer.getChart(); // make test item (w/o trying to figure out the correct number of parses) String target = chart.bestEdge.getSign().getOrthography(); if (uniqueTargets.contains(target)) { System.out.println("Duplicate realization: " + target); return; } uniqueTargets.add(target); System.out.println("Best realization: " + target); Element testElt = RegressionInfo.makeTestItem(grammar, target, 1, lf); // add to output outRoot.addContent(testElt); if (!chart.bestEdge.complete()) { System.out.println("NB: realization incomplete!"); testElt.setAttribute("complete", "false"); } } /** Creates generator and runs it on the given input file. 
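* An illustrative invocation (the input and output file names are placeholders;
* -g and -tb default to grammar.xml and testbed.xml, and the OpenCCG jars are
* assumed to be on the classpath):
* <pre>
* java opennlp.ccg.test.GenTargets -g grammar.xml -tb testbed.xml lfs.xml targets.xml
* </pre>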
*/ @SuppressWarnings("unchecked") public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.test.GenTargets (-g ) (-tb ) "; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } // args String grammarfile = "grammar.xml"; String testbedfile = "testbed.xml"; String inputfile = null; String outputfile = null; for (int i = 0; i < args.length; i++) { if (args[i].equals("-g")) { grammarfile = args[++i]; continue; } if (args[i].equals("-tb")) { testbedfile = args[++i]; continue; } if (inputfile == null) { inputfile = args[i]; continue; } if (outputfile == null) { outputfile = args[i]; continue; } } if (inputfile == null || outputfile == null) { System.out.println(usage); System.exit(0); } // create Generator File gFile = new File(grammarfile); URL grammarURL = gFile.toURI().toURL(); File tbFile = new File(testbedfile); if (!tbFile.exists()) { tbFile = new File(gFile.getParentFile(), testbedfile); } GenTargets gen = new GenTargets(grammarURL, tbFile); // load input LFs System.out.println("Loading LFs from: " + inputfile); Document doc = gen.grammar.loadFromXml(inputfile); // create output doc Document outDoc = new Document(); Element outRoot = new Element("regression"); outDoc.setRootElement(outRoot); // realize each one System.out.println("Realizing LFs ..."); Element root = doc.getRootElement(); List lfElts = root.getChildren("lf"); for (int i = 0; i < lfElts.size(); i++) { Element lfElt = (Element) lfElts.get(i); try { gen.realize(lfElt, outRoot); } catch (Exception exc) { System.out.println("Warning: unable to realize LF " + i + ": " + exc); } } // save file System.out.println("Saving results to: " + outputfile); FileOutputStream out = new FileOutputStream(outputfile); gen.grammar.serializeXml(outDoc, out); out.close(); System.out.println("Done."); } } ================================================ FILE: src/opennlp/ccg/test/Regression.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-9 Jason Baldridge and Michael White (University of Edinburgh / The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; import java.io.*; import java.net.URL; import java.text.*; import java.util.*; import java.util.prefs.Preferences; import opennlp.ccg.TextCCG; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.hylo.*; import opennlp.ccg.lexicon.Tokenizer; import opennlp.ccg.lexicon.Word; import opennlp.ccg.ngrams.*; import opennlp.ccg.parse.ParseException; import opennlp.ccg.parse.Parser; import opennlp.ccg.parse.Supertagger; import opennlp.ccg.parse.supertagger.WordAndPOSDictionaryLabellingStrategy; import opennlp.ccg.realize.*; import opennlp.ccg.realize.hypertagger.ZLMaxentHypertagger; import opennlp.ccg.synsem.*; import opennlp.ccg.util.Pair; import opennlp.ccg.util.SingletonList; import opennlp.ccg.perceptron.*; import org.jdom.*; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; /** * Automates the testing of CCG grammars. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.151 $, $Date: 2011/12/11 16:51:58 $ */ public class Regression { /** Flag for whether to do parsing. */ public boolean doParsing = true; /** Flag for whether to do realization. */ public boolean doRealization = true; /** Flag for whether to just do even items. */ public boolean evenOnly = false; /** Flag for whether to just do odd items. */ public boolean oddOnly = false; /** Flag for whether to do garbage collection before each iteration. */ public boolean doGC = false; /** File to write events to (if any). */ public String eventfile = null; /** Flag for whether to include the gold sign when generating events. */ public boolean includeGoldInEvents = false; /** Directory for writing APML files (if any). */ public String apmldir = null; /** Flag for whether to show realization stats. */ public boolean showStats = true; /** Flag for whether to show parsing stats. */ public boolean showParseStats = false; /** File to dump realizer stats to (if any). */ public String statsfile = null; /** File prefix to write bleu test files to (if any). */ public String bleufileprefix = null; /** File to write n-best realizations to (if any). */ public String nbestrealfile = null; /** Flag for whether to normalize strings as for BLEU scoring in n-best output. */ public boolean nbestnormbleu = false; /** Directory to save best realization serializations to (if any). */ public String realserdir = null; /** File to write rescored sign scores to (if any). */ public String rescorefile = null; /** Map from info keys to best realization signs for serialization (if any). */ public Map bestRealMap = null; /** Flag for whether to include LFs in n-best output. */ public boolean nbestincludelfs = false; /** File to write n-best parses to (if any). */ public String nbestparsefile = null; /** The grammar to use for testing. */ public Grammar grammar = null; /** The parser to use for testing. */ public Parser parser = null; /** The realizer to use for testing. */ public Realizer realizer = null; /** The scorer to use for realizer testing (or null, for default). */ public SignScorer scorer = null; /** The scorer to use for parser testing. */ public SignScorer parseScorer = null; /** Flag for whether to only allow exact matches with the default scorer. */ public boolean exactMatches = false; /** The n-gram order to use with the default scorer (or 0, for default). */ public int ngramOrder = 0; /** The feature extractor to use in extracting events. 
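* For event extraction this is typically paired with an event file, e.g.
* (sketch; the output path is a placeholder, and the grammar, parser and
* realizer fields above must still be set before calling runTest):
* <pre>
* Regression tester = new Regression();
* tester.eventfile = "events.txt";
* tester.featureExtractor = new SyntacticFeatureExtractor();
* </pre>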
*/ public FeatureExtractor featureExtractor = null; // // the various totals // public int pCount = 0; public int pBadCount = 0; public int pFailedCount = 0; public int pExactCount = 0; public double totalF = 0.0; public double totalRecall = 0.0; public double totalPrecision = 0.0; public double totalDepsF = 0.0; public double totalDepsRecall = 0.0; public double totalDepsPrecision = 0.0; public double totalUnlabeledDepsF = 0.0; public double totalUnlabeledDepsRecall = 0.0; public double totalUnlabeledDepsPrecision = 0.0; public double totalFComplete = 0.0; public double totalRecallComplete = 0.0; public double totalPrecisionComplete = 0.0; public double totalDepsFComplete = 0.0; public double totalDepsRecallComplete = 0.0; public double totalDepsPrecisionComplete = 0.0; public double totalUnlabeledDepsFComplete = 0.0; public double totalUnlabeledDepsRecallComplete = 0.0; public double totalUnlabeledDepsPrecisionComplete = 0.0; public int pTotalEdges = 0; public int pTotalEdgesGood = 0; public int pMaxEdges = 0; public int pMaxEdgesGood = 0; public int pTotalUnpackingEdges = 0; public int pMaxUnpackingEdges = 0; public int pTotalCellMax = 0; public int pTotalCellMaxGood = 0; public int pMaxCellMax = 0; public int pMaxCellMaxGood = 0; public int pTotalLexTime = 0; public int pTotalParseTime = 0; public int pTotalChartTime = 0; public int pTotalUnpackingTime = 0; public int pMaxLexTime = 0; public int pMaxParseTime = 0; public int pMaxChartTime = 0; public int pMaxUnpackingTime = 0; public Map pBetaTallies = null; public int rCount = 0; public int rDoneCount = 0; public int rBadCount = 0; public int rExactCount = 0; public double totalScore = 0.0; public double totalScoreComplete = 0.0; public double totalReciprocalRank = 0.0; public int totalNominals = 0; public int totalTokens = 0; public int minTokens = 0; public int maxTokens = 0; public int totalRuleApps = 0; public int totalEdges = 0; public int totalEdgesCreated = 0; public int totalUnprunedEdges = 0; public int totalPrunedRemoved = 0; public int totalPrunedNeverAdded = 0; public int totalCellMax = 0; public int totalNewBest = 0; public int totalLex = 0; public int totalFirst = 0; public int totalBest = 0; public int totalPacked = 0; public int totalStoppedOrDone = 0; public int maxLex = 0; public int maxFirst = 0; public int maxBest = 0; public int maxNewBest = 0; public int maxPacked = 0; public int maxStoppedOrDone = 0; public int oracleBetter = 0; public int goldMissing = 0; public String maxLexStr = null; public String maxFirstStr = null; public String maxBestStr = null; public String maxNewBestStr = null; public String maxPackedStr = null; public String maxStoppedOrDoneStr = null; public List bestEstimatedScores = null; public List bestActualScores = null; public List itemRanks = null; public TimingMap lexMap = null; public TimingMap firstMap = null; public TimingMap bestMap = null; public TimingMap allMap = null; private PrintWriter events = null; private PrintWriter bleuGen = null; private PrintWriter bleuRef = null; private PrintWriter bleuSrc = null; private PrintWriter nbestrealPW = null; private PrintWriter rescorePW = null; private PrintWriter nbestparsePW = null; private XMLOutputter xmlOutputter = new XMLOutputter(); // for xml-escaping strings /** Constructor. */ public Regression() { // init resetTotals(); } /** Resets the various totals. 
*/ public void resetTotals() { // parser pCount = 0; pBadCount = 0; pFailedCount = 0; pExactCount = 0; totalF = 0.0; totalRecall = 0.0; totalPrecision = 0.0; totalDepsF = 0.0; totalDepsRecall = 0.0; totalDepsPrecision = 0.0; totalUnlabeledDepsF = 0.0; totalUnlabeledDepsRecall = 0.0; totalUnlabeledDepsPrecision = 0.0; totalFComplete = 0.0; totalRecallComplete = 0.0; totalPrecisionComplete = 0.0; totalDepsFComplete = 0.0; totalDepsRecallComplete = 0.0; totalDepsPrecisionComplete = 0.0; totalUnlabeledDepsFComplete = 0.0; totalUnlabeledDepsRecallComplete = 0.0; totalUnlabeledDepsPrecisionComplete = 0.0; pTotalEdges = 0; pTotalEdgesGood = 0; pMaxEdges = 0; pMaxEdgesGood = 0; pTotalUnpackingEdges = 0; pMaxUnpackingEdges = 0; pTotalCellMax = 0; pTotalCellMaxGood = 0; pMaxCellMax = 0; pMaxCellMaxGood = 0; pTotalLexTime = 0; pTotalParseTime = 0; pTotalChartTime = 0; pTotalUnpackingTime = 0; pMaxLexTime = 0; pMaxParseTime = 0; pMaxChartTime = 0; pMaxUnpackingTime = 0; if (doParsing) { pBetaTallies = new TreeMap(); } // realizer rCount = 0; rDoneCount = 0; rBadCount = 0; rExactCount = 0; totalScore = 0.0; totalScoreComplete = 0.0; totalReciprocalRank = 0.0; totalNominals = 0; totalTokens = 0; minTokens = 0; maxTokens = 0; totalRuleApps = 0; totalEdges = 0; totalEdgesCreated = 0; totalUnprunedEdges = 0; totalPrunedRemoved = 0; totalPrunedNeverAdded = 0; totalCellMax = 0; totalNewBest = 0; totalLex = 0; totalFirst = 0; totalBest = 0; totalPacked = 0; totalStoppedOrDone = 0; maxLex = 0; maxFirst = 0; maxBest = 0; maxNewBest = 0; maxPacked = 0; maxStoppedOrDone = 0; oracleBetter = 0; goldMissing = 0; maxLexStr = null; maxFirstStr = null; maxBestStr = null; maxNewBestStr = null; maxPackedStr = null; maxStoppedOrDoneStr = null; if (doRealization) { bestActualScores = new ArrayList(); bestEstimatedScores = new ArrayList(); itemRanks = new ArrayList(); lexMap = new TimingMap("lex"); firstMap = new TimingMap("first"); bestMap = new TimingMap("best"); allMap = new TimingMap("all"); } } // sets up bleu output private void bleuSetup() throws IOException { // setup bleu files, if apropos if (bleufileprefix != null && doRealization) { bleuGen = new PrintWriter(new BufferedWriter(new FileWriter(bleufileprefix + "-gen.sgm"))); bleuRef = new PrintWriter(new BufferedWriter(new FileWriter(bleufileprefix + "-ref.sgm"))); bleuSrc = new PrintWriter(new BufferedWriter(new FileWriter(bleufileprefix + "-src.sgm"))); bleuGen.println(""); bleuRef.println(""); bleuSrc.println(""); } } // sets up n-best realization output private void nbestrealSetup() throws IOException { // set up file to write ref & n-best realizations (if any) if (nbestrealfile != null && doRealization) { nbestrealPW = new PrintWriter(new BufferedWriter(new FileWriter(nbestrealfile))); nbestrealPW.println(""); } } // sets up rescored sign score output private void rescoreSetup() throws IOException { // set up rescoring file if (rescorefile != null) { rescorePW = new PrintWriter(new BufferedWriter(new FileWriter(rescorefile))); rescorePW.println(""); } } // sets up n-best parsing output private void nbestparseSetup() throws IOException { // set up file to write sentence & n-best parses (if any) if (nbestparsefile != null && doParsing) { nbestparsePW = new PrintWriter(new BufferedWriter(new FileWriter(nbestparsefile))); nbestparsePW.println(""); } } // starts a doc private void bleuStartDoc(String id) { if (bleufileprefix != null && doRealization) { bleuGen.println(""); bleuRef.println(""); bleuSrc.println(""); } } // ends a doc private void bleuEndDoc() { if 
(bleufileprefix != null && doRealization) { bleuGen.println(""); bleuRef.println(""); bleuSrc.println(""); } } // finishes bleu output private void bleuFinish() throws IOException { // finish bleu files, if apropos if (bleufileprefix != null && doRealization) { bleuGen.println(""); bleuRef.println(""); bleuSrc.println(""); bleuGen.flush(); bleuGen.close(); bleuRef.flush(); bleuRef.close(); bleuSrc.flush(); bleuSrc.close(); } } // finishes n-best realization output private void nbestrealFinish() throws IOException { // finish n-best real file, if apropos if (nbestrealfile != null && doRealization) { nbestrealPW.println(""); nbestrealPW.flush(); nbestrealPW.close(); } } // finishes rescored sign output private void rescoreFinish() throws IOException { // finish rescoring file if (rescorefile != null) { rescorePW.println(""); rescorePW.flush(); rescorePW.close(); } } // finishes n-best parsing output private void nbestparseFinish() throws IOException { // finish n-best real file, if apropos if (nbestparsefile != null && doParsing) { nbestparsePW.println(""); nbestparsePW.flush(); nbestparsePW.close(); } } // resets bestRealMap private void realserStartDoc() { if (realserdir != null && doRealization) { bestRealMap = new HashMap(); } } // serializes bestRealMap private void realserEndDoc(String testName) throws IOException { if (realserdir != null && doRealization) { File serFile = new File(new File(realserdir), testName + ".ser"); ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(serFile)); oos.writeObject(bestRealMap); oos.close(); } } // escapes string for xml output private String xmlEscape(String s) { return xmlOutputter.outputString(new Text(s)); } // normalizes realizations for BLEU scoring // at present, this means replacing underscores with spaces and escaping for xml output private String norm_bleu(String s) { return xmlEscape(s.replace('_', ' ')); } /** Runs the test on the items in the given file or directory of files. */ public void runTest(File regressionFile) throws IOException { // set up event file (if any) if (eventfile != null) events = EventFile.openWriter(new File(eventfile)); // set up bleu output, n-best realizations, rescoring, n-best parses (if apropos) bleuSetup(); nbestrealSetup(); rescoreSetup(); nbestparseSetup(); // do each file or files for (File f : getXMLFiles(regressionFile)) runSingleTest(f); // finish bleu, n-best realization output, rescoring, n-best parses (if apropos) bleuFinish(); nbestrealFinish(); rescoreFinish(); nbestparseFinish(); // close event file (if any) if (events != null) { events.flush(); events.close(); } // show stats (if apropos) if (rescorefile != null) return; if (doParsing && showParseStats) showParseStats(); if (doRealization && showStats) showStats(); } /** Returns a list of xml files from the given file or directory. */ public static List getXMLFiles(File file) { if (!file.isDirectory()) return new SingletonList(file); List retval = new ArrayList(); File[] files = file.listFiles(); Arrays.sort(files); for (int i = 0; i < files.length; i++) { if (!files[i].isDirectory() && files[i].getName().endsWith(".xml")) retval.add(files[i]); } return retval; } /** Runs the test on the items in the given file. 
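With the -even option, items at odd (0-based) positions are skipped; with -odd, items at even positions are skipped.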
*/ private void runSingleTest(File regressionFile) throws IOException { String testName = regressionFile.getName(); int lastDot = testName.lastIndexOf('.'); if (lastDot > 0) testName = testName.substring(0, lastDot); // load testfile System.out.println("Loading: " + testName); System.out.println(); RegressionInfo rinfo = new RegressionInfo(grammar, regressionFile); // start bleu doc (if apropos) bleuStartDoc(testName); // start storing best realizations for serialization (if apropos) realserStartDoc(); // do each test int numItems = rinfo.numberOfItems(); System.out.println("Parse\tRealize\tString"); System.out.println("-----\t-------\t------"); for (int i=0; i < numItems; i++) { // check even/odd only if (i % 2 == 1 && evenOnly) continue; if (i % 2 == 0 && oddOnly) continue; RegressionInfo.TestItem testItem = rinfo.getItem(i); if (doGC) System.gc(); // short circuit for sign rescoring; realization only at the moment if (rescorefile != null) { String id = testItem.info; double score = scorer.score(testItem.sign, true); rescorePW.println(""); showOutcome("-", nfE.format(score), "", testItem.sign.getOrthography()); continue; } List parses = null; List parseScores = null; LF parsedLF = null; LF compactedLF = null; LF transformedParsedLF = null; boolean parsed = false; boolean parsedComplete = false; if (doParsing) { try { // use full-words or words from stored sign if possible List words = null; if (testItem.fullWords != null) { words = grammar.lexicon.tokenizer.tokenize(testItem.fullWords, true); // strip and if (words.get(0).getForm() == "") words.remove(0); if (words.get(words.size()-1).getForm() == "") words.remove(words.size()-1); } else if (testItem.sign != null) { words = testItem.sign.getWords(); } if (words != null) { // parse 'em parser.parse(words); } else { parser.parse(testItem.sentence); } // retrieve results parses = parser.getResult(); parseScores = parser.getScores(); parsed = true; parsedComplete = !parses.get(0).getCategory().isFragment(); // get LF of best parse, if needed if (showParseStats || (doRealization && testItem.lfElt == null && testItem.sign == null)) { Sign sign = parses.get(0); Category cat = sign.getCategory().copy(); Nominal index = cat.getIndexNominal(); parsedLF = cat.getLF(); index = HyloHelper.convertNominals(parsedLF, sign, index); compactedLF = HyloHelper.compact(parsedLF, index); // get transformed version if needed if (testItem.sign == null) { transformedParsedLF = grammar.transformLF(compactedLF); } } } catch (ParseException e) { parses = Collections.emptyList(); parsed = false; } catch (Exception e) { parses = Collections.emptyList(); parsed = false; System.err.println("Uncaught exception in parsing: " + testItem.sentence); e.printStackTrace(System.err); } // update parse stats int count = parser.edgeCount(); pTotalEdges += count; if (count > pMaxEdges) pMaxEdges = count; if (parsedComplete) { pTotalEdgesGood += count; if (count > pMaxEdgesGood) pMaxEdgesGood = count; } count = parser.unpackingEdgeCount(); pTotalUnpackingEdges += count; if (count > pMaxUnpackingEdges) pMaxUnpackingEdges = count; int cellMax = parser.maxCellSize(); pTotalCellMax += cellMax; if (cellMax > pMaxCellMax) pMaxCellMax = cellMax; if (parsedComplete) { pTotalCellMaxGood += cellMax; if (cellMax > pMaxCellMaxGood) pMaxCellMaxGood = cellMax; } int time = parser.getLexTime(); pTotalLexTime += time; if (time > pMaxLexTime) pMaxLexTime = time; time = parser.getParseTime(); pTotalParseTime += time; if (time > pMaxParseTime) pMaxParseTime = time; time = parser.getChartTime(); 
pTotalChartTime += time; if (time > pMaxChartTime) pMaxChartTime = time; time = parser.getUnpackingTime(); pTotalUnpackingTime += time; if (time > pMaxUnpackingTime) pMaxUnpackingTime = time; double beta = parser.getSupertaggerBeta(); Integer betaTally = pBetaTallies.get(beta); pBetaTallies.put(beta, (betaTally != null) ? ++betaTally : 1); } // get test item LF, if needed LF testItemLF = null; if (testItem.lfElt != null && (doRealization || (showParseStats && parsed && testItem.sign == null))) { Element lfElt = testItem.lfElt; Document doc = new Document(); lfElt.detach(); doc.setRootElement(lfElt); testItemLF = grammar.loadLF(doc); } // compare EPs EPsScorer.Results parseScore = null; LF goldLF = null; if (showParseStats && parsedLF != null) { // get LF to score, gold LF LF lfToScore = null; if (testItem.sign != null) { // use LF from stored sign if available lfToScore = parsedLF; Category cat = testItem.sign.getCategory().copy(); Nominal index = cat.getIndexNominal(); goldLF = cat.getLF(); index = HyloHelper.convertNominals(goldLF, testItem.sign, index); } else { // otherwise use test item LF lfToScore = transformedParsedLF; goldLF = testItemLF; } if (goldLF == null) { throw new RuntimeException( "Can't score parse: " + testItem.sentence + "!\n" + "No gold LF." ); } // score parse parseScore = EPsScorer.score(lfToScore, goldLF); } // update parsing results if (showParseStats && testItem.numOfParses > 0) { pCount++; if (parses.size() == 0 || !parsedComplete) pBadCount++; if (parses.size() == 0) pFailedCount++; if (parseScore != null) { if (parseScore.fscore == 1.0) pExactCount++; totalF += parseScore.fscore; totalRecall += parseScore.recall; totalPrecision += parseScore.precision; totalDepsF += parseScore.depsFscore; totalDepsRecall += parseScore.depsRecall; totalDepsPrecision += parseScore.depsPrecision; totalUnlabeledDepsF += parseScore.unlabeledDepsFscore; totalUnlabeledDepsRecall += parseScore.unlabeledDepsRecall; totalUnlabeledDepsPrecision += parseScore.unlabeledDepsPrecision; if (parsedComplete) { totalFComplete += parseScore.fscore; totalRecallComplete += parseScore.recall; totalPrecisionComplete += parseScore.precision; totalDepsFComplete += parseScore.depsFscore; totalDepsRecallComplete += parseScore.depsRecall; totalDepsPrecisionComplete += parseScore.depsPrecision; totalUnlabeledDepsFComplete += parseScore.unlabeledDepsFscore; totalUnlabeledDepsRecallComplete += parseScore.unlabeledDepsRecall; totalUnlabeledDepsPrecisionComplete += parseScore.unlabeledDepsPrecision; } } } // events output if (events != null && doParsing) { // nb: only dealing with complete parses at the moment // nb: gold LF must come from saved sign if (parses.size() > 0 && testItem.sign != null) { List bestSigns = new ArrayList(parses); Sign best = parses.get(0); // update best if not exact match if (parseScore.fscore != 1.0) { // check oracle best Pair bestPair = parser.oracleBest(goldLF); if (bestPair.a != null) oracleBetter++; if (bestPair.b) { best = bestPair.a; if (!bestSigns.contains(best)) bestSigns.add(best); } // add gold if missing, if apropos else { goldMissing++; if (includeGoldInEvents) { best = testItem.sign; //parser.addSupertaggerLogProbs(best); bestSigns.add(best); } } } EventFile.writeEvents(events, bestSigns, best, featureExtractor); } } // n-best parses output if (nbestparsePW != null) { XMLOutputter outputter = new XMLOutputter(); outputter.setFormat(Format.getPrettyFormat()); // header for item String extras = ""; if (parsedComplete) extras += " complete=\"true\""; String id = 
testItem.info; if (id == null) id = "" + i; nbestparsePW.println(""); String tagend = (nbestincludelfs) ? ">" : "/>"; // add best parse if (parseScore != null) { double edgeScore = parseScores.get(0); String scores = "score=\"" + nf.format(parseScore.fscore) + "\" edge-score=\"" + nfE.format(edgeScore) + "\""; nbestparsePW.println(""); } } // add remaining n-best for (int k=1; k < parses.size(); k++) { Sign sign = parses.get(k); double edgeScore = parseScores.get(k); Category cat = sign.getCategory().copy(); Nominal index = cat.getIndexNominal(); LF parsedLFk = cat.getLF(); index = HyloHelper.convertNominals(parsedLFk, sign, index); LF compactedLFk = HyloHelper.compact(parsedLFk, index); LF lfToScore = parsedLFk; if (testItem.sign != null) { lfToScore = grammar.transformLF(compactedLFk); } EPsScorer.Results parseScoreK = EPsScorer.score(lfToScore, goldLF); String scores = "score=\"" + nf.format(parseScoreK.fscore) + "\" edge-score=\"" + nfE.format(edgeScore) + "\""; nbestparsePW.println(""); } } // close item nbestparsePW.println(""); } // determine string to show for parse result String starForBadSentence = ""; if (testItem.numOfParses == 0) starForBadSentence = "*"; String parseResult; if (!doParsing) { parseResult = "-"; } else if (parseScore != null) { parseResult = nf.format(parseScore.fscore); if (!parsedComplete) parseResult = "[" + parseResult + "]"; } else if (testItem.numOfParses == parses.size()) { parseResult = "ok"; } else if (testItem.numOfParses > 0 && parses.size() > 0) { // show num parses, if not the number expected parseResult = "(" + parses.size() + ")"; } else if (testItem.knownFailure) { parseResult = "(known)"; } else { parseResult = "FAILED"; } if (!doRealization || (doParsing && !parsed) || testItem.numOfParses == 0) { showOutcome(parseResult, "-", starForBadSentence, testItem.sentence); continue; } LF inputLF = null; // use given LF if (testItemLF != null) inputLF = testItemLF; // or LF from stored sign else if (testItem.sign != null) { Sign sign = testItem.sign; Category cat = sign.getCategory().copy(); Nominal index = cat.getIndexNominal(); LF convertedLF = HyloHelper.compactAndConvertNominals(cat.getLF(), index, sign); inputLF = grammar.transformLF(convertedLF); } // otherwise use first parse else if (transformedParsedLF != null) inputLF = transformedParsedLF; // otherwise give up else { String suggestion = (!doParsing) ? "Try leaving off -noparsing option." : ""; throw new RuntimeException("No LF to realize! " + suggestion); } // set up n-gram precision scorer for default scoring and/or scoring results String[] targets = (testItem.alt == null) ? 
new String[] { testItem.sentence } : new String[] { testItem.sentence, testItem.alt }; NgramPrecisionModel defaultNgramScorer = new NgramPrecisionModel(targets); SignScorer scorerToUse = scorer; if (scorerToUse == null) { if (ngramOrder > 0 || exactMatches) { if (ngramOrder > 0) scorerToUse = new NgramPrecisionModel(targets, ngramOrder); else scorerToUse = new NgramPrecisionModel(targets); ((NgramPrecisionModel)scorerToUse).setExactMatches(exactMatches); } else scorerToUse = defaultNgramScorer; } // set targets for self-paraphrase biasing else if (scorerToUse instanceof SelfParaphraseBiaser) { ((SelfParaphraseBiaser)scorerToUse).setTargets(targets); } if (doGC) System.gc(); try { //Add gold std pred supertag mapping to the hypertagger class if (events != null && realizer.hypertagger != null && testItem.predInfo != null) { realizer.hypertagger.storeGoldStdPredInfo(testItem.predInfo); } realizer.realize(inputLF, scorerToUse); } catch (Throwable thrwbl) { System.out.println("Unable to process: " + testItem.sentence); thrwbl.printStackTrace(System.out); continue; } opennlp.ccg.realize.Chart chart = realizer.getChart(); String realizeResult = "ok"; boolean gramcomplete = true; boolean joined = false; Edge bestEdge = chart.bestEdge; if (!bestEdge.complete() || bestEdge.getSign().getCategory().isFragment()) { realizeResult = "[ok]"; gramcomplete = false; rBadCount++; if (chart.joinFragments) { bestEdge = chart.bestJoinedEdge; joined = true; } } String bestRealization = bestEdge.getSign().getOrthography(); double score = defaultNgramScorer.score(bestEdge.getSign(), false); // nb: use default n-gram precision score for reporting // events output if (events != null) { List bestEdges = chart.bestEdges(); // nb: only dealing with complete realizations at the moment if (bestEdges.size() > 0) { Pair bestPair = chart.oracleBest(testItem.sentence); Edge oracleBest = bestPair.a; if (oracleBest != null) { Sign best = oracleBest.getSign(); List bestSigns = new ArrayList(bestEdges.size()+1); for (Edge e : bestEdges) bestSigns.add(e.getSign()); if (bestEdge != oracleBest) oracleBetter++; if (!bestPair.b) { goldMissing++; if (includeGoldInEvents) { best = testItem.sign; } } if (!bestSigns.contains(best)) bestSigns.add(best); EventFile.writeEvents(events, bestSigns, best, featureExtractor); } } } // bleu output if (bleufileprefix != null) { String extras = " time=\""; if (chart.done) extras += chart.timeTilDone; else extras += chart.timeTilStopped; extras += "\""; extras += " score=\"" + nf.format(score) + "\""; if (gramcomplete) extras += " complete=\"true\""; if (joined) extras += " joined=\"true\""; String id = testItem.info; if (id == null) id = "" + i; bleuGen.println("" + norm_bleu(bestRealization) + ""); String sent = norm_bleu(testItem.sentence); bleuRef.println("" + sent + ""); bleuSrc.println("" + sent + ""); } // n-best realization output if (nbestrealPW != null) { XMLOutputter outputter = new XMLOutputter(); outputter.setFormat(Format.getPrettyFormat()); // header for item String extras = ""; if (gramcomplete) extras += " complete=\"true\""; if (joined) extras += " joined=\"true\""; String id = testItem.info; if (id == null) id = "" + i; nbestrealPW.println(""); // add ref sentence String ref = (nbestnormbleu) ? norm_bleu(testItem.sentence) : xmlEscape(testItem.sentence); nbestrealPW.println("" + ref + ""); // add best realization String scores = "score=\"" + nf.format(score) + "\" edge-score=\"" + nfE.format(bestEdge.score) + "\""; String best = (nbestnormbleu) ? 
norm_bleu(bestRealization) : xmlEscape(bestRealization); if (!nbestincludelfs) nbestrealPW.println("" + best + ""); else { nbestrealPW.println(""); nbestrealPW.println("" + best + ""); Sign sign = bestEdge.getSign(); Category cat = sign.getCategory().copy(); Nominal index = cat.getIndexNominal(); LF lf = cat.getLF(); index = HyloHelper.convertNominalsToVars(lf, index); index = HyloHelper.convertNominals(lf, sign, index); LF lfc = HyloHelper.compact(lf, index); Element lfElt = grammar.makeLfElt(lfc); nbestrealPW.println(outputter.outputString(lfElt)); nbestrealPW.println(""); } // if complete, add remaining n-best if (bestEdge.complete()) { List bestEdges = chart.bestEdges(); for (int j=1; j < bestEdges.size(); j++) { Edge e = bestEdges.get(j); String eSent = e.getSign().getOrthography(); double eScore = defaultNgramScorer.score(e.getSign(), false); // nb: use default n-gram precision score for reporting String eScores = " score=\"" + nf.format(eScore) + "\" edge-score=\"" + nfE.format(e.score) + "\""; // add next realization String next = (nbestnormbleu) ? norm_bleu(eSent) : xmlEscape(eSent); if (!nbestincludelfs) nbestrealPW.println("" + next + ""); else { nbestrealPW.println(""); nbestrealPW.println("" + next + ""); Sign sign = e.getSign(); Category cat = sign.getCategory().copy(); Nominal index = cat.getIndexNominal(); LF lf = cat.getLF(); index = HyloHelper.convertNominalsToVars(lf, index); index = HyloHelper.convertNominals(lf, sign, index); LF lfc = HyloHelper.compact(lf, index); Element lfElt = grammar.makeLfElt(lfc); nbestrealPW.println(outputter.outputString(lfElt)); nbestrealPW.println(""); } } } // close item nbestrealPW.println(""); } // if apmldir non-null, output APML as apmldir/ex(i+1).apml if (apmldir != null) { String apmlfn = apmldir + "/ex" + (i+1) + ".apml"; grammar.saveToApml(bestEdge.getSign(), apmlfn); } // store best realization, if apropos and grammatically complete, keyed by info string or item position if (realserdir != null && gramcomplete) { String id = testItem.info; if (id == null) id = "i" + i; bestRealMap.put(testItem.info, bestEdge.getSign()); } // compute stats, show outcome rCount++; totalScore += score; if (gramcomplete) totalScoreComplete += score; int itemRank = 1; Tokenizer tokenizer = grammar.lexicon.tokenizer; String itemOrth = tokenizer.getOrthography(tokenizer.tokenize(testItem.sentence)); if (!bestRealization.equals(itemOrth)) { itemRank = 0; List bestEdges = chart.bestEdges(); for (int j = 0; j < bestEdges.size(); j++) { Edge edge = bestEdges.get(j); String str = edge.getSign().getOrthography(); if (str.equals(itemOrth)) { itemRank = j+1; break; } } if (itemRank > 0) totalReciprocalRank += (1.0 / itemRank); if (gramcomplete) { realizeResult = nf.format(score); if (itemRank > 0 && itemRank < 10) realizeResult += " "; if (itemRank > 0 && itemRank < 100) realizeResult += "#" + itemRank; } else { realizeResult = "[" + nf.format(score) + "]"; if (joined) realizeResult += "j"; } showOutcome(parseResult, realizeResult, starForBadSentence, testItem.sentence, bestRealization); } else { rExactCount++; totalReciprocalRank += 1.0; showOutcome(parseResult, realizeResult, starForBadSentence, testItem.sentence); } totalNominals += chart.numNominals; int tokens = testItem.sentence.split("\\s+").length; totalTokens += tokens; if (tokens < minTokens || minTokens == 0) minTokens = tokens; if (tokens > maxTokens) maxTokens = tokens; totalRuleApps += chart.edgeFactory.ruleApps(); totalEdges += chart.numEdgesInChart(); totalEdgesCreated += chart.numEdges; 
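// remaining per-item chart stats: pruning counts, best scores and ranks, and lex/first/best/packed/stopped-or-done times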
totalUnprunedEdges += chart.numUnprunedEdges(); totalPrunedRemoved += chart.numPrunedRemoved; totalPrunedNeverAdded += chart.numPrunedNeverAdded; totalCellMax += chart.cellMax; totalNewBest += chart.newBest; bestActualScores.add(new Double(score)); bestEstimatedScores.add(new Double(bestEdge.score)); itemRanks.add(new Integer(itemRank)); totalLex += chart.timeTilLex; if (chart.timeTilLex > maxLex) { maxLex = chart.timeTilLex; maxLexStr = testItem.sentence; } lexMap.add(chart.numNominals, chart.timeTilLex); totalFirst += chart.timeTilFirst; if (chart.timeTilFirst > maxFirst) { maxFirst = chart.timeTilFirst; maxFirstStr = testItem.sentence; } firstMap.add(chart.numNominals, chart.timeTilFirst); totalBest += chart.timeTilBest; if (chart.timeTilBest > maxBest) { maxBest = chart.timeTilBest; maxBestStr = testItem.sentence; } bestMap.add(chart.numNominals, chart.timeTilBest); if (chart.newBest > 0 && (chart.timeTilBest - chart.timeTilFirst) >= maxNewBest) { maxNewBest = chart.timeTilBest - chart.timeTilFirst; maxNewBestStr = testItem.sentence; } totalPacked += chart.timeTilPacked; if (chart.timeTilPacked > maxPacked) { maxPacked = chart.timeTilPacked; maxPackedStr = testItem.sentence; } if (chart.done) { rDoneCount++; totalStoppedOrDone += chart.timeTilDone; if (chart.timeTilDone > maxStoppedOrDone) { maxStoppedOrDone = chart.timeTilDone; maxStoppedOrDoneStr = testItem.sentence; } allMap.add(chart.numNominals, chart.timeTilDone); } else { totalStoppedOrDone += chart.timeTilStopped; if (chart.timeTilStopped > maxStoppedOrDone) { maxStoppedOrDone = chart.timeTilStopped; maxStoppedOrDoneStr = testItem.sentence; } allMap.add(chart.numNominals, chart.timeTilStopped); } } // end bleu doc (if apropos) bleuEndDoc(); System.out.println(); // serialize best realizations (if apropos) realserEndDoc(testName); } /** Shows the various parsing totals. 
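Accuracy figures are per-item averages reported as percentages; edge counts and times are averaged per item, with separate figures for complete (non-fragmentary) parses.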
*/ public void showParseStats() { int pCompleteCount = pCount - pBadCount; int pFragCount = pBadCount - pFailedCount; String pComplete = "" + pCompleteCount; String pCompletePct = "" + nf.format(100.0 * pCompleteCount / pCount) + "%"; System.out.println("Strings parsed completely (in fragments, failed): " + pComplete + " (" + pFragCount + ", " + pFailedCount + ") " + pCompletePct); String pInexact = "" + (pCount - pExactCount); String pExactPct = "" + nf.format(100.0 * pExactCount / pCount) + "%"; System.out.println("Strings parsed exactly (inexactly): " + pExactCount + " (" + pInexact + ") " + pExactPct); String avgEdges = nf.format(1.0 * pTotalEdges / pCount); System.out.println("Average edge count (before unpacking): " + avgEdges); String avgEdgesGood = nf.format(1.0 * pTotalEdgesGood / pCompleteCount); System.out.println("Average edge count (before unpacking) for complete parses: " + avgEdgesGood); System.out.println("Max edge count: " + pMaxEdges); System.out.println("Max edge count for complete parses: " + pMaxEdgesGood); String avgUnpacked = nf.format(1.0 * pTotalUnpackingEdges / pCount); System.out.println("Average edges unpacked (created while unpacking): " + avgUnpacked); System.out.println("Max unpacked edges: " + pMaxUnpackingEdges); String avgLexTime = nf.format(1.0 * pTotalLexTime / pCount); String avgCellMax = nf.format(1.0 * pTotalCellMax / pCount); System.out.println("Average max cell size (before unpacking): " + avgCellMax); String avgCellMaxGood = nf.format(1.0 * pTotalCellMaxGood / pCompleteCount); System.out.println("Average max cell size (before unpacking) for complete parses: " + avgCellMaxGood); System.out.println("Max max cell size: " + pMaxCellMax); System.out.println("Max max cell size for complete parses: " + pMaxCellMaxGood); System.out.println("Average lex lookup time: " + avgLexTime); String avgParseTime = nf.format(1.0 * pTotalParseTime / pCount); System.out.println("Max lex lookup time: " + pMaxLexTime); System.out.println("Average parse time: " + avgParseTime); System.out.println("Max parse time: " + pMaxParseTime); String avgChartTime = nf.format(1.0 * pTotalChartTime / pCount); System.out.println("Average chart construction time: " + avgChartTime); System.out.println("Max chart construction time: " + pMaxChartTime); String avgUnpackingTime = nf.format(1.0 * pTotalUnpackingTime / pCount); System.out.println("Average unpacking time: " + avgUnpackingTime); System.out.println("Max unpacking time: " + pMaxUnpackingTime); System.out.println("Supertagger beta tallies:"); Set betas = pBetaTallies.keySet(); for (double beta : betas) { int tally = pBetaTallies.get(beta); System.out.println(beta + "\t" + tally); } String avgF = nf.format(100.0 * totalF / pCount); System.out.println("Labeled f-score: " + avgF); String avgRecall = nf.format(100.0 * totalRecall / pCount); System.out.println("Labeled recall: " + avgRecall); String avgPrecision = nf.format(100.0 * totalPrecision / pCount); System.out.println("Labeled precision: " + avgPrecision); String avgDepsF = nf.format(100.0 * totalDepsF / pCount); System.out.println("Labeled f-score deps only: " + avgDepsF); String avgDepsRecall = nf.format(100.0 * totalDepsRecall / pCount); System.out.println("Labeled recall deps only: " + avgDepsRecall); String avgDepsPrecision = nf.format(100.0 * totalDepsPrecision / pCount); System.out.println("Labeled precision deps only: " + avgDepsPrecision); String avgUnlabeledDepsF = nf.format(100.0 * totalUnlabeledDepsF / pCount); System.out.println("Unlabeled deps f-score: " + 
avgUnlabeledDepsF); String avgUnlabeledDepsRecall = nf.format(100.0 * totalUnlabeledDepsRecall / pCount); System.out.println("Unlabeled deps recall: " + avgUnlabeledDepsRecall); String avgUnlabeledDepsPrecision = nf.format(100.0 * totalUnlabeledDepsPrecision / pCount); System.out.println("Unlabeled deps precision: " + avgUnlabeledDepsPrecision); String avgFComplete = nf.format(100.0 * totalFComplete / pCompleteCount); System.out.println("Labeled f-score for complete parses: " + avgFComplete); String avgRecallComplete = nf.format(100.0 * totalRecallComplete / pCompleteCount); System.out.println("Labeled recall for complete parses: " + avgRecallComplete); String avgPrecisionComplete = nf.format(100.0 * totalPrecisionComplete / pCompleteCount); System.out.println("Labeled precision for complete parses: " + avgPrecisionComplete); String avgDepsFComplete = nf.format(100.0 * totalDepsFComplete / pCompleteCount); System.out.println("Labeled f-score deps only for complete parses: " + avgDepsFComplete); String avgDepsRecallComplete = nf.format(100.0 * totalDepsRecallComplete / pCompleteCount); System.out.println("Labeled recall deps only for complete parses: " + avgDepsRecallComplete); String avgDepsPrecisionComplete = nf.format(100.0 * totalDepsPrecisionComplete / pCompleteCount); System.out.println("Labeled precision deps only for complete parses: " + avgDepsPrecisionComplete); String avgUnlabeledDepsFComplete = nf.format(100.0 * totalUnlabeledDepsFComplete / pCompleteCount); System.out.println("Unlabeled deps f-score for complete parses: " + avgUnlabeledDepsFComplete); String avgUnlabeledDepsRecallComplete = nf.format(100.0 * totalUnlabeledDepsRecallComplete / pCompleteCount); System.out.println("Unlabeled deps recall for complete parses: " + avgUnlabeledDepsRecallComplete); String avgUnlabeledDepsPrecisionComplete = nf.format(100.0 * totalUnlabeledDepsPrecisionComplete / pCompleteCount); System.out.println("Unlabeled deps precision for complete parses: " + avgUnlabeledDepsPrecisionComplete); if (oracleBetter > 0) System.out.println("Oracle better: " + oracleBetter); if (goldMissing > 0) System.out.println("Gold missing: " + goldMissing); System.out.println(); } /** Shows the various realization totals. */ public void showStats() { int rCompleteCount = rCount - rBadCount; String rComplete = "" + rCompleteCount; String rCompletePct = "" + nf.format(100.0 * rCompleteCount / rCount) + "%"; System.out.println("Strings realized completely (in fragments): " + rComplete + " (" + rBadCount + ") " + rCompletePct); String rInexact = "" + (rCount - rExactCount); String rExactPct = "" + nf.format(100.0 * rExactCount / rCount) + "%"; System.out.println("Strings realized exactly (inexactly): " + rExactCount + " (" + rInexact + ") " + rExactPct); System.out.println("Strings where realization finished: " + rDoneCount); String avgScore = nf.format(totalScore / rCount); System.out.println("Avg score: " + avgScore); String avgScoreComplete = nf.format(totalScoreComplete / rCompleteCount); System.out.println("Avg score for complete realizations: " + avgScoreComplete); String meanReciprocalRank = nf.format(totalReciprocalRank / rCount); System.out.println("Mean reciprocal rank: " + meanReciprocalRank); String residualMRR = (rCount == rExactCount) ? 
"n/a" : nf.format((totalReciprocalRank - rExactCount) / (rCount - rExactCount)); System.out.println("Residual mean reciprocal rank: " + residualMRR); String avgNodes = nf.format(totalNominals * 1.0 / rCount); String avgTokens = nf.format(totalTokens * 1.0 / rCount); System.out.println("Avg num nodes, words: " + avgNodes + ", " + avgTokens); System.out.println("Num words (min-max): " + minTokens + "-" + maxTokens); String avgRuleApps = nf.format(totalRuleApps * 1.0 / rCount); System.out.println("Avg num rule apps: " + avgRuleApps); String avgEdges = nf.format(totalEdges * 1.0 / rCount); String avgEdgesCreated = nf.format(totalEdgesCreated * 1.0 / rCount); String avgUnprunedEdges = nf.format(totalUnprunedEdges * 1.0 / rCount); System.out.println("Avg num edges in chart: " + avgEdges); System.out.println("Avg num edges created: " + avgEdgesCreated); System.out.println("Avg num unpruned edges: " + avgUnprunedEdges); String avgRemoved = nf.format(totalPrunedRemoved * 1.0 / rCount); String avgNeverAdded = nf.format(totalPrunedNeverAdded * 1.0 / rCount); System.out.println("Avg num pruned edges removed, never added: " + avgRemoved + ", " + avgNeverAdded); String avgCellMax = nf.format(totalCellMax * 1.0 / rCount); System.out.println("Avg cell max: " + avgCellMax); String avgNewBest = nf.format(totalNewBest * 1.0 / rCount); System.out.println("Total, avg num new best realizations: " + totalNewBest + ", " + avgNewBest); String avgLex = nf.format(lexMap.mean()); String stdLex = nf.format(lexMap.sigma()); System.out.println("Avg (std) time 'til lex lookup finished: " + avgLex + " (" + stdLex + ")"); System.out.println("Max time 'til lex lookup finished: " + maxLex + " (" + maxLexStr + ")"); String avgFirst = nf.format(firstMap.mean()); String stdFirst = nf.format(firstMap.sigma()); System.out.println("Avg (std) time 'til first realization: " + avgFirst + " (" + stdFirst + ")"); System.out.println("Max time 'til first realization: " + maxFirst + " (" + maxFirstStr + ")"); String avgBest = nf.format(bestMap.mean()); String stdBest = nf.format(bestMap.sigma()); System.out.println("Avg (std) time 'til best realization: " + avgBest + " (" + stdBest + ")"); System.out.println("Max time 'til best realization: " + maxBest + " (" + maxBestStr +")"); System.out.println("Max time 'til new best realization: " + maxNewBest + " (" + maxNewBestStr +")"); String avgPacked = nf.format(totalPacked / rCount); System.out.println("Avg time 'til done packing: " + avgPacked); System.out.println("Max time 'til done packing: " + maxPacked + " (" + maxPackedStr +")"); String avgStoppedOrDone = nf.format(allMap.mean()); String stdStoppedOrDone = nf.format(allMap.sigma()); System.out.println("Avg (std) time 'til stopped/done with realizations: " + avgStoppedOrDone + " (" + stdStoppedOrDone + ")"); System.out.println("Max time 'til stopped/done with realizations: " + maxStoppedOrDone + " (" + maxStoppedOrDoneStr +")"); if (oracleBetter > 0) System.out.println("Oracle better: " + oracleBetter); if (goldMissing > 0) System.out.println("Gold missing: " + goldMissing); if (statsfile != null) { Document doc = new Document(); Element root = new Element("rstats"); doc.setRootElement(root); Element counts = new Element("counts"); root.addContent(counts); counts.setAttribute("realized", "" + rCount); counts.setAttribute("complete", "" + rComplete); counts.setAttribute("fragments", "" + rBadCount); counts.setAttribute("exact", "" + rExactCount); counts.setAttribute("inexact", rInexact); counts.setAttribute("finished", "" + rDoneCount); 
Element overall = new Element("overall"); root.addContent(overall); overall.setAttribute("avg-score", avgScore); overall.setAttribute("mean-reciprocal-rank", meanReciprocalRank); overall.setAttribute("residual-mrr", residualMRR); overall.setAttribute("avg-nodes", avgNodes); overall.setAttribute("avg-words", avgTokens); overall.setAttribute("min-words", "" + minTokens); overall.setAttribute("max-words", "" + maxTokens); Element rules = new Element("rules"); root.addContent(rules); rules.setAttribute("avg-apps", avgRuleApps); Element edges = new Element("edges"); root.addContent(edges); edges.setAttribute("avg", avgEdges); edges.setAttribute("avg-unpruned", avgUnprunedEdges); edges.setAttribute("avg-removed", avgRemoved); edges.setAttribute("avg-never-added", avgNeverAdded); edges.setAttribute("avg-cell-max", avgCellMax); if (oracleBetter > 0) edges.setAttribute("oracle-better", "" + oracleBetter); if (goldMissing > 0) edges.setAttribute("gold-missing", "" + goldMissing); Element newBest = new Element("new-best"); root.addContent(newBest); newBest.setAttribute("total", "" + totalNewBest); newBest.setAttribute("avg", avgNewBest); Element times = new Element("times-summary"); root.addContent(times); times.setAttribute("avg-lex", avgLex); times.setAttribute("std-lex", stdLex); times.setAttribute("avg-first", avgFirst); times.setAttribute("std-first", stdFirst); times.setAttribute("max-first", "" + maxFirst); times.setAttribute("avg-best", avgBest); times.setAttribute("std-best", stdBest); times.setAttribute("max-best", "" + maxBest); times.setAttribute("max-new-best", "" + maxNewBest); times.setAttribute("avg-packed", avgPacked); times.setAttribute("max-packed", "" + maxPacked); times.setAttribute("avg-stopped-or-done", avgStoppedOrDone); times.setAttribute("std-stopped-or-done", stdStoppedOrDone); times.setAttribute("max-stopped-or-done", "" + maxStoppedOrDone); Element strings = new Element("max-strings"); root.addContent(strings); Element lex = new Element("lex"); strings.addContent(lex); lex.addContent(maxLexStr); Element first = new Element("first"); strings.addContent(first); first.addContent(maxFirstStr); Element best = new Element("best"); strings.addContent(best); best.addContent(maxBestStr); Element newBest2 = new Element("new-best"); strings.addContent(newBest2); newBest2.addContent(maxNewBestStr); Element packed = new Element("packed"); strings.addContent(packed); packed.addContent(maxPackedStr); Element stoppedOrDone = new Element("stopped-or-done"); strings.addContent(stoppedOrDone); stoppedOrDone.addContent(maxStoppedOrDoneStr); Element scores = new Element("scores"); root.addContent(scores); for (int i = 0; i < bestActualScores.size(); i++) { Element score = new Element("score"); scores.addContent(score); score.setAttribute("val", bestActualScores.get(i).toString()); score.setAttribute("est", bestEstimatedScores.get(i).toString()); score.setAttribute("rank", itemRanks.get(i).toString()); } firstMap.saveTimes(root); bestMap.saveTimes(root); allMap.saveTimes(root); try { FileOutputStream out = new FileOutputStream(statsfile); grammar.serializeXml(doc, out); out.flush(); } catch (IOException exc) { System.out.println("Unable to write stats to: " + statsfile + " (" + exc + ")"); } } } // show outcome, with wrapping private static void showOutcome(String parseResult, String realizeResult, String starForBadSentence, String str) { showOutcome(parseResult, realizeResult, starForBadSentence, str, null); } // show outcome including best realization private static void showOutcome(String 
parseResult, String realizeResult, String starForBadSentence, String str, String bestRealization) { System.out.print(parseResult + "\t" + realizeResult + "\t"); simpleWrap(starForBadSentence + str); if (bestRealization != null) { System.out.print("\t\t"); simpleWrap("(best: " + bestRealization + ")"); } } // does simple wrapping at TEXTWIDTH private static void simpleWrap(String str) { int TEXTWIDTH = 60; for (int i = 0; i <= (str.length()-1)/TEXTWIDTH; i++) { if (i != 0) { System.out.print("\t\t"); } System.out.println(str.substring(i*TEXTWIDTH, Math.min(i*TEXTWIDTH + TEXTWIDTH, str.length()))); } } // formats to three decimal places private static final NumberFormat nf = initNF(); private static NumberFormat initNF() { NumberFormat f = NumberFormat.getInstance(); f.setMinimumIntegerDigits(1); f.setMinimumFractionDigits(1); f.setMaximumFractionDigits(2); return f; } // formats to three decimal places in scientific notation private static final NumberFormat nfE = initNFE(); private static NumberFormat initNFE() { DecimalFormat f = new DecimalFormat("0.###E0"); return f; } /** Shows realizer settings for current test. */ static void showRealizerSettings() { // get, show prefs Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); boolean useIndexing = prefs.getBoolean(EdgeFactory.USE_INDEXING, true); boolean useChunks = prefs.getBoolean(EdgeFactory.USE_CHUNKS, true); boolean useLicensing = prefs.getBoolean(EdgeFactory.USE_FEATURE_LICENSING, true); boolean useCombos = prefs.getBoolean(opennlp.ccg.realize.Chart.USE_COMBOS, true); boolean usePacking = prefs.getBoolean(opennlp.ccg.realize.Chart.USE_PACKING, false); int timeLimit = prefs.getInt(opennlp.ccg.realize.Chart.TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); double nbTimeLimit = prefs.getDouble(opennlp.ccg.realize.Chart.NEW_BEST_TIME_LIMIT, opennlp.ccg.realize.Chart.NO_TIME_LIMIT); int pruningVal = prefs.getInt(opennlp.ccg.realize.Chart.PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); int cellPruningVal = prefs.getInt(opennlp.ccg.realize.Chart.CELL_PRUNING_VALUE, opennlp.ccg.realize.Chart.NO_PRUNING); String msg = "Timing realization with index filtering " + ((useIndexing) ? "on" : "off") + ", "; msg += "chunks " + ((useChunks) ? "on" : "off") + ", "; msg += "licensing " + ((useLicensing) ? "on" : "off") + ", "; if (usePacking) msg += "packing on, "; else { msg += "combos " + ((useCombos) ? "on" : "off") + ", "; if (timeLimit == opennlp.ccg.realize.Chart.NO_TIME_LIMIT) msg += "no time limit, "; else msg += "a time limit of " + timeLimit + " ms, "; if (nbTimeLimit == opennlp.ccg.realize.Chart.NO_TIME_LIMIT) msg += "no new best time limit, "; else { msg += "a new best time limit of "; if (nbTimeLimit >= 1) msg += ((int)nbTimeLimit) + " ms, "; else msg += nbTimeLimit + " of first, "; } } if (pruningVal == opennlp.ccg.realize.Chart.NO_PRUNING) msg += "no pruning, "; else msg += "a pruning value of " + pruningVal + ", "; msg += "and "; if (cellPruningVal == opennlp.ccg.realize.Chart.NO_PRUNING) msg += "no cell pruning"; else msg += "a cell pruning value of " + cellPruningVal; System.out.println(msg); System.out.println(); } /** * Writes the target strings from the given testbed to the given textfile. */ public void writeTargets(File tbFile, String textfile) throws IOException { writeTargets(tbFile, textfile, false, false, false); } /** * Writes the target strings with semantic class replacement * from the given testbed to the given textfile. 
*/ public void writeTargetsSC(File tbFile, String textfile) throws IOException { writeTargets(tbFile, textfile, true, false, false); } /** * Writes the target strings with all associated factors * from the given testbed to the given textfile. */ public void writeTargetsF(File tbFile, String textfile) throws IOException { writeTargets(tbFile, textfile, false, true, false); } /** * Writes the target strings with all associated factors with semantic class replacement * from the given testbed to the given textfile. */ public void writeTargetsFSC(File tbFile, String textfile) throws IOException { writeTargets(tbFile, textfile, true, true, false); } // writes targets, optionally with sem class replacement or factors, // and optionally reversing the words; ungrammatical options are filtered out private void writeTargets( File tbFile, String filename, boolean semClassReplacement, boolean withFactors, boolean reverse ) throws IOException { // open text file String option = ""; if (withFactors) option = " with factors"; if (semClassReplacement) option += " with semantic class replacement"; if (reverse) option += ", reversed"; System.out.println("Writing text file" + option + ": " + filename); System.out.println(); PrintWriter tOut = new PrintWriter(new BufferedWriter(new FileWriter(filename))); HashSet unique = new HashSet(); Tokenizer tokenizer = grammar.lexicon.tokenizer; // loop through files for (File f : getXMLFiles(tbFile)) { // load testbed System.out.println("Loading testbed from: " + f); RegressionInfo tbInfo = new RegressionInfo(grammar, f); int numItems = tbInfo.numberOfItems(); // do each test item for (int i = 0; i < numItems; i++) { // check even/odd only if (i % 2 == 1 && evenOnly) continue; if (i % 2 == 0 && oddOnly) continue; RegressionInfo.TestItem testItem = tbInfo.getItem(i); // check grammatical if (testItem.numOfParses == 0) continue; String s = testItem.sentence; // get parsed words if doing more than just text List words = null; if (semClassReplacement || withFactors) { // use words from sign or pre-parsed full words if available if (testItem.sign != null) words = testItem.sign.getWords(); else if (testItem.fullWords != null) words = tokenizer.tokenize(testItem.fullWords, true); // otherwise parse else words = grammar.getParsedWords(s); } else words = tokenizer.tokenize(s); // reverse, if apropos if (reverse) { List tmp = words; words = new ArrayList(words.size()); words.add(Word.createWord("")); for (int j = tmp.size()-1; j >= 0; j--) { Word w = tmp.get(j); if (w.getForm() == "" || w.getForm() == "") continue; // skip or words.add(w); } words.add(Word.createWord("")); } // write str, add to unique set String str = (!withFactors) ? 
tokenizer.getOrthography(words, semClassReplacement) : tokenizer.format(words, semClassReplacement); tOut.println(str); unique.add(str); System.out.print("."); // indicate progress } System.out.println(); } tOut.flush(); tOut.close(); System.out.println(); System.out.println("Unique strings: " + unique.size()); System.out.println(); } private void writeDerivationFactors(File tbFile, String filename) throws IOException { // open text file System.out.println("Writing derivation factors file: " + filename); System.out.println(); PrintWriter tOut = new PrintWriter(new BufferedWriter(new FileWriter(filename))); Tokenizer tokenizer = grammar.lexicon.tokenizer; // loop through files for (File f : getXMLFiles(tbFile)) { // load testbed System.out.println("Loading testbed from: " + f); RegressionInfo tbInfo = new RegressionInfo(grammar, f); int numItems = tbInfo.numberOfItems(); // do each test item, using the saved sign for (int i = 0; i < numItems; i++) { RegressionInfo.TestItem testItem = tbInfo.getItem(i); if (testItem.numOfParses == 0) continue; // check grammatical Sign sign = testItem.sign; List factors = GenerativeSyntacticModel.getFactors(sign); for (Word w : factors) { tOut.print(tokenizer.format(w)); tOut.print(" "); } tOut.println(); System.out.print("."); // indicate progress } System.out.println(); } tOut.flush(); tOut.close(); System.out.println(); } /** Command-line routine for regression testing. */ public static void main(String[] args) throws IOException { String usage = "java opennlp.ccg.test.Regression \n" + " (-noparsing) (-norealization) (-even|-odd) (-gc) \n" + " (-nullscorer) (-randomscorer) \n" + " (-depthfirst) (-exactmatches) (-aanfilter ()) \n" + " (-scorer ) \n" + " (-parsescorer ) \n" + " (-extractor ) \n" + " (-ngrampruningstrategy) (-pruningstrategy ) \n" + " (-hypertagger | -htconfig ) (-htgold) \n" + " (-supertagger | -stconfig ) \n" + " (-ngramorder N) (-lm|-lmsc ) \n" + " (-srilm " + Arrays.toString(SRILMNgramModelType.values()) + ")\n"+ " (-flm|-flmsc ) \n" + " (-text|-textsc|-textf|-textfsc ) (-reverse) \n" + " (-derivf ) \n" + " (-2events ) (-includegoldinevents) \n" + " (-2apml ) (-bleu ) \n" + " (-nbestrealfile ) (-nbestnormbleu) (-realserdir ) \n" + " (-nbestincludelfs) \n" + " (-rescorefile ) \n" + " (-nbestparsefile ) \n" + " (-g ) (-s ) (|)"; if (args.length > 0 && args[0].equals("-h")) { System.out.println("Usage: \n\n" + usage); System.exit(0); } // setup Regression tester Regression tester = new Regression(); // args String grammarfile = "grammar.xml"; String regressionfile = "testbed.xml"; boolean depthFirst = false; boolean aanfilter = false; String excfile = null; String scorerClass = null; String parseScorerClass = null; String extractorClass = null; boolean ngrampruningstrategy = false; String pruningStrategyClass = null; String hypertaggerClass = null, htconfig = null; boolean htgold = false; String supertaggerClass = null, stconfig = null; String lmfile = null; String flmfile = null; boolean useSemClasses = false; boolean withFactors = false; boolean reverse = false; String textfile = null; String derivfactorsfile = null; boolean srilm = false; SRILMNgramModelType srilmModelType = SRILMNgramModelType.STANDARD; for (int i = 0; i < args.length; i++) { if (args[i].startsWith("-D")) { String prop = args[i].substring(2); int equalpos = prop.indexOf("="); String key = prop.substring(0, equalpos); String val = prop.substring(equalpos+1); System.setProperty(key, val); continue; } if (args[i].equals("-noparsing")) { tester.doParsing = false; continue; 
} if (args[i].equals("-norealization")) { tester.doRealization = false; continue; } if (args[i].equals("-even")) { tester.evenOnly = true; continue; } if (args[i].equals("-odd")) { tester.oddOnly = true; continue; } if (args[i].equals("-gc")) { tester.doGC = true; continue; } if (args[i].equals("-nullscorer")) { tester.scorer = SignScorer.nullScorer; tester.parseScorer = SignScorer.nullScorer; continue; } if (args[i].equals("-randomscorer")) { tester.scorer = SignScorer.randomScorer; tester.parseScorer = SignScorer.randomScorer; continue; } if (args[i].equals("-depthfirst")) { depthFirst = true; continue; } if (args[i].equals("-exactmatches")) { tester.exactMatches = true; continue; } if (args[i].equals("-aanfilter")) { aanfilter = true; if (i < args.length-1 && args[i+1].charAt(0) != '-') excfile = args[++i]; continue; } if (args[i].equals("-scorer")) { scorerClass = args[++i]; continue; } if (args[i].equals("-parsescorer")) { parseScorerClass = args[++i]; continue; } if (args[i].equals("-extractor")) { extractorClass = args[++i]; continue; } if (args[i].equals("-ngrampruningstrategy")) { ngrampruningstrategy = true; continue; } if (args[i].equals("-pruningstrategy")) { pruningStrategyClass = args[++i]; continue; } if (args[i].equals("-hypertagger")) { hypertaggerClass = args[++i]; continue; } if (args[i].equals("-htconfig")) { htconfig = args[++i]; continue; } if (args[i].equals("-htgold")) { htgold = true; continue; } if (args[i].equals("-supertagger")) { supertaggerClass = args[++i]; continue; } if (args[i].equals("-stconfig")) { stconfig = args[++i]; continue; } if (args[i].equals("-ngramorder")) { tester.ngramOrder = Integer.parseInt(args[++i]); continue; } if (args[i].equals("-lm")) { lmfile = args[++i]; continue; } if (args[i].equals("-lmsc")) { lmfile = args[++i]; useSemClasses = true; continue; } if (args[i].equals("-flm")) { flmfile = args[++i]; continue; } if (args[i].equals("-flmsc")) { flmfile = args[++i]; useSemClasses = true; continue; } if (args[i].equals("-reverse")) { reverse = true; continue; } if (args[i].equals("-text")) { textfile = args[++i]; continue; } if (args[i].equals("-textsc")) { textfile = args[++i]; useSemClasses = true; continue; } if (args[i].equals("-textf")) { textfile = args[++i]; withFactors = true; continue; } if (args[i].equals("-textfsc")) { textfile = args[++i]; useSemClasses = true; withFactors = true; continue; } if (args[i].equals("-derivf")) { derivfactorsfile = args[++i]; continue; } if (args[i].equals("-2events")) { tester.eventfile = args[++i]; continue; } if (args[i].equals("-includegoldinevents")) { tester.includeGoldInEvents = true; continue; } if (args[i].equals("-2apml")) { tester.apmldir = args[++i]; continue; } if (args[i].equals("-bleu")) { tester.bleufileprefix = args[++i]; continue; } if (args[i].equals("-nbestrealfile")) { tester.nbestrealfile = args[++i]; continue; } if (args[i].equals("-nbestnormbleu")) { tester.nbestnormbleu = true; continue; } if (args[i].equals("-realserdir")) { tester.realserdir = args[++i]; continue; } if (args[i].equals("-nbestincludelfs")) { tester.nbestincludelfs = true; continue; } if (args[i].equals("-rescorefile")) { tester.rescorefile = args[++i]; continue; } if (args[i].equals("-nbestparsefile")) { tester.nbestparsefile = args[++i]; continue; } if (args[i].equals("-g")) { grammarfile = args[++i]; continue; } if (args[i].equals("-s")) { tester.statsfile = args[++i]; continue; } if (args[i].equals("-srilm")) { srilm = true; if(i < (args.length - 1)) { String type = args[i + 1]; try { srilmModelType = 
SRILMNgramModelType.valueOf(type); i++; } catch(IllegalArgumentException iae) { srilmModelType = SRILMNgramModelType.STANDARD; System.err.println( "Warning: unknown SRILM n-gram model type " + type + " specified, using default (" + srilmModelType + ")"); } } continue; } regressionfile = args[i]; } // load grammar URL grammarURL = new File(grammarfile).toURI().toURL(); System.out.println("Loading grammar from URL: " + grammarURL); tester.grammar = new Grammar(grammarURL); System.out.println(); // with -aanfilter (<exceptions file>) option, set up the a/an filter (nb: exact construction assumed) AAnFilter aanFilter = (!aanfilter) ? null : (excfile != null) ? new AAnFilter(excfile) : new AAnFilter(); // with -lm|-lmsc options, load n-gram model if (lmfile != null) { int order = (tester.ngramOrder > 0) ? tester.ngramOrder : 3; String reversedStr = (reverse) ? "reversed " : ""; System.out.println("Loading " + reversedStr + order + "-gram model from file: " + lmfile); NgramScorer lmScorer = (srilm) ? new SRILMNgramModel(order, new File(lmfile), useSemClasses, srilmModelType) : new StandardNgramModel(order, lmfile, useSemClasses); if (reverse) lmScorer.setReverse(true); if (aanfilter) lmScorer.addFilter(aanFilter); tester.scorer = lmScorer; System.out.println(); } // with -flm|-flmsc options, load factored n-gram model family if (flmfile != null) { String reversedStr = (reverse) ? "reversed " : ""; System.out.println("Loading " + reversedStr + "factored n-gram model family from file: " + flmfile); NgramScorer flmScorer = new FactoredNgramModelFamily(flmfile, useSemClasses); if (reverse) flmScorer.setReverse(true); if (aanfilter) flmScorer.addFilter(aanFilter); tester.scorer = flmScorer; tester.ngramOrder = flmScorer.getOrder(); System.out.println(); } // with -text|-textsc|-textf|-textfsc options, just write text file and exit if (textfile != null) { File tbFile = new File(regressionfile); tester.writeTargets(tbFile, textfile, useSemClasses, withFactors, reverse); System.exit(0); } // with -derivf option, just write derivation factors file and exit if (derivfactorsfile != null) { File tbFile = new File(regressionfile); tester.writeDerivationFactors(tbFile, derivfactorsfile); System.exit(0); } // setup parser if (tester.doParsing) { tester.parser = new Parser(tester.grammar); // instantiate scorer, if any if (parseScorerClass != null) { try { System.out.println("Instantiating parsing sign scorer from class: " + parseScorerClass); tester.parseScorer = (SignScorer) Class.forName(parseScorerClass).newInstance(); tester.showParseStats = true; // turn parsing stats on System.out.println(); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } // set parser scorer, if any if (tester.parseScorer != null) tester.parser.setSignScorer(tester.parseScorer); // also turn on parse stats if doing n-best output if (tester.nbestparsefile != null) tester.showParseStats = true; // instantiate supertagger, if any if (supertaggerClass != null || stconfig != null) { try { Supertagger supertagger; if (supertaggerClass != null) { System.out.println("Instantiating supertagger from class: " + supertaggerClass); supertagger = (Supertagger) Class.forName(supertaggerClass).newInstance(); } else { System.out.println("Instantiating supertagger from config file: " + stconfig); supertagger = WordAndPOSDictionaryLabellingStrategy.supertaggerFactory(stconfig); } tester.parser.setSupertagger(supertagger); if (tester.eventfile != null) { supertagger.setIncludeGold(true); // use gold tags during training tester.parser.setSupertaggerMostToLeastRestrictiveDirection(false); // reverse direction to start with least restrictive setting } System.out.println(); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } } // setup realizer, show
settings if (tester.doRealization) { tester.realizer = new Realizer(tester.grammar); tester.realizer.depthFirst = depthFirst; // instantiate pruning strategy, if any if (ngrampruningstrategy) { int order = (tester.ngramOrder > 0) ? tester.ngramOrder : 3; System.out.println("Instantiating n-gram diversity pruning strategy with order " + order); tester.realizer.pruningStrategy = new NgramDiversityPruningStrategy(order); System.out.println(); } if (pruningStrategyClass != null) { try { System.out.println("Instantiating pruning strategy from class: " + pruningStrategyClass); tester.realizer.pruningStrategy = (PruningStrategy) Class.forName(pruningStrategyClass).newInstance(); System.out.println(); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } if (hypertaggerClass != null || htconfig != null) { try { Hypertagger hypertagger; if (hypertaggerClass != null) { System.out.println("Instantiating hypertagger from class: " + hypertaggerClass); hypertagger = (Hypertagger) Class.forName(hypertaggerClass).newInstance(); } else { System.out.println("Instantiating hypertagger from config file: " + htconfig); hypertagger = ZLMaxentHypertagger.ZLMaxentHypertaggerFactory(htconfig); } tester.realizer.hypertagger = hypertagger; if (tester.eventfile != null) { hypertagger.setIncludeGold(true); // use gold tags during training // NB: could investigate most-to-least-restrictive direction } if (htgold) hypertagger.setIncludeGold(true); // use gold tags, eg for paraphrasing System.out.println(); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } showRealizerSettings(); } // ensure dir for event file exists; // set up feature extractor if (tester.eventfile != null) { File tmp = new File(tester.eventfile); File tmpParent = tmp.getParentFile(); if (tmpParent != null) tmpParent.mkdirs(); System.out.println("Writing event file to: " + tester.eventfile); System.out.println(); } // instantiate feature extractor, if generating events if (tester.eventfile != null) { // ensure just doing parsing or realization if (tester.doParsing && tester.doRealization) { throw new RuntimeException("Events can't be generated for parsing and realization at the same time."); } FeatureExtractor extractor = null; if (extractorClass != null) { try { System.out.println("Instantiating feature extractor from class: " + extractorClass); extractor = (FeatureExtractor) Class.forName(extractorClass).newInstance(); tester.featureExtractor = extractor; System.out.println(); } catch (Exception exc) { throw (RuntimeException) new RuntimeException().initCause(exc); } } // use or combine with scorer, if it's also a feature extractor if (tester.scorer instanceof FeatureExtractor) { if (extractor != null) tester.featureExtractor = new ComposedFeatureExtractor( new FeatureExtractor[] { (FeatureExtractor)tester.scorer, extractor } ); else tester.featureExtractor = (FeatureExtractor) tester.scorer; } else if (tester.parseScorer instanceof FeatureExtractor) { if (extractor != null) tester.featureExtractor = new ComposedFeatureExtractor( new FeatureExtractor[] { (FeatureExtractor)tester.parseScorer, extractor } ); else tester.featureExtractor = (FeatureExtractor) tester.parseScorer; } // otherwise use an n-gram precision model if (tester.featureExtractor == null) tester.featureExtractor = new NgramPrecisionModel(new String[]{""}, true); // set new alphabet tester.featureExtractor.setAlphabet(new Alphabet(10000)); } // ensure apmldir exists if (tester.apmldir != null) { File 
apmlDir = new File(tester.apmldir); if (!apmlDir.exists()) { apmlDir.mkdirs(); } System.out.println("Writing APML files to dir: " + tester.apmldir); System.out.println(); } // ensure dir for bleu files exists if (tester.bleufileprefix != null) { File tmp = new File(tester.bleufileprefix + "-gen.sgm"); File tmpParent = tmp.getParentFile(); if (tmpParent != null) tmpParent.mkdirs(); System.out.println("Writing BLEU files to: " + tester.bleufileprefix + "-*.sgm"); System.out.println(); } // ensure dir for nbestrealfile exists if (tester.nbestrealfile != null) { File tmp = new File(tester.nbestrealfile); File tmpParent = tmp.getParentFile(); if (tmpParent != null) tmpParent.mkdirs(); System.out.println("Writing N-best realizations to: " + tester.nbestrealfile); System.out.println(); } // ensure realserdir exists if (tester.realserdir != null) { File realserDir = new File(tester.realserdir); if (!realserDir.exists()) { realserDir.mkdirs(); } System.out.println("Writing best realization serialization files to dir: " + tester.realserdir); System.out.println(); } // ensure dir for rescorefile exists if (tester.rescorefile != null) { File tmp = new File(tester.rescorefile); File tmpParent = tmp.getParentFile(); if (tmpParent != null) tmpParent.mkdirs(); System.out.println("Writing rescored sign scores to: " + tester.rescorefile); System.out.println(); } // run test tester.runTest(new File(regressionfile)); } } ================================================ FILE: src/opennlp/ccg/test/RegressionInfo.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.synsem.*; import org.jdom.*; import org.jdom.input.*; import java.io.*; import java.util.*; /** * Manages the info in a regression test file. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.16 $, $Date: 2011/08/27 19:27:01 $ */ public class RegressionInfo { // the grammar private Grammar grammar; // the test items private TestItem[] testItems; /** Test item. */ public class TestItem { /** The test sentence/phrase. */ public String sentence; /** An alternative paraphrase to target, or null if none. */ public String alt = null; /** The desired number of parses. */ public int numOfParses = 1; /** Whether the sentence/phrase is known to fail to parse. */ public boolean knownFailure = false; /** The full words for the sentence/phrase, or null if none, formatted by the configured tokenizer. */ public String fullWords = null; /** The LF, in XML, for the sentence/phrase, or null if none. 
*/ public Element lfElt = null; /** Any additionally id info, or null if none. */ public String info = null; /** The gold std nominal id name & supertag for LF predicates. @deprecated Should use fullWords. */ public String predInfo = null; /** The gold standard derivation. */ public Sign sign = null; /** Returns the id from info, without prefixed "ID=" if present. */ public String getId() { if (info == null) return null; int pos = info.indexOf('='); return (pos < 0) ? info : info.substring(pos+1); } } /** Reads in the given regression test file and corresponding .ser file, if any. */ public RegressionInfo(Grammar grammar, File regressionFile) throws FileNotFoundException { this(grammar, new FileInputStream(regressionFile), serStream(regressionFile)); } /** Reads in a regression test from the given input stream and object input stream. */ @SuppressWarnings("unchecked") public RegressionInfo(Grammar grammar, InputStream istr, ObjectInputStream serStream) { this.grammar = grammar; SAXBuilder builder = new SAXBuilder(); try { Map signMap = readSerStream(serStream); Document doc = builder.build(istr); Element root = doc.getRootElement(); List items = root.getChildren("item"); testItems = new TestItem[items.size()]; for (int i = 0; i < items.size(); i++) { Element item = (Element) items.get(i); TestItem testItem = new TestItem(); testItems[i] = testItem; testItem.sentence = item.getAttributeValue("string"); testItem.alt = item.getAttributeValue("alt"); testItem.numOfParses = Integer.parseInt(item.getAttributeValue("numOfParses")); testItem.knownFailure = ("true".equals(item.getAttributeValue("known"))) ? true : false; Element fullWordsElt = item.getChild("full-words"); if (fullWordsElt != null) testItem.fullWords = fullWordsElt.getTextNormalize(); testItem.lfElt = item.getChild("lf"); testItem.info = item.getAttributeValue("info"); Element predInfoElt = item.getChild("pred-info"); if (predInfoElt != null) testItem.predInfo = predInfoElt.getAttributeValue("data"); if (signMap != null && testItem.info != null) testItem.sign = signMap.get(testItem.info); } } catch (Exception e) { throw (RuntimeException) new RuntimeException().initCause(e); } } /** Returns the corresponding .ser file for loading sign objects. */ public static File serFile(File regressionFile) { String name = regressionFile.getName(); String prefix = name.substring(0, name.lastIndexOf('.')); return new File(regressionFile.getParentFile(), prefix + ".ser"); } /** Returns object input stream for corresponding .ser file, or null if none. */ public static ObjectInputStream serStream(File regressionFile) { File serFile = serFile(regressionFile); if (serFile.exists()) { try { return new ObjectInputStream(new FileInputStream(serFile)); } catch (FileNotFoundException e) { throw (RuntimeException) new RuntimeException().initCause(e); } catch (IOException e) { throw (RuntimeException) new RuntimeException().initCause(e); } } else return null; } /** Reads in a map of info keys and gold standard signs from the given stream, or returns null if the stream is null. */ @SuppressWarnings("unchecked") public static Map readSerStream(ObjectInputStream serStream) throws IOException { if (serStream == null) return null; try { return (Map) serStream.readObject(); } catch (ClassNotFoundException e) { throw (RuntimeException) new RuntimeException().initCause(e); } } /** Writes the map of info keys and gold standard signs to the corresponding .ser file. 
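 *
 * A minimal sketch of the intended round trip (the sign map contents and testbed path here are
 * hypothetical, for illustration only):
 * <pre>
 * Map signMap = new HashMap();
 * signMap.put("i0", goldSign);  // gold-standard derivation for item "i0"
 * RegressionInfo.writeSerFile(signMap, new File("testbed.xml"));  // saved as testbed.ser
 * Map reloaded = RegressionInfo.readSerStream(RegressionInfo.serStream(new File("testbed.xml")));
 * </pre>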
*/ public static void writeSerFile(Map signMap, File regressionFile) throws FileNotFoundException, IOException { File serFile = serFile(regressionFile); ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(serFile)); oos.writeObject(signMap); oos.close(); } /** Returns the number of test items. */ public int numberOfItems() { return testItems.length; } /** Returns the test item with the given index. */ public TestItem getItem(int i) { return testItems[i]; } /** * Makes an XML test item from the given test item object. */ public static Element makeTestItem(TestItem testItem) { Element item = new Element("item"); item.setAttribute("numOfParses", "" + testItem.numOfParses); if (testItem.knownFailure) item.setAttribute("known", "true"); item.setAttribute("string", testItem.sentence); if (testItem.alt != null) item.setAttribute("alt", testItem.alt); if (testItem.fullWords != null) { Element fullWordsElt = new Element("full-words"); item.addContent(fullWordsElt); fullWordsElt.addContent(testItem.fullWords); } if (testItem.lfElt != null) { testItem.lfElt.detach(); item.addContent(testItem.lfElt); } if (testItem.info != null) item.setAttribute("info", testItem.info); return item; } /** * Makes an XML test item with the given string, number of parses and LF, * applying the configured to-XML transformations. */ public Element makeTestItem(String target, int numParses, LF lf) throws IOException { return makeTestItem(grammar, target, numParses, lf); } /** * Makes an XML test item with the given string, number of parses and LF, * applying the configured to-XML transformations. */ public static Element makeTestItem(Grammar grammar, String target, int numParses, LF lf) throws IOException { return makeTestItem(grammar, target, numParses, lf, null); } /** * Makes an XML test item with the given string, number of parses, LF and info attribute, * applying the configured to-XML transformations. */ public static Element makeTestItem(Grammar grammar, String target, int numParses, LF lf, String info) throws IOException { Element item = new Element("item"); item.setAttribute("numOfParses", "" + numParses); item.setAttribute("string", target); if (lf != null) item.addContent(grammar.makeLfElt(lf)); if (info != null) item.setAttribute("info", info); return item; } /** * Adds the given sign with its string, number of parses and converted LF * as a test item to the testbed with the given filename, applying the configured to-XML * transformations. 
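 *
 * For illustration, a typical call (with a hypothetical grammar, gold-standard sign and LF in
 * scope) appends one new item element to the file and records the sign under a fresh id:
 * <pre>
 * RegressionInfo.addToTestbed(grammar, sign, 1, lf, "testbed.xml");
 * </pre>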
*/ public static void addToTestbed(Grammar grammar, Sign sign, int numParses, LF lf, String filename) throws IOException { // ensure dirs exist for filename File file = new File(filename); File parent = file.getParentFile(); if (parent != null && !parent.exists()) { parent.mkdirs(); } // load or make doc Document doc; Element root; boolean newDoc = false; if (file.exists()) { // read XML SAXBuilder builder = new SAXBuilder(); try { doc = builder.build(file); } catch (JDOMException jde) { throw (IOException) new IOException().initCause(jde); } root = doc.getRootElement(); } else { doc = new Document(); root = new Element("regression"); doc.setRootElement(root); newDoc = true; } // load or make sign map Map signMap = readSerStream(serStream(file)); if (signMap == null) signMap = new HashMap(); // find unique id int count = 0; String id = "i" + count; while (signMap.containsKey(id)) id = "i" + ++count; // make test item String target = sign.getOrthography(); Element item = makeTestItem(grammar, target, numParses, lf, id); // append new item if (!newDoc) root.addContent(" "); // nb: for some reason, this gets the indenting right root.addContent(item); // add sign to map signMap.put(id, sign); // save FileOutputStream out = new FileOutputStream(file); grammar.serializeXml(doc, out); out.close(); writeSerFile(signMap, file); } } ================================================ FILE: src/opennlp/ccg/test/ScorerMaker.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; import opennlp.ccg.synsem.SignScorer; import java.io.*; /** * Interface for making custom scorers for use in cross-validation tests with the realizer. * * @author Michael White * @version $Revision: 1.6 $, $Date: 2007/12/21 05:13:37 $ */ public interface ScorerMaker { /** * Sets the context for this scorer maker. */ public void setCVR(CrossValidateRealizer cvr); /** * Prepares a scoring model from the training data. * The data can be accessed by creating a RegressionInfo * object from the given training file. * The test data is also made available to optionally * compute perplexity or other measures. * The model can be stored in a file in tmpDir, keyed off of foldNum. */ public void prepScorer(File tmpDir, int foldNum, File trainFile, File testFile) throws IOException; /** * Optionally summarizes perplexity or other measures * after all calls to prepScorer. */ public void prepScorersSummary(File tmpDir) throws IOException; /** * Loads a scoring model created from the training data. 
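 *
 * A skeletal, hypothetical implementation might train a language model in prepScorer and
 * simply reload it here, e.g.:
 * <pre>
 * public SignScorer loadScorer(File tmpDir, int foldNum, File trainFile) throws IOException {
 *     File modelFile = new File(tmpDir, "fold" + foldNum + ".lm");  // written by prepScorer
 *     return new MyFoldScorer(modelFile);  // hypothetical SignScorer implementation
 * }
 * </pre>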
*/ public SignScorer loadScorer(File tmpDir, int foldNum, File trainFile) throws IOException; } ================================================ FILE: src/opennlp/ccg/test/TimingMap.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; import java.util.*; import org.jdom.*; /** * Utility class for managing average times per number of nodes. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/12/21 04:18:31 $ */ public class TimingMap { private String label; private List<Integer> times = new ArrayList<Integer>(); private HashMap<Integer, List<Integer>> map = new HashMap<Integer, List<Integer>>(); /** Constructor, with label. */ public TimingMap(String label) { this.label = label; } /** Adds a number, time pair. */ public void add(int num, int time) { Integer timeInt = new Integer(time); times.add(timeInt); Integer key = new Integer(num); List<Integer> timesPerNum = map.get(key); if (timesPerNum == null) { timesPerNum = new ArrayList<Integer>(); map.put(key, timesPerNum); } timesPerNum.add(timeInt); } /** Returns the mean time. */ public double mean() { int total = 0; for (int i = 0; i < times.size(); i++) { Integer time = times.get(i); total += time.intValue(); } return (1.0 * total) / times.size(); } /** Returns the standard deviation. */ public double sigma() { if (times.size() < 2) return -1; // NA double mean = mean(); double numerator = 0; for (int i = 0; i < times.size(); i++) { Integer time = times.get(i); numerator += Math.pow(time.intValue() - mean, 2); } int denominator = times.size() - 1; return Math.sqrt(numerator / denominator); } /** Saves the times and times per number (with average) as XML elements under the given one.
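 *
 * A small usage sketch (the times are assumed to be in milliseconds and reportRoot to be the
 * root element of a test report):
 * <pre>
 * TimingMap parseTimes = new TimingMap("parse");
 * parseTimes.add(12, 340);  // 12 nodes, 340 ms
 * parseTimes.add(12, 310);
 * parseTimes.add(20, 905);
 * parseTimes.saveTimes(reportRoot);  // adds a times element with label "parse" under reportRoot
 * </pre>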
*/ public void saveTimes(Element root) { Element timesElt = new Element("times"); root.addContent(timesElt); timesElt.setAttribute("label", label); Element listElt = new Element("list"); timesElt.addContent(listElt); listElt.setAttribute("mean", "" + mean()); listElt.setAttribute("sigma", "" + sigma()); for (int i = 0; i < times.size(); i++) { Element timeElt = new Element("time"); listElt.addContent(timeElt); timeElt.setAttribute("val", times.get(i).toString()); } Element perNumsElt = new Element("per-nums"); timesElt.addContent(perNumsElt); Set keys = map.keySet(); List nums = new ArrayList(keys.size()); nums.addAll(keys); Collections.sort(nums); int min = nums.get(0).intValue(); int max = nums.get(nums.size()-1).intValue(); for (int num = min; num <= max; num++) { Element perNumElt = new Element("per"); perNumsElt.addContent(perNumElt); perNumElt.setAttribute("num", "" + num); Integer numKey = new Integer(num); List timesPer = map.get(numKey); if (timesPer == null) { perNumElt.setAttribute("count", "0"); continue; } int sum = 0; int count = timesPer.size(); perNumElt.setAttribute("count", "" + count); for (int i = 0; i < timesPer.size(); i++) { Integer time = (Integer) timesPer.get(i); sum += time.intValue(); Element timeElt = new Element("time"); perNumElt.addContent(timeElt); timeElt.setAttribute("val", time.toString()); } double mean = (sum * 1.0) / count; perNumElt.setAttribute("mean", "" + mean); } } } ================================================ FILE: src/opennlp/ccg/test/UpdateTestbed.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; import opennlp.ccg.grammar.*; import opennlp.ccg.lexicon.*; import org.jdom.*; import java.io.*; import java.net.*; import java.util.*; /** * Utility class to update testbed files; just adds full words based on * sign or parse at present. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/12/21 04:18:31 $ */ public class UpdateTestbed { /** The grammar. */ private Grammar grammar; /** Constructor. */ private UpdateTestbed(URL grammarURL) throws IOException { // load grammar System.out.println("Loading grammar from URL: " + grammarURL); grammar = new Grammar(grammarURL); } /** Adds full words for each test item, if missing, based on the sign or parse. 
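 *
 * This corresponds to running the updater from the command line, e.g.:
 * <pre>
 * java opennlp.ccg.test.UpdateTestbed -g grammar.xml -add-full-words testbed.xml
 * </pre>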
*/ private void addFullWords(File tbFile) throws IOException { // load testbed System.out.println("Loading testbed from: " + tbFile); RegressionInfo tbInfo = new RegressionInfo(grammar, tbFile); // create output doc Document outDoc = new Document(); Element outRoot = new Element("regression"); outDoc.setRootElement(outRoot); // update each one int numItems = tbInfo.numberOfItems(); Tokenizer tokenizer = grammar.lexicon.tokenizer; System.out.print("Adding full words "); for (int i = 0; i < numItems; i++) { RegressionInfo.TestItem testItem = tbInfo.getItem(i); if (testItem.fullWords == null) { List words = (testItem.sign != null) ? testItem.sign.getWords() : grammar.getParsedWords(testItem.sentence); testItem.fullWords = tokenizer.format(words); } outRoot.addContent(RegressionInfo.makeTestItem(testItem)); System.out.print("."); // indicate progress } System.out.println(); // save file, backing up original File tbFileBackup = new File(tbFile.getParentFile(), tbFile.getName() + "~"); System.out.println("Backing up testbed to: " + tbFileBackup); tbFile.renameTo(tbFileBackup); System.out.println("Saving results to: " + tbFile); FileOutputStream out = new FileOutputStream(tbFile); grammar.serializeXml(outDoc, out); out.close(); } /** Updates the given input file. */ public static void main(String[] args) throws IOException { String usage = "Usage: java opennlp.ccg.test.UpdateTestbed (-g ) (-add-full-words) ()"; if (args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); } // args String grammarfile = "grammar.xml"; String testbedfile = "testbed.xml"; boolean addFullWords = false; for (int i = 0; i < args.length; i++) { if (args[i].equals("-g")) { grammarfile = args[++i]; continue; } if (args[i].equals("-add-full-words")) { addFullWords = true; continue; } testbedfile = args[i]; } // create updater, check testbed exists File gFile = new File(grammarfile); URL grammarURL = gFile.toURI().toURL(); File tbFile = new File(testbedfile); if (!tbFile.exists()) { tbFile = new File(gFile.getParentFile(), testbedfile); } if (!tbFile.exists()) { System.out.println("Unable to find testbed file: " + testbedfile); System.exit(-1); } UpdateTestbed updater = new UpdateTestbed(grammarURL); // do tasks if (addFullWords) updater.addFullWords(tbFile); System.out.println("Done."); } } ================================================ FILE: src/opennlp/ccg/test/Validator.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.test; // import javax.xml.parsers.*; import org.xml.sax.*; import org.xml.sax.helpers.*; // import java.net.*; /** * Validates XML files against their declared schemas. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2005/10/20 18:49:42 $ */ public class Validator { public static void main(String[] args) throws Exception { if (args.length == 0) { System.out.println("Usage: java opennlp.ccg.test.Validator "); System.exit(1); } // configure schema validating XML parser XMLReader parser = getXercesSchemaValidatingParser(); // parse for (int i = 0; i < args.length; i++) { // System.out.println("Parsing: " + args[i]); parser.parse(args[i]); } } // NB: this requires xercesImpl.jar, but on the other hand it does not seem // possible to validate with the version of JAXP that comes with JDK 1.4.1 // (cf. JAXP sample SAXLocalNameCount.java) private static XMLReader getXercesSchemaValidatingParser() throws Exception { String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser"; XMLReader parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER_NAME); String VALIDATION_FEATURE_ID = "http://xml.org/sax/features/validation"; String SCHEMA_VALIDATION_FEATURE_ID = "http://apache.org/xml/features/validation/schema"; parser.setFeature(VALIDATION_FEATURE_ID, true); parser.setFeature(SCHEMA_VALIDATION_FEATURE_ID, true); return parser; } } ================================================ FILE: src/opennlp/ccg/unify/EmptySubstitution.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import java.util.Iterator; /** * A Substitution which does not hold any substitutions. * * @author Jason Baldridge * @version $Revision: 1.2 $, $Date: 2005/10/20 17:30:30 $ */ public class EmptySubstitution implements Substitution { public Object makeSubstitution(Variable var, Object u) throws UnifyFailure { return u; } public Object getValue(Variable var) { return null; } public Iterator varIterator() { return null; } } ================================================ FILE: src/opennlp/ccg/unify/Feature.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge and Gann Bierner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; /** * An attribute paired with a value, possibly with an index that ties * it to another feature. * * @author Jason Baldridge * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:13 $ **/ public interface Feature { public String getAttribute (); public Unifiable getValue (); public short getIndex (); public void setValue (Unifiable u); public void setIndex (short index); } ================================================ FILE: src/opennlp/ccg/unify/FeatureStructure.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-5 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import java.util.Set; /** * A feature structure containing attributes and their associated values. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/02/16 17:31:38 $ */ public interface FeatureStructure extends Unifiable, Mutable { /** * Store a attribute/value pair * * @param attribute the attribute of the feature * @param value the value of the feature */ public void setFeature(String attribute, Object value); /** * Get the value corresponding to an attribute. * * @param attribute the attribute of the feature * @return the value of the feature */ public Object getValue(String attribute); /** * Checks to see if the feature structure contains a feature with * the given attribute. 
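 *
 * For illustration, with the GFeatStruc implementation below (the feature name and value are
 * chosen arbitrarily for the example):
 * <pre>
 * FeatureStructure fs = new GFeatStruc();
 * fs.setFeature("num", Grammar.theGrammar.types.getSimpleType("sg"));
 * fs.hasAttribute("num");   // true
 * fs.hasAttribute("pers");  // false
 * </pre>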
* * @param attribute the attribute * @return if this structure contains that attribute */ public boolean hasAttribute(String attribute); /** * Checks to see if an attribute has a particular value in this structure * * @param attribute the attribute * @param value the value * @return if this structure contains that attribute/value pair */ public boolean attributeHasValue(String attribute, Object value); /** * The all attributes in set form * * @return the set of attributes */ public Set getAttributes(); /** * The number of features in this feature structure * * @return number of features in this feature structure */ public int size(); /** * Returns whether or not this feature structure contains any features * * @return whether this feature structure contains any features */ public boolean isEmpty(); /** * Explictly clear the attribute value mappings in this feature structure * * @param b the empty value */ public void clear(); /** * Makes a deep copy of this feature structure. * * @return a copy of this feature structure */ public FeatureStructure copy(); /** * Computes whether this feature structure contains (is a superset * of) another feature structure. * * @param fs the possibly contained feature structure * @return if that structure is a subset of this one */ public boolean contains(FeatureStructure fs); /** * Changes this feature structure such that all its features that * are in another feature structure are changed to have the values of * the other feature structure. This is destructive. * * @param f the feature structure to inherit from * @return the changed feature structure */ public FeatureStructure inherit(FeatureStructure fs); /** * Determines if this feature structure is exactly the same as another. This * means that for every feature, the structures have exactly the same value. * * @param f the other feature structure * @return if this structure is the same as the other */ public boolean equals(FeatureStructure fs); /** Returns the index. */ public int getIndex(); /** Sets the index. */ public void setIndex(int index); /** Returns the inheritsFrom (default unification) index. */ public int getInheritsFrom(); /** * Returns the supertag info for this feature structure. */ public String getSupertagInfo(); /** * Returns a TeX formatted feature structure. */ public String toTeX(); } ================================================ FILE: src/opennlp/ccg/unify/GFeatStruc.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-7 Jason Baldridge, Gann Bierner, Michael White and Gunes Erkan // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import opennlp.ccg.synsem.LF; import opennlp.ccg.hylo.*; import opennlp.ccg.grammar.*; import gnu.trove.*; import org.jdom.*; import java.util.*; /** * A feature structure for use with CCG categories. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @author Gunes Erkan * @version $Revision: 1.29 $, $Date: 2011/11/11 15:30:42 $ */ public class GFeatStruc extends HashMap implements FeatureStructure { private static final long serialVersionUID = 1L; boolean _empty = true; int _index = 0; int _inheritsFrom = 0; public GFeatStruc() { super(3); } public GFeatStruc(int i) { super(i); } @SuppressWarnings("unchecked") public GFeatStruc(Element fsEl) { super(fsEl.getChildren().size()); String index = fsEl.getAttributeValue("id"); if (index != null) { _index = Integer.parseInt(index); } String inheritsFrom = fsEl.getAttributeValue("inheritsFrom"); if (inheritsFrom != null) { _inheritsFrom = Integer.parseInt(inheritsFrom); } List feats = fsEl.getChildren(); if (feats.size() == 0) { setFeature(fsEl); } else { for (Iterator featIt=feats.iterator(); featIt.hasNext();) { setFeature((Element)featIt.next()); } } } public Element toXml() { Element retval = new Element("fs"); if (_index > 0) retval.setAttribute("id", Integer.toString(_index)); if (_inheritsFrom > 0) retval.setAttribute("inheritsFrom", Integer.toString(_inheritsFrom)); List keys = new ArrayList(keySet()); Collections.sort(keys); if (size() == 1 && get(keys.get(0)) instanceof SimpleType) { String attr = keys.get(0); SimpleType val = (SimpleType) get(attr); retval.setAttribute("attr", attr); retval.setAttribute("val", val.getName()); } else { for (String attr : keys) { Element featElt = new Element("feat"); featElt.setAttribute("attr", attr); retval.addContent(featElt); Object val = get(attr); if (val instanceof SimpleType) featElt.setAttribute("val", ((SimpleType) val).getName()); else { if (val instanceof GFeatVar) { GFeatVar var = (GFeatVar) val; Element varElt = new Element("featvar"); featElt.addContent(varElt); String name = var.name(); String typeName = var.getType().getName(); if (!typeName.equals(Types.TOP_TYPE)) name += ":" + typeName; varElt.setAttribute("name", name); } else if (val instanceof LF) featElt.addContent(HyloHelper.toXml((LF)val)); else throw new RuntimeException("Unsupported feature value type in constructing XML: " + val); } } } return retval; } public void deepMap(ModFcn mf) { for (Iterator attributes=keySet().iterator(); attributes.hasNext();) { Object val1 = getValue(attributes.next()); if (val1 instanceof Mutable) { ((Mutable)val1).deepMap(mf); } } mf.modify(this); } public void setFeature(String attribute, Object val) { put(attribute, val); _empty = false; } private void setFeature(Element e) { String attr = e.getAttributeValue("attr"); if (attr == null) attr = e.getAttributeValue("a"); if (attr == null) { return; } String val = e.getAttributeValue("val"); if (val == null) val = e.getAttributeValue("v"); Object value; if (val != null) { value = Grammar.theGrammar.types.getSimpleType(val); } else { Element valEl = (Element)e.getChildren().get(0); if (valEl.getName().equals("featvar") || valEl.getName().equals("fvar")) { String[] name = valEl.getAttributeValue("name").split(":"); if (name[0]==null) name = valEl.getAttributeValue("n").split(":",2); if (name.length<2) { value = new GFeatVar(name[0]); } else value = new GFeatVar(name[0], Grammar.theGrammar.types.getSimpleType(name[1])); } 
else { value = HyloHelper.getLF((Element)e.getChildren().get(0)); } } setFeature(attr, value); } public Object getValue(String attribute) { return get(attribute); } public boolean hasAttribute(String attribute) { return containsKey(attribute); } public boolean attributeHasValue(String attribute, Object val) { return val.equals(getValue(attribute)); } public Set<String> getAttributes() { return keySet(); } public void clear() { super.clear(); _empty = true; } /** Returns true iff this feature structure has the same index and set of attr-val pairs. */ public boolean equals(FeatureStructure fs) { if (!(fs instanceof GFeatStruc)) return false; GFeatStruc bfs = (GFeatStruc)fs; if (_index != bfs._index) return false; if (size() != bfs.size()) return false; Set<String> atts1 = getAttributes(); Set<String> atts2 = bfs.getAttributes(); if (!atts1.containsAll(atts2)) return false; for (Iterator<String> it = atts1.iterator(); it.hasNext(); ) { String att = it.next(); if (!getValue(att).equals(bfs.getValue(att))) return false; } return true; } /** Returns a hash code consistent with equals. */ public int hashCode() { return super.hashCode() + _index; } public FeatureStructure copy() { GFeatStruc $fs = new GFeatStruc(size()); $fs.setIndex(_index); $fs._inheritsFrom = _inheritsFrom; for (Iterator<String> i=getAttributes().iterator(); i.hasNext();) { String a = i.next(); $fs.setFeature(a, UnifyControl.copy(getValue(a))); } return $fs; } public boolean contains(FeatureStructure fs) { if (size() < fs.size()) return false; Set<String> atts1 = getAttributes(); Set<String> atts2 = fs.getAttributes(); if (atts1.containsAll(atts2)) { for (Iterator<String> i2 = atts2.iterator(); i2.hasNext();) { String a2 = i2.next(); boolean foundA2 = false; for (Iterator<String> i1 = atts1.iterator(); !foundA2 && i1.hasNext();) { String a1 = i1.next(); if (a1.equals(a2)) { if (!getValue(a1).equals(fs.getValue(a2))) return false; foundA2 = true; } } } return true; } else { return false; } } public boolean occurs(Variable v) { for (Iterator i = values().iterator(); i.hasNext();) { Object $_ = i.next(); if ($_ instanceof Unifiable && ((Unifiable)$_).occurs(v)) return true; } return false; } public void unifyCheck(Object u) throws UnifyFailure { if (!(u instanceof FeatureStructure)) { throw new UnifyFailure(); } // look for incompatible string-valued features FeatureStructure fs2 = (FeatureStructure)u; Set<String> keys1 = getAttributes(); for (Iterator<String> i1=keys1.iterator(); i1.hasNext();) { String k1 = i1.next(); Object val1 = getValue(k1); if (!(val1 instanceof SimpleType)) continue; Object val2 = fs2.getValue(k1); if (!(val2 instanceof SimpleType)) continue; ((SimpleType)val1).unifyCheck(val2); } } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (!(u instanceof FeatureStructure)) { throw new UnifyFailure(); } FeatureStructure fs2 = (FeatureStructure)u; FeatureStructure $fs = new GFeatStruc(size()); Set<String> keys1 = getAttributes(); Set<String> keys2 = fs2.getAttributes(); for (Iterator<String> i1=keys1.iterator(); i1.hasNext();) { String k1 = i1.next(); Object val1 = getValue(k1); Object val2 = fs2.getValue(k1); if (val2 != null) { $fs.setFeature(k1, Unifier.unify(val1, val2, sub)); } else { $fs.setFeature(k1, UnifyControl.copy(val1)); } } for (Iterator<String> i2=keys2.iterator(); i2.hasNext();) { String k2 = i2.next(); if (!keys1.contains(k2)) $fs.setFeature(k2, UnifyControl.copy(fs2.getValue(k2))); } int fs2Index = fs2.getIndex(); int newIndex = 0; if (_index == 0) { newIndex = fs2Index; } else if (fs2Index == 0) { newIndex = _index; } else if (sub instanceof GSubstitution) { newIndex =
((GSubstitution)sub).makeNewIndex(_index, fs2Index); } $fs.setIndex(newIndex); if (sub instanceof GSubstitution && newIndex > 0) { ((GSubstitution)sub).addIndexedObject(newIndex,$fs); } return $fs; } public Object fill(Substitution sub) throws UnifyFailure { FeatureStructure $fs = copy(); for (Iterator i = $fs.getAttributes().iterator(); i.hasNext();) { String a = i.next(); Object value = getValue(a); if (value instanceof Variable) { Object varVal = sub.getValue((Variable)value); if (null != varVal) { $fs.setFeature(a, Unifier.unify(value,varVal,sub)); } } } if (_index > 0 && sub instanceof GSubstitution) { FeatureStructure otherVals = (FeatureStructure)((GSubstitution)sub).getIndexedObject(_index); if (null != otherVals) { if (!$fs.equals(otherVals)) { $fs = (FeatureStructure)$fs.unify(otherVals, sub); $fs.setIndex(otherVals.getIndex()); } } } return $fs; } public FeatureStructure inherit(FeatureStructure fs) { FeatureStructure $fs = copy(); for (Iterator i = fs.getAttributes().iterator(); i.hasNext();) { String a = i.next(); $fs.setFeature(a, UnifyControl.copy(fs.getValue(a))); } return $fs; } public int getIndex() { return _index; } public void setIndex(int index) { _index = index; } public int getInheritsFrom() { return _inheritsFrom; } private void addFeatureString(String attribute, StringBuffer sb) { Object val = getValue(attribute); sb.append(attribute).append('=').append(val.toString()); } public String toString() { // if (_empty) return ""; StringBuffer sb = new StringBuffer(size()*4); if (_index > 0) { sb.append('<'); sb.append(_index); sb.append('>'); } if (_empty) return sb.toString(); String featsToShow = Grammar.theGrammar.prefs.featsToShow; sb.append('{'); List filteredKeys = new ArrayList(size()); if (featsToShow.length() == 0) { filteredKeys.addAll(keySet()); } else { for (Iterator it = keySet().iterator(); it.hasNext(); ) { String key = it.next(); if (featsToShow.indexOf(key) != -1) filteredKeys.add(key); } } String[] keys = new String[filteredKeys.size()]; filteredKeys.toArray(keys); Arrays.sort(keys); for (int i=0; i < keys.length; i++) { addFeatureString(keys[i], sb); if (i < keys.length - 1) sb.append(", "); } sb.append('}'); return sb.toString(); } /** * Returns the supertag info for this feature structure. * In particular, returns the values of any non-variable * features of interest, within square brackets. * The features of interest are configurable * at the grammar level. */ public String getSupertagInfo() { if (_empty) return ""; StringBuffer sb = new StringBuffer(); ArrayList attrs = new ArrayList(getAttributes()); Collections.sort(attrs); Set supertagFeatures = Grammar.theGrammar.supertagFeatures; for (int i = 0; i < attrs.size(); i++) { String attr = attrs.get(i); if (!supertagFeatures.contains(attr)) continue; Object val = getValue(attr); if (val instanceof Variable) continue; String s = val.toString(); if (s.equals("+") || s.equals("-")) s = s + attr; sb.append('[').append(s).append(']'); } return sb.toString(); } private void addFeatureTeX(String attribute, StringBuffer sb) { Object val = getValue(attribute); String s = cleanText(val.toString()); if (s.equals("+") || s.equals("-")) s = attribute + s; sb.append(" ").append(s); } // makes sure every special character is handled correctly in LaTeX private String cleanText(String s) { String str = s; try { //order matters!! 
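// the backslash itself is rewritten first, since all of the later replacements introduce backslashes of their own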
str = str.replaceAll("\\\\", " \\\\\\backslash "); str = str.replaceAll("\\{", " \\\\\\{ "); str = str.replaceAll("\\}", " \\\\\\} "); str = str.replaceAll("\\$", " \\\\\\$ "); str = str.replaceAll("\\#", " \\\\\\# "); str = str.replaceAll("\\%", " \\\\\\% "); str = str.replaceAll("\\&", " \\\\\\& "); str = str.replaceAll("\\~", " \\\\\\tilde\\{\\} "); str = str.replaceAll("\\_", " \\\\\\_ "); str = str.replaceAll("\\^", " \\\\\\hat\\{\\} "); } catch (Exception e) { System.out.println("Error while evaluating RegExp: " + e.toString()); } return str; } public String toTeX() { StringBuffer sb = new StringBuffer(); if ((_index > 0)&&(_empty)) { sb.append(" \\subsf{ < "); sb.append(_index); sb.append(" > } "); } if (_empty) return sb.toString(); String featsToShow = Grammar.theGrammar.prefs.featsToShow; sb.append(" \\subsf{ "); if ((_index > 0)) { sb.append(" < "); sb.append(_index); sb.append(" > "); } List filteredKeys = new ArrayList(size()); if (featsToShow.length() == 0) { filteredKeys.addAll(keySet()); } else { for (Iterator it = keySet().iterator(); it.hasNext(); ) { String key = it.next(); if (featsToShow.indexOf(key) != -1) filteredKeys.add(key); } } String[] keys = new String[filteredKeys.size()]; filteredKeys.toArray(keys); Arrays.sort(keys); for (int i=0; i < keys.length; i++) { addFeatureTeX(keys[i], sb); if (i < keys.length - 1) sb.append(" , "); } sb.append(" } "); return sb.toString(); } /** * Returns a hash code using the given map from vars to ints, * to allow for equivalence up to variable names. */ public int hashCode(TObjectIntHashMap varMap) { int retval = 0; // nb: treat index as a regular var if (_index != 0) { // see if index already in map if (varMap.containsKey(_index)) retval = varMap.get(_index); // otherwise add it else { int next = varMap.size() + 1; varMap.put(_index, next); retval = next; } } // otherwise treat missing index as unique, keyed to negative identity hash else { int next = varMap.size() + 1; varMap.put(-1 * Math.abs(System.identityHashCode(this)), next); retval = next; } if (_empty) { return retval; } // sort keys Set keySet = keySet(); String[] keys = new String[keySet.size()]; keySet.toArray(keys); Arrays.sort(keys); // do each key for (int i=0; i atts1 = getAttributes(); Set atts2 = fs.getAttributes(); if (!atts1.containsAll(atts2)) return false; for (Iterator it = atts1.iterator(); it.hasNext(); ) { String att = it.next(); Object val = getValue(att); Object val2 = fs.getValue(att); if (val instanceof Variable && val2 instanceof Variable) { if (!((Variable)val).equals(val2, varMap, varMap2)) return false; } else { if (!val.equals(val2)) return false; } } return true; } } ================================================ FILE: src/opennlp/ccg/unify/GFeatVar.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-7 Jason Baldridge, Gunes Erkan and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import java.io.Serializable; import gnu.trove.TObjectIntHashMap; import opennlp.ccg.grammar.*; /** * A class for variables which can stand for any feature. * * @author Jason Baldridge * @author Gunes Erkan * @author Michael White * @version $Revision: 1.9 $, $Date: 2009/07/17 04:23:30 $ **/ public class GFeatVar implements Variable, Indexed, Mutable, Serializable { private static final long serialVersionUID = -5526887599728099988L; protected final String _name; protected int _index; protected int _hashCode; protected SimpleType type; public GFeatVar(String name) { this(name, 0, null); } public GFeatVar(String name, SimpleType st) { this(name, 0, st); } protected GFeatVar(String name, int index, SimpleType st) { _name = name; _index = index; type = (st != null) ? st : Grammar.theGrammar.types.getSimpleType(Types.TOP_TYPE); _hashCode = _name.hashCode() + _index + type.getIndex(); } public String name() { return _name; } public Object copy() { return new GFeatVar(_name, _index, type); } public void deepMap(ModFcn mf) { mf.modify(this); } public int getIndex() { return _index; } public SimpleType getType() { return type; } public void setIndex(int index) { _hashCode += index - _index; _index = index; } public boolean occurs(Variable var) { return equals(var); } public int hashCode() { return _hashCode; } public boolean equals(Object o) { if (this == o) return true; if (!(o instanceof GFeatVar)) return false; GFeatVar var = (GFeatVar) o; return _index == var._index && _name.equals(var._name) && type.equals(var.type); } /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap) { // see if this already in map if (varMap.containsKey(this)) return varMap.get(this); // otherwise add it int next = varMap.size() + 1; varMap.put(this, next); return next; } /** * Returns whether this var equals the given object up to variable names, * using the given maps from vars to ints. * (Note that the name and index may differ, but the types must be equal.) 
*/ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2) { if (this == obj) return true; if (obj.getClass() != this.getClass()) { return false; } GFeatVar gv = (GFeatVar) obj; if (varMap.get(this) != varMap2.get(gv)) return false; if (!this.type.equals(gv.type)) return false; return true; } public void unifyCheck(Object o) throws UnifyFailure {} public Object unify(Object u, Substitution sub) throws UnifyFailure { if (equals(u)) { return this; } else if (u instanceof SimpleType) { SimpleType st1 = getType(); SimpleType st2 = (SimpleType)u; return sub.makeSubstitution(this, st2.unify(st1, sub)); } else if (u instanceof GFeatVar) { GFeatVar var = (GFeatVar) u; if (var.occurs(this)) throw new UnifyFailure(); SimpleType st1 = getType(); SimpleType st2 = var.getType(); SimpleType st3 = (SimpleType) st2.unify(st1, sub); // substitute var with most specific type if (st3.equals(st2)) return sub.makeSubstitution(this, var); else if (st3.equals(st1)) return sub.makeSubstitution(var, this); else { // need a new var with intersection type GFeatVar var3 = new GFeatVar(_name, UnifyControl.getUniqueVarIndex(), st3); sub.makeSubstitution(var, var3); return sub.makeSubstitution(this, var3); } } else { return sub.makeSubstitution(this, u); } } public Object fill(Substitution sub) throws UnifyFailure { Object val = sub.getValue(this); if (val != null) { return val; } else { return this; } } public String toString() { String retval = _name; if (!type.getName().equals(Types.TOP_TYPE)) retval += ":" + type.getName(); return retval; } } ================================================ FILE: src/opennlp/ccg/unify/GSubstitution.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge, University of Edinburgh (Michael White) // and Gunes Erkan // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import gnu.trove.*; import java.util.*; /** * Implementation of Substitution interface which ensures that all * the categories it contains are updated as new substitutions are * made. * * @author Jason Baldridge * @author Michael White * @author Gunes Erkan * @version $Revision: 1.13 $, $Date: 2009/12/21 03:27:19 $ */ public class GSubstitution extends THashMap implements Substitution { private static final long serialVersionUID = 1L; private TIntObjectHashMap _indexedObjects = new TIntObjectHashMap(); private TIntIntHashMap _newFeatStrucIndexes = new TIntIntHashMap(); /** * Request the Substitution to identify a variable with an * object. Automagically condenses the Substitution so that all * other values in this Substitution get the new value for the * variable if they contain it. 
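 *
 * A small sketch of the condensing behaviour (the variables and value below are hypothetical):
 * <pre>
 * GSubstitution sub = new GSubstitution();
 * sub.makeSubstitution(x, y);      // x and y now co-vary
 * sub.makeSubstitution(y, val);    // both x and y resolve to val
 * Object xVal = sub.getValue(x);   // val
 * </pre>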
* * @param var the variable whose value has been determined * @param o the Object identified with the variable * @return the Object identified with the variable, which has * potentially undergone further unifications as a result of * making the substitution * @exception throws UnifyFailure if the Object cannot be unified * with a previous value substituted for the Variable. */ @SuppressWarnings("unchecked") public Object makeSubstitution(Variable var, Object u) throws UnifyFailure { Object val1 = getValue(var); if (u instanceof Variable) { Variable var2 = (Variable)u; Object val2 = getValue(var2); // check if var -> u already if (val1 != null && val1.equals(u)) return u; // check if u -> var already if (val2 != null && val2.equals(var)) return var; // otherwise continue if (val1 != null) { if (val1 instanceof Unifiable && ((Unifiable)val1).occurs(var2)) { throw new UnifyFailure(); } if (val2 != null) { u = Unifier.unify(var, val2, this); } else { u = makeSubstitution(var2, val1); } } else if (val2 != null) { if (val2 instanceof Unifiable && ((Unifiable)val2).occurs(var)) { throw new UnifyFailure(); } makeSubstitution(var, val2); } } else if (val1 != null) { u = Unifier.unify(val1, u, this); } put(var, u); for (Iterator i=keySet().iterator(); i.hasNext();) { Variable v = (Variable)i.next(); Object res = getValue(v); if (res instanceof Unifiable) { res = ((Unifiable)res).fill(this); } put(v, res); } if (u instanceof Unifiable) { u = ((Unifiable)u).fill(this); } return u; } /** * Try to get the value of a variable from this Substitution. * Returns null if the variable is unknown to the Substitution. * * @param var the variable whose value after unification is desired * @return the Object which this variable has been unified with */ public Object getValue(Variable var) { Object val = get(var); if (null != val) { if (val instanceof Variable) { Object deepVal = getValue((Variable)val); if (null != deepVal) { val = deepVal; } } } return val; } @SuppressWarnings("unchecked") public Iterator varIterator() { return keySet().iterator(); } public int makeNewIndex(int fs1Index, int fs2Index) { int index = UnifyControl.getUniqueFeatureStructureIndex(); int fs1IndexUpdated = getUpdatedIndex(fs1Index); int fs2IndexUpdated = getUpdatedIndex(fs2Index); addReindex(fs1IndexUpdated, index); addReindex(fs2IndexUpdated, index); return index; } public void addReindex(int oldIndex, int newIndex) { // avoid creating a pointer cycle if (oldIndex == newIndex) return; if (_newFeatStrucIndexes.containsKey(newIndex)) { throw new RuntimeException( "Whoops! 
Index map already contains newIndex: " + newIndex + "\n" + this ); } _newFeatStrucIndexes.put(oldIndex, newIndex); } public int getUpdatedIndex(int oldIndex) { if (!_newFeatStrucIndexes.containsKey(oldIndex)) return oldIndex; return getUpdatedIndex(_newFeatStrucIndexes.get(oldIndex)); } public void addIndexedObject(int index, Object o) { _indexedObjects.put(index, o); } public Object getIndexedObject(int index) { return _indexedObjects.get(getUpdatedIndex(index)); } public void condense() throws UnifyFailure { int[] keys = _indexedObjects.keys(); for (int i=0; i < keys.length; i++) { Object obj = _indexedObjects.get(keys[i]); if (obj instanceof Unifiable) { Object filled = ((Unifiable)obj).fill(this); _indexedObjects.put(keys[i], filled); } } // drop old indexed objects for (int i = 0; i < keys.length; i++) { if (_newFeatStrucIndexes.containsKey(keys[i])) { _indexedObjects.remove(keys[i]); } } } @SuppressWarnings("unchecked") public String toString() { StringBuffer sb = new StringBuffer(); sb.append("vars: \t"); for (Iterator keys=keySet().iterator(); keys.hasNext();) { Object key = keys.next(); sb.append(key).append('=').append(get(key)).append('\t'); } sb.append('\n'); sb.append("indexes: \t"); int indexKeys[] = _newFeatStrucIndexes.keys(); for (int i = 0; i < indexKeys.length; i++) { sb.append(indexKeys[i] + "->" + _newFeatStrucIndexes.get(indexKeys[i]) + "\t"); } return sb.toString(); } } ================================================ FILE: src/opennlp/ccg/unify/GUnifier.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import opennlp.ccg.synsem.*; /** * A unifier for CCG categories. 
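 *
 * Typical usage is a single static call on two categories (here hypothetical ones obtained
 * from the lexicon or an earlier derivation step):
 * <pre>
 * Category result = GUnifier.unify(cat1, cat2);  // throws UnifyFailure if they do not unify
 * </pre>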
* * @author Jason Baldridge * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:13 $ */ public class GUnifier { public static Category unify (Category c1, Category c2) throws UnifyFailure { return (Category)unify(c1, c2, new EmptySubstitution()); } public static Category unify (Category c1, Category c2, Substitution sub) throws UnifyFailure { if (c1 instanceof AtomCat && c2 instanceof ComplexCat) { c2.unifyCheck(c1); return (Category)c2.unify(c1, sub); } c1.unifyCheck(c2); return (Category)Unifier.unify(c1, c2, sub); } } ================================================ FILE: src/opennlp/ccg/unify/Indexed.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; /** * An object which can be indexed. * * @author Jason Baldridge * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:13 $ **/ public interface Indexed { /** * Gets the index of this Indexed object. * * @return the index **/ public int getIndex (); /** * Sets the index of this Indexed object. * * @param index An int which provides a unique index **/ public void setIndex (int index); } ================================================ FILE: src/opennlp/ccg/unify/ModFcn.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; /** * Interface for functions which modify Mutable objects. * * @author Jason Baldridge * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:13 $ */ public interface ModFcn { /** * Changes a Mutable object in some way. 
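 *
 * A ModFcn is normally handed to deepMap on a Mutable; for example, a hypothetical function
 * that resets the index of every Indexed object in a structure:
 * <pre>
 * ModFcn resetIndexes = new ModFcn() {
 *     public void modify(Mutable m) {
 *         if (m instanceof Indexed) ((Indexed) m).setIndex(0);
 *     }
 * };
 * fs.deepMap(resetIndexes);  // fs is some Mutable, e.g. a feature structure
 * </pre>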
* * @param mf a function to be applied **/ public void modify (Mutable m); } ================================================ FILE: src/opennlp/ccg/unify/Mutable.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; /** * Interface for objects which might contain recursive structure and are * mutable. * * @author Jason Baldridge * @version $Revision: 1.1.1.1 $, $Date: 2003/02/28 18:02:13 $ */ public interface Mutable { /** * Applies a ModFcn to this Mutable and then applies it to all fields * which are themselves Mutables. * * @param mf a function to be applied **/ public void deepMap (ModFcn mf); } ================================================ FILE: src/opennlp/ccg/unify/SelfCondensingSub.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import java.util.*; /** * Implementation of Substitution interface which ensures that all the * categories it contains are updated as new substitutions are made. * * @author Jason Baldridge * @version $Revision: 1.3 $, $Date: 2009/12/21 03:27:19 $ */ public class SelfCondensingSub extends HashMap implements Substitution { private static final long serialVersionUID = 1L; /** * Request the Substitution to identify a variable with an object. * Automagically condenses the Substitution so that all other values in this * Substitution get the new value for the variable if they contain it. 
* * @param var * the variable whose value has been determined * @param o * the Object identified with the variable * @return the Object identified with the variable, which has potentially * undergone further unifications as a result of making the * substitution * @exception throws * UnifyFailure if the Object cannot be unified with a * previous value substituted for the Variable. */ public Object makeSubstitution(Variable var, Object u) throws UnifyFailure { Object val1 = getValue(var); if (u instanceof Variable) { Variable var2 = (Variable) u; Object val2 = getValue(var2); if (val1 != null) { if (val2 != null) u = Unifier.unify(var, val2, this); else u = makeSubstitution(var2, val1); } else { if (val2 != null) makeSubstitution(var, val2); else put(var, var2); } } else if (val1 != null) { u = Unifier.unify(val1, u, this); } put(var, u); for (Iterator i = keySet().iterator(); i.hasNext();) { Variable v = i.next(); Object res = getValue(v); if (res instanceof Unifiable) { res = ((Unifiable) res).fill(this); } put(v, res); } if (u instanceof Unifiable) { u = ((Unifiable) u).fill(this); } return u; } /** * Try to get the value of a variable from this Substitution. Returns null * if the variable is unknown to the Substitution. * * @param var * the variable whose value after unification is desired * @return the Object which this variable has been unified with */ public Object getValue(Variable var) { return get(var); } public Iterator varIterator() { return keySet().iterator(); } } ================================================ FILE: src/opennlp/ccg/unify/SimpleSubstitution.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-5 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import java.util.*; /** * Simple implementation of Substitution interface. 
* * @author Jason Baldridge * @version $Revision: 1.3 $, $Date: 2005/10/20 17:30:30 $ */ public class SimpleSubstitution extends HashMap implements Substitution { private static final long serialVersionUID = 1L; public SimpleSubstitution() {} public SimpleSubstitution(Map map) { super(map); } public Object makeSubstitution(Variable var, Object u) throws UnifyFailure { if (u instanceof Unifiable) { u = ((Unifiable) u).fill(this); } put(var, u); return u; } public Object getValue(Variable var) { return get(var); } public Iterator varIterator() { return keySet().iterator(); } } ================================================ FILE: src/opennlp/ccg/unify/SimpleType.java ================================================ /////////////////////////////////////////////////////////////////////////////// //// Copyright (C) 2003-9 Gunes Erkan and Michael White //// //// This library is free software; you can redistribute it and/or //// modify it under the terms of the GNU Lesser General Public //// License as published by the Free Software Foundation; either //// version 2.1 of the License, or (at your option) any later version. //// //// This library is distributed in the hope that it will be useful, //// but WITHOUT ANY WARRANTY; without even the implied warranty of //// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //// GNU Lesser General Public License for more details. //// //// You should have received a copy of the GNU Lesser General Public //// License along with this program; if not, write to the Free Software //// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. //////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import opennlp.ccg.grammar.*; import java.io.ObjectStreamException; import java.io.Serializable; import java.util.*; /** * A simple type for feature values in CCG categories. * * Note that during deserialization, the type is resolved using the current grammar. 
* * @author Gunes Erkan * @author Michael White * @version $Revision: 1.8 $, $Date: 2009/07/17 04:23:30 $ */ public class SimpleType implements Unifiable, Serializable { private static final long serialVersionUID = 7028285176993549672L; private int index; private String name; private BitSet bitset; private BitSet tempBitset = new BitSet(); private transient Types types; public SimpleType(int i, String n, BitSet bs, Types t) { index = i; name = n; bitset = bs; types = t; } public int getIndex() { return index; } public BitSet getBitSet() { return bitset; } public String getName() { return name; } public String toString() { return name; } public void unifyCheck(Object u) throws UnifyFailure { if (!(u instanceof SimpleType)) { throw new UnifyFailure(); } } public Object unify(Object u, Substitution sub) throws UnifyFailure { if (!(u instanceof SimpleType)) { throw new UnifyFailure(); } if (this == u) return this; SimpleType st2 = (SimpleType) u; tempBitset.clear(); tempBitset.or(bitset); tempBitset.and(st2.getBitSet()); int resultTypeIndex = tempBitset.nextSetBit(0); if (resultTypeIndex == -1) { throw new UnifyFailure(); } return types.getIndexMap().get(resultTypeIndex); } public Object fill(Substitution s) throws UnifyFailure { return this; } public boolean occurs(Variable v) { return false; } public int hashCode() { return index; } public boolean equals(Object o) { if (!(o instanceof SimpleType)) return false; if (index == ((SimpleType)o).getIndex()) return true; else return false; } /** Returns canonical version of deserialized type based on current grammar. */ public Object readResolve() throws ObjectStreamException { return Grammar.theGrammar.types.getSimpleType(name); } } ================================================ FILE: src/opennlp/ccg/unify/Substitution.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge and Gann Bierner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import java.util.*; /** * Specifies how variable are to be replaced to make two objects unify. * * @author Gann Bierner & Jason Baldridge * @version $Revision: 1.2 $, $Date: 2005/10/20 17:30:30 $ */ public interface Substitution { /** * Request the Substitution to identify a variable with an object. * * @param var * the variable whose value has been determined * @param o * the Object identified with the variable * @return the Object identified with the variable, which has potentially * undergone further unifications as a result of making the * substitution * @exception throws * UnifyFailure if the Object cannot be unified with a * previous value substituted for the Variable. 
*/ public Object makeSubstitution(Variable var, Object u) throws UnifyFailure; /** * Try to get the value of a variable from this Substitution. Should return * null if the variable is unknown to the Substitution. * * @param var * the variable whose value after unification is desired * @return the Object which this variable has been unified with */ public Object getValue(Variable var); public Iterator varIterator(); } ================================================ FILE: src/opennlp/ccg/unify/Unifiable.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-5 Jason Baldridge and Gann Bierner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; /** * An interface for classes that may be unified. * * @author Gann Bierner * @version $Revision: 1.2 $, $Date: 2005/10/19 21:27:16 $ */ public interface Unifiable { /** * Determines if a Variable occurs within this Unifiable * * @param v * the Variable to check for * @return whether or not the Variable occurs */ public boolean occurs(Variable v); /** * Tests for equality with the given Object. * * @param o * object to test for equality * @return true if this Unifiable is equal to o, false if * not. */ public boolean equals(Object o); /** * Unify this Unfiable with another Object. * * @param o * object to unify with * @param s * Substitution containing the variable resolutions * @exception UnifyFailure * if this Unifiable cannot be unified with the Object * @return an object which represents the unification of this Unifiable with * the Object */ public Object unify(Object u, Substitution s) throws UnifyFailure; /** * Check if this Unifiable can unify with another Object. This should be * implemented as a quick check to allow users of the Unifiable to scan a * group of Unifications to rapidly see if the entire group is at least * possible before descending into each one with a full unification * procedure. Thus, if a call to this method does not result in a * UnifyFailure exception being thrown, it doesn't mean that the Object can * definitely be unified with this Unifiable -- what is important is that * when a call to this method throws a UnifyFailure exception, it permits * one to avoid calling the unify() method on other Unifiables in a group * because the quick check failed on this one. * * @param o * object to check for unifiability * @exception UnifyFailure * if this Unifiable cannot be unified with the Object */ public void unifyCheck(Object u) throws UnifyFailure; /** * Replaces any variables in this Unifiable with the values found for them * in the Substitution argument. 
* * @param s * Substitution containing the variable resolutions * @return a copy of this Unifiable with all variables from the Substitution * replaced by their values. */ public Object fill(Substitution s) throws UnifyFailure; } ================================================ FILE: src/opennlp/ccg/unify/Unifier.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002 Jason Baldridge and Gann Bierner // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; /** * A unification utility that abstracts a few basic issues such * Variables and not needed to pass a substitution object explictly. * * @author Jason Baldridge * @version $Revision: 1.2 $, $Date: 2004/11/11 17:50:13 $ **/ public class Unifier { /** * Uses a SelfCondensingSub underlyingly so that it * is not necessary to pass a substitution object explictly. * * @param u1 the first of two Unifiables to unify * @param u2 the second of two Unifiables to unify * @return the result of unifying u1 and u2 **/ public static final Object unify (Object u1, Object u2) throws UnifyFailure { Substitution sub = new SelfCondensingSub(); Object result = unify(u1, u2, sub); if (result instanceof Unifiable) { result = ((Unifiable)result).fill(sub); } return result; } /** * Method which handles ordering to make sure that the Unifiable * unify() method is called on the Variable if either of the * arguments is a Variable. This way, under a unification scheme * for a set of classes, you don't have to have each Unifiable * check to see if the thing it is trying to be unified with is a * Variable. * * @param u1 the first of two Unifiables to unify * @param u2 the second of two Unifiables to unify * @param sub the substitution object holding global unification * information * @return the result of unifying u1 and u2 **/ public static final Object unify (Object u1, Object u2, Substitution sub) throws UnifyFailure { // !!!!!!!!!!!!!!!!!!!!!!!! CAUTION !!!!!!!!!!!!!!!!!!!!!!!! // the order of this if-else statement is important, so be // careful before you change it! // !!!!!!!!!!!!!!!!!!!!!!!! CAUTION !!!!!!!!!!!!!!!!!!!!!!!! 
if (u2 instanceof Variable) { return ((Variable)u2).unify(u1, sub); } else if (u1 instanceof Unifiable) { return ((Unifiable)u1).unify(u2, sub); } else if (u2 instanceof Unifiable) { return ((Unifiable)u2).unify(u1, sub); } else if (u1.equals(u2)) { return u1; } else { throw new UnifyFailure(); } } } ================================================ FILE: src/opennlp/ccg/unify/UnifyControl.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import opennlp.ccg.synsem.*; import gnu.trove.*; /** * Center of command for the unification process. * Needs work to allow for multithreading. * * @author Jason Baldridge * @author Michael White * @version $Revision: 1.6 $, $Date: 2005/10/20 17:30:30 $ */ public class UnifyControl { /** * An integer used to keep variables unique in lexical items. */ private static int _varIndex = 0; /** * A function that makes variables unique. */ private static ModFcn uniqueFcn = new ModFcn() { public void modify (Mutable m) { if (m instanceof Indexed && m instanceof Variable) { ((Indexed)m).setIndex(_varIndex); } }}; /** * An integer used to keep feature structure indexes unique. */ private static int _fsIndex = 1; private static TIntIntHashMap _reindexed = new TIntIntHashMap(); private static CategoryFcn indexFcn = new CategoryFcnAdapter() { public void forall (Category c) { FeatureStructure fs = c.getFeatureStructure(); if (fs != null) { int index = fs.getIndex(); if (index > 0) { int $index = _reindexed.get(index); if ($index == 0) { $index = _fsIndex++; _reindexed.put(index, $index); } fs.setIndex($index); } } } }; /** Resets the uniqueness counters. */ public static void startUnifySequence() { _varIndex = 0; _fsIndex = 1; } /** Sets the var and feature structure indices to unique values. */ public static void reindex(Category cat) { reindex(cat, null); } /** Sets the var and feature structure indices to unique values. 
*/ public static void reindex(Category cat, Category anotherCat) { _reindexed.clear(); cat.forall(indexFcn); cat.deepMap(uniqueFcn); if (cat != anotherCat && anotherCat != null) { anotherCat.forall(indexFcn); anotherCat.deepMap(uniqueFcn); } _varIndex++; } public static int getUniqueVarIndex() { return ++_varIndex; } public static int getUniqueFeatureStructureIndex() { return ++_fsIndex; } public static Object copy(Object o) { if (o instanceof Category) { return ((Category)o).copy(); } else if (o instanceof GFeatVar) { return ((GFeatVar)o).copy(); } else if (o instanceof LF) { return ((LF)o).copy(); } else if (o instanceof GFeatStruc) { return ((GFeatStruc)o).copy(); } else { return o; } } } ================================================ FILE: src/opennlp/ccg/unify/UnifyFailure.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-3 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; /** * @author Jason Baldridge * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/11/28 03:39:27 $ */ public class UnifyFailure extends Exception { private static final long serialVersionUID = 1L; /** Constructor. */ public UnifyFailure() {} /** Constructor with message. */ public UnifyFailure(String m) { super(m); } /** Constructor with two args that failed to unify. */ public UnifyFailure(String arg1, String arg2) { super("Unable to unify " + arg1 + " with " + arg2 + "."); } /** Returns exception message. */ public String toString() { String msg = getMessage(); return "Unify Failure: " + (msg != null ? msg : "(no message)"); } } ================================================ FILE: src/opennlp/ccg/unify/Variable.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2002-7 Jason Baldridge, Gann Bierner and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.unify; import gnu.trove.TObjectIntHashMap; /** * A variable that can stand for some class of Unifiable objects. * * @author Gann Bierner * @author Michael White * @version $Revision: 1.2 $, $Date: 2007/12/20 21:30:22 $ **/ public interface Variable extends Unifiable { /** * Returns the name of this variable. * * @return the variable's name **/ public String name(); /** * Returns a hash code using the given map from vars to ints. */ public int hashCode(TObjectIntHashMap varMap); /** * Returns whether this var equals the given object up to variable names, * using the given maps from vars to ints. */ public boolean equals(Object obj, TObjectIntHashMap varMap, TObjectIntHashMap varMap2); } ================================================ FILE: src/opennlp/ccg/util/ArrayListWithIdentityEquals.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.*; /** * An array list where equality is checked using == on the list elements, which * are assumed to be canonical. * The hashCode method is compatible with SingletonList and StructureSharingList. * * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/12/21 03:27:18 $ */ public class ArrayListWithIdentityEquals extends ArrayList { private static final long serialVersionUID = 1L; /** Default constructor. */ public ArrayListWithIdentityEquals() {} /** Constructor with initial collection. */ public ArrayListWithIdentityEquals(Collection c) { super(c); } /** Constructor with initial capacity. */ public ArrayListWithIdentityEquals(int initialCapacity) { super(initialCapacity); } /** Returns a hash code for this list, using identity hash codes of the list elements. */ public int hashCode() { int hc = 1; for (int i = 0; i < size(); i++) { hc = 31*hc + System.identityHashCode(get(i)); } return hc; } /** Returns whether this list equals the given object, using identity tests on the list elements. 
*/ public boolean equals(Object obj) { if (this == obj) return true; if (!(obj instanceof List)) return false; List list = (List) obj; if (size() != list.size()) return false; for (int i = 0; i < size(); i++) { if (get(i) != list.get(i)) return false; } return true; } } ================================================ FILE: src/opennlp/ccg/util/CompositeFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.Set; /** * Composes the effects of several {@linkplain Filter filters} into a single filter. * The component filters are interpreted conjunctively, so that they all must allow a * given element for the composite filter to allow it. * * @author Scott Martin */ public class CompositeFilter implements Filter { Set> filters; /** * Creates a new empty composite filter. */ public CompositeFilter() { this.filters = new HashSet>(); } /** * Creates a new composite filter made up of the specified filters. * @see #CompositeFilter(Collection) */ @SafeVarargs public CompositeFilter(Filter... edgeFilters) { this(Arrays.asList(edgeFilters)); } /** * Creates a new composite filter made up of the specified filters. * The specified filters can be filters on an superclass of this filter's * type parameter. */ public CompositeFilter(Collection> filters) { this.filters = new HashSet>(filters); } /** * Gets the set of filters this composite filter is made up of. * @return An unmodifiable set view of the filters making up this composite filter. */ public Set> filters() { return Collections.unmodifiableSet(filters); } /** * Tests whether this filter contains a given filter. * @return true if the specified filter is one of the ones making up this * composite filter. */ public boolean containsFilter(Filter filter) { return filters.contains(filter); } /** * Adds a filter to this composite filter, if it is not already present. * @param filter The filter to add. * @return true if the filter was not already contained. * @throws IllegalArgumentException If filter is null. * * @see Collection#add(Object) */ public boolean addFilter(Filter filter) { if(filter == null) { throw new IllegalArgumentException("filter is null"); } return filters.add(filter); } /** * Removes the specified filter. * @param filter The filter to remove. * @return true if the specified filter was removed from this composite filter. 
*/ public boolean removeFilter(Filter filter) { return filters.remove(filter); } /** * Tests whether the given element is allowed by applying each of this composite filter's components * to it one by one, calling each of their {@link Filter#allows(Object)} method exactly once wit the * specified argument. * * @return false if one of the filters making up this composite filters returns false from its * {@link Filter#allows(Object)} method for the argument e, otherwise true. In particular, * this means that an empty composite filter returns true for every argument. */ @Override public boolean allows(E e) { for(Filter f : filters) { if(!f.allows(e)) { return false; } } return true; } } ================================================ FILE: src/opennlp/ccg/util/DelegatedFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; /** * Abstract class for filters that delegates to another filter that tests elements that do not * necessarily apply to the same type of elements as this filter. This filter's * {@link #allows(Object)} method returns the value of the * {@linkplain #getDelegateFilter() delegate filter}'s allows(...) method * for the value returned by {@link #delegateValueFor(Object)} for the specified * element. *

    * Concrete subclasses need to specify (1) the delegate filter (via their constructor), and * (2) a way to determine which element of type D the delegate filter should use * based on a specified element of type E by implementing {@link #delegateValueFor(Object)}. * A typical example is the case when a filter is desired that compares elements of type E, but * the comparison needs to take place on some type-D object somehow derived from instances of * E, e.g. by an accessor method. * * @param The type of elements that this filter applies to. * @param The type of elements that the delegated filter applies to. * * @author Scott Martin */ public abstract class DelegatedFilter implements Filter { Filter delegateFilter; /** * Creates a delegated filter with the specified filter to delegate to. The delegated filter will be * used in the test for {@link #allows(Object)}, through the {@link #delegateValueFor(Object)}. */ protected DelegatedFilter(Filter delegateFilter) { this.delegateFilter = delegateFilter; } /** * Gets the filter that this filter delegates to. * @return The filter specified at creation. * @see #DelegatedFilter(Filter) */ public Filter getDelegateFilter() { return delegateFilter; } /** * Tests whether this filter allows the specified element by testing whether its * {@linkplain #getDelegateFilter() delegate filter} allows the value of {@link #delegateValueFor(Object)} * for the argument e. * @return true if the delegate filter's {@link Filter#allows(Object)} method returns true * for the element returned by delegateValueFor(e). * @see #delegateValueFor(Object) */ @Override public boolean allows(E e) { return delegateFilter.allows(delegateValueFor(e)); } /** * Gets the element of type D that the delegated filter should use in its * {@link Filter#allows(Object)} comparisons, given the specified type-E element. * @param e The element to obtain a type-D element for. * @return The element that the delegated filter should use for comparison, based on e. */ public abstract D delegateValueFor(E e); } ================================================ FILE: src/opennlp/ccg/util/DisplayPrefs.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2006 Ben Wing // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.prefs.Preferences; import opennlp.ccg.TextCCG; /** * Simple class for holding preferences for converting an object to a * string or other displayable representation. 
* * @author Ben Wing * @version $Revision: 1.5 $, $Date: 2007/06/22 01:52:21 $ */ public class DisplayPrefs { /* Whether to show feature info along with each nonterminal */ public boolean showFeats = false; /* Whether to show semantic info (logical forms) */ public boolean showSem = false; /* Which features to show. */ public String featsToShow = ""; /** Constructor sets initial prefs from current user prefs. */ public DisplayPrefs() { Preferences prefs = Preferences.userNodeForPackage(TextCCG.class); showFeats = prefs.getBoolean(TextCCG.SHOW_FEATURES, false); showSem = prefs.getBoolean(TextCCG.SHOW_SEMANTICS, false); featsToShow = prefs.get(TextCCG.FEATURES_TO_SHOW, ""); } } ================================================ FILE: src/opennlp/ccg/util/Filter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; /** * Interface for filters that characterize a set by some membership criteria. * This interface uses a single method, {@link #allows(Object)}, to allow implementing * classes to say whether the specified element should be a member of the collection or not. *

    * Filters can be thought of as characteristic functions for sets. The type parameter is * used to signal what kind of elements a filter applies to. * * @param The type of elements that this filter applies to. * * @see FilteredSet * @see FilteredMap * @author Scott Martin */ public interface Filter { /** * Tests whether the specified element is allowed. * @return true if the provided element should be allowed into the collection. */ boolean allows(E e); } ================================================ FILE: src/opennlp/ccg/util/FilteredMap.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.AbstractMap; import java.util.LinkedHashMap; import java.util.Map; import java.util.Set; /** * A map whose keys must be allowed by a specified {@linkplain #getKeyFilter() key filter}. * The filtered mappings are kept in the same order they occurred in in the * {@linkplain #getOriginalMap() original map}. * * @see Filter * @author Scott Martin */ public class FilteredMap extends AbstractMap { Map originalMap; Filter keyFilter; private final Map map = new LinkedHashMap(); /** * Creates a new filtered map including only the elements in originalMap whose * keys are {@linkplain Filter#allows(Object) allowed} by the specified * keyFilter. *

    * Filtered maps maintain an {@linkplain #entrySet() entry set} whose entries occur in * the same order as they occurred in the original map. * * @param originalMap The map to draw this map's elements from. * @param keyFilter The filter that decides which keys from the original map should be * represented in this filtered map. * @throws IllegalArgumentException If keyFilter is null. * @see Map#putAll(Map) */ public FilteredMap(Map originalMap, Filter keyFilter) { if(keyFilter == null) { throw new IllegalArgumentException("keyFilter is null"); } this.originalMap = originalMap; this.keyFilter = keyFilter; putAll(originalMap); } /** * Gets the map that this map draws its elements from. * @return The map specified at creation * @see #FilteredMap(Map, Filter) */ public Map getOriginalMap() { return originalMap; } /** * Gets this map's key filter. */ public Filter getKeyFilter() { return keyFilter; } /** * Gets the entry set for this map. Each entry's key is * guaranteed to be allowable according to this map's {@linkplain #getKeyFilter() key * filter}. * * @return The subset of the {@linkplain #getOriginalMap() original map}'s entries that * are allowable by the key filter. * * @see Map#entrySet() */ @Override public Set> entrySet() { return map.entrySet(); } /** * Provides the ability to put new mappings into this filtered map, provided the specified * key is {@linkplain Filter#allows(Object) allowed} by this map's * {@linkplain #getKeyFilter() key filter}. * * @return the element previously associated with key if the specified * key is allowed by the key filter (and null if none was associated). * This method always returns null for key/value pairs in which the specified * key is not allowed by the key filter in effect. */ @Override public V put(K key, V value) { if(keyFilter.allows(key)) { return map.put(key, value); } return null; } } ================================================ FILE: src/opennlp/ccg/util/FilteredSet.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.AbstractSet; import java.util.Collection; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Set; /** * A set whose elements must be {@linkplain Filter#allows(Object) allowed} by a specified * {@link Filter}. Conceptually, instances of this class represent the sets described by the * characteristic function corresponding to their {@linkplain #getFilter() filter}. *

    * The elements in the filtered set are kept in the same order they occur in according to the * {@linkplain #getOriginalCollection() original collection}'s iterator. * * @see Filter * @author Scott Martin */ public class FilteredSet extends AbstractSet { Collection originalCollection; Filter filter; private final Set set = new LinkedHashSet(); /** * Creates a new filtered set based on the specified collection and filter. The resulting set will * contain all the members of the original collection for which the specified filter's * {@link Filter#allows(Object)} method returns true. * @param originalCollection The collection from which this filtered set will draw its elements. * @param filter The filter that decides which of the members of originalCollection are * allowable. The specified filter can apply to elements of any superclass of this filtered set's * type parameter. * @throws IllegalArgumentException If filter is null. * * @see #addAll(Collection) */ public FilteredSet(Collection originalCollection, Filter filter) { if(filter == null) { throw new IllegalArgumentException("filter is null"); } this.filter = filter; this.originalCollection = originalCollection; addAll(originalCollection); } /** * Gets the original collection from which this filtered set's elements are drawn. * @return The collection specified at creation. * @see #FilteredSet(Collection, Filter) */ public Collection getOriginalCollection() { return originalCollection; } /** * Gets the filter used by this filtered set to determine which elements are allowed in it. * @return The filter specified at creation. * @see #FilteredSet(Collection, Filter) */ public Filter getFilter() { return filter; } /** * Gets an iterator over the elements in this filtered set. */ @Override public Iterator iterator() { return set.iterator(); } /** * Gets the size of this filtered set (the number of elements it contains). */ @Override public int size() { return set.size(); } /** * Adds an element if it conforms to the {@linkplain #getFilter() filter in effect}, determined by * consulting the filter's {@link Filter#allows(Object)} method using the supplied element. * @param e The element to add, after testing its allowability according to this filtered set's filter. * @return true if the filter allows e and this set changed as a result of the addition * (because the specified element e was not already contained). */ @Override public boolean add(E e) { return filter.allows(e) && set.add(e); } } ================================================ FILE: src/opennlp/ccg/util/GroupMap.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-6 Jason Baldridge, Gann Bierner and // Michael White (University of Edinburgh, The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import gnu.trove.*; import java.io.Serializable; import java.util.*; /** * A map where putting a value does not replace an old value but is rather * included in a set of values for that key. * The map may use identity equals on keys. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.9 $, $Date: 2009/07/17 04:23:30 $ */ public class GroupMap implements Serializable { private static final long serialVersionUID = -2995356057195571222L; // the underlying map private THashMap map; /** Default constructor. */ public GroupMap() { this(false); } /** Constructor with flag for whether to use identity instead of equals on keys. */ public GroupMap(boolean useIdentityEquals) { if (useIdentityEquals) map = new THashMap(new TObjectIdentityHashingStrategy()); else map = new THashMap(); } /** Adds the given key-value pair to the map, and returns null. */ @SuppressWarnings("unchecked") public Object put(KeyType key, ValType value) { // get current val Object currentVal = map.get(key); // if none, add value to map if (currentVal == null) { map.put(key, value); } // if already a set, add value to set else if (currentVal instanceof Set) { Set set = (Set) currentVal; set.add(value); } // otherwise replace with a set including both values else { Set set = new THashSet(); set.add((ValType)currentVal); set.add(value); map.put(key, set); } // return null, since we're not really replacing the old val return null; } /** Returns the set of values for the given key (or null). */ @SuppressWarnings("unchecked") public Set get(KeyType key) { // get val Object val = map.get(key); // return if null or already a set if (val == null || val instanceof Set) { return (Set) val; } // otherwise replace val with a set and return it Set set = new THashSet(); set.add((ValType)val); map.put(key, set); return set; } /** Adds a key-value pair to the map for all the given vals. */ public void putAll(KeyType key, Collection vals) { for (ValType val : vals) put(key, val); } /** Returns the size of the underlying map. */ public int size() { return map.size(); } /** Returns the keys. */ @SuppressWarnings("unchecked") public Set keySet() { return (Set) map.keySet(); } /** Returns whether the keys contain the given one. */ public boolean containsKey(KeyType key) { return map.containsKey(key); } /** Removes the given key, returning its previous value (if any). */ Set remove(KeyType key) { Set retval = get(key); map.remove(key); return retval; } } ================================================ FILE: src/opennlp/ccg/util/IntHashSetMap.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import gnu.trove.*; /** * A map from ints to sets which allows objects with the same key to be * added without overriding previous puts. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.6 $, $Date: 2005/10/13 20:33:49 $ */ public class IntHashSetMap extends TIntObjectHashMap { private static final long serialVersionUID = 1L; /** Adds the given key-value pair to the map. */ public Object put(int key, Object value) { THashSet val = (THashSet) get(key); if (val==null) { val = new THashSet(); val.add(value); super.put(key, val); } else { val.add(value); } return val; } } ================================================ FILE: src/opennlp/ccg/util/Interner.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.*; import java.lang.ref.*; /** * A utility class for interning (canonicalizing) objects. * A WeakHashMap is used as the backing store, so that interned objects can * be garbage collected. * Normally, it is easier to use the globalIntern method (sharing a global * backing store) than to allocate separate interners. * Individual interners can be constructed to use soft references to * the interned objects, so that they are kept around longer than is the * case with weak references (the default). * * @author Michael White * @version $Revision: 1.5 $, $Date: 2005/10/13 20:33:49 $ * */ public class Interner { // the backing store private Map> weakMap = new WeakHashMap>(); // flag for whether to use soft references private boolean softRefs = false; /** Default constructor. */ public Interner() {} /** Constructor with soft references flag. */ public Interner(boolean softRefs) { this.softRefs = softRefs; } /** * Returns a canonical version of the given object. * The returned object is .equals() to the given one. * If the given object is not equal to one already seen, * then the returned object will be == to the given one. 
*/ public T intern(T obj) { // check if equivalent key already in map if (weakMap.containsKey(obj)) { // return existing canonical obj if so Reference ref = weakMap.get(obj); return ref.get(); } // otherwise add this object to the map, wrapped in a // weak/soft reference so that it can still be gc'ed Reference ref = (softRefs) ? new SoftReference(obj) : new WeakReference(obj); weakMap.put(obj, ref); return obj; } /** * Returns the canonical version of the given object, if any, * otherwise returns null. */ public T getInterned(T obj) { // get weak reference to canonical obj, if any Reference ref = weakMap.get(obj); // return obj, if any, otherwise null return (ref != null) ? ref.get() : null; } /** Returns the number of interned objects. */ public int size() { return weakMap.size(); } // the global interner private static Interner globalInterner = null; /** * Returns a canonical version of the given object using a global interner. * The returned object is .equals() to the given one. * If the given object is not equal to one already seen, * then the returned object will be == to the given one. */ public static Object globalIntern(Object obj) { if (globalInterner == null) globalInterner = new Interner(); return globalInterner.intern(obj); } /** * Returns the canonical version of the given object using the global interner, if any, * otherwise returns null. */ public static Object getGlobalInterned(Object obj) { if (globalInterner == null) return null; return globalInterner.getInterned(obj); } /** Returns the number of interned objects in the global interner. */ public static int globalSize() { if (globalInterner == null) return 0; return globalInterner.size(); } /** Tests the implementation. */ public static void main(String[] args) { Interner interner = new Interner(); int SIZE = 100000; Integer[] ints = new Integer[SIZE]; System.out.println("Adding " + SIZE + " ints to interner."); for (int i = 0; i < SIZE; i++) { ints[i] = new Integer(i); Integer interned = interner.intern(ints[i]); if (interned != ints[i]) { System.out.println("Whoops: ints[" + i + "] not == to interned: " + interned); System.exit(-1); } } System.out.println("interner.size(): " + interner.size()); // should be SIZE System.out.println("Doing gc()."); System.gc(); System.out.println("interner.size(): " + interner.size()); // should be the same System.out.println(); System.out.println("Now adding " + SIZE + " equivalent ints to interner."); for (int i = 0; i < SIZE; i++) { Integer intI = new Integer(i); Integer interned = interner.intern(intI); if (interned == intI) { System.out.println("Whoops: intI (i=" + i + ") is == to interned: " + interned); System.exit(-1); } } System.out.println("interner.size(): " + interner.size()); // should be the same System.out.println(); System.out.println("Next adding " + SIZE + " new, unreferenced ints to interner."); for (int i = SIZE; i < SIZE*2; i++) { Integer intI = new Integer(i); Integer interned = interner.intern(intI); if (interned != intI) { System.out.println("Whoops: intI (i=" + i + ") not == to interned: " + interned); System.exit(-1); } } System.out.println("interner.size(): " + interner.size()); // should be larger than SIZE System.out.println("Doing gc()."); System.gc(); System.out.println("interner.size(): " + interner.size()); // should be back to SIZE System.out.println(); } } ================================================ FILE: src/opennlp/ccg/util/InverseFilter.java ================================================ 
////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; /** * A filter that wraps another filter an inverts its effects. Specifically, for each argument that * the wrapped filter's {@link Filter#allows(Object)} method returns true, this filter * returns false, and vice versa. * * @author Scott Martin */ public class InverseFilter implements Filter { Filter originalFilter; /** * Creates a new filter based on the specified filter, inverting its effects. The specified filter can * apply to any superclass of this filter's type parameter. * @param originalFilter The filter to invert. * @throws IllegalArgumentException If originalFilter is null. */ public InverseFilter(Filter originalFilter) { if(originalFilter == null) { throw new IllegalArgumentException("originalFilter is null"); } this.originalFilter = originalFilter; } /** * Gets the original, non-inverted filter that this inverse filter wraps. * @return The filter specified at creation. * @see #InverseFilter(Filter) */ public Filter getOriginalFilter() { return originalFilter; } /** * Tests whether this filter allows a specified element by calling the original filter's * {@link Filter#allows(Object)} method and reversing its boolean value. * * @param e The element to test. * @return A value equivalent to calling !getOriginalFilter().allows(e). */ @Override public boolean allows(E e) { return !originalFilter.allows(e); } } ================================================ FILE: src/opennlp/ccg/util/JLineReader.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 David Reitter and University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.io.*; import java.util.*; import jline.*; /** * A command-line reader based on JLine. 
* * @author David Reitter * @author Michael White * @version $Revision: 1.4 $, $Date: 2009/12/21 03:27:18 $ */ public class JLineReader extends LineReader { // reader for console input ConsoleReader reader; // command history, buffer History history = null; StringWriter histbuf = null; /** Constructor with completion strings. */ public JLineReader(String[] completions) throws IOException { // init reader reader = new ConsoleReader(); // store commands for 'tab' argument completion List completors = new LinkedList(); completors.add(new SimpleCompletor(completions)); reader.addCompletor(new ArgumentCompletor(completors)); } /** Sets the command history. */ public void setCommandHistory(String histStr) throws IOException { // initialize history with max size = 50 history = new History(); history.setMaxSize(50); if (!histStr.equals("")) { histStr = histStr.replaceAll("&#10;", "\n"); // using &#10; to get around XML problem in Java 1.4
StringReader sreader = new StringReader(histStr); history.load(sreader); } // set to reader's history reader.setHistory(history); } /** Gets the current command history. */ public String getCommandHistory() throws IOException { if (history == null) return ""; StringBuffer retbuf = new StringBuffer(); List commands = history.getHistoryList(); for (Iterator it = commands.iterator(); it.hasNext(); ) { retbuf.append(it.next().toString()); if (it.hasNext()) retbuf.append("&#10;"); // using &#10;
    to get around XML problem in Java 1.4 } return retbuf.toString(); } /** Returns an input string, using the given prompt. */ public String readLine(String prompt) throws IOException { return reader.readLine(prompt); } } ================================================ FILE: src/opennlp/ccg/util/LineReader.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) and David Reitter // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.io.*; /** * Abstract command-line reader. * * @author Michael White * @author David Reitter * @version $Revision: 1.2 $, $Date: 2005/10/13 20:33:49 $ */ abstract public class LineReader { /** Creates a default line reader (currently a JLineReader) with the given completion strings. */ public static LineReader createLineReader(String[] completions) throws IOException { return new JLineReader(completions); } /** Sets the command history. */ abstract public void setCommandHistory(String histStr) throws IOException; /** Gets the current command history. */ abstract public String getCommandHistory() throws IOException; /** Returns an input string, using the given prompt. */ abstract public String readLine(String prompt) throws IOException; } ================================================ FILE: src/opennlp/ccg/util/ListMap.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2006 Michael White (The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import gnu.trove.*; import java.util.*; /** * A map where putting a value does not replace an old value * but is instead included in a list of values for that key. * The map may use identity equals on keys. * (NB: A ListMap is essentially a GroupMap that uses lists instead of sets.) 
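* An illustrative usage sketch (an editorial addition, not part of the original file; it assumes
* the class is parameterized as ListMap<KeyType, ValType>, as the KeyType/ValType names in the
* method signatures below suggest, and the variable names and data are hypothetical):
* <pre>{@code
*   ListMap<String, String> argsByVerb = new ListMap<String, String>();
*   argsByVerb.put("eat", "apple");                 // first value for "eat"
*   argsByVerb.put("eat", "cake");                  // added alongside "apple", not replacing it
*   List<String> eatArgs = argsByVerb.get("eat");   // ["apple", "cake"]
*   List<String> none = argsByVerb.get("drink");    // null: no values for this key
* }</pre>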
* * @author Michael White * @version $Revision: 1.1 $, $Date: 2006/08/15 18:21:31 $ */ public class ListMap { // the underlying map private THashMap map; /** Default constructor. */ public ListMap() { this(false); } /** Constructor with flag for whether to use identity instead of equals on keys. */ public ListMap(boolean useIdentityEquals) { if (useIdentityEquals) map = new THashMap(new TObjectIdentityHashingStrategy()); else map = new THashMap(); } /** Adds the given key-value pair to the map, and returns null. */ @SuppressWarnings("unchecked") public Object put(KeyType key, ValType value) { // get current val Object currentVal = map.get(key); // if none, add value to map if (currentVal == null) { map.put(key, value); } // if already a list, add value to list else if (currentVal instanceof List) { List list = (List) currentVal; list.add(value); } // otherwise replace with a list including both values else { List list = new ArrayList(3); list.add((ValType)currentVal); list.add(value); map.put(key, list); } // return null, since we're not really replacing the old val return null; } /** Returns the list of values for the given key (or null). */ @SuppressWarnings("unchecked") public List get(KeyType key) { // get val Object val = map.get(key); // return if null or already a list if (val == null || val instanceof List) { return (List) val; } // otherwise replace val with a list and return it List list = new ArrayList(1); list.add((ValType)val); map.put(key, list); return list; } /** Adds a key-value pair to the map for all the given vals. */ public void putAll(KeyType key, Collection vals) { for (ValType val : vals) put(key, val); } /** Returns the size of the underlying map. */ public int size() { return map.size(); } /** Returns the keys. */ @SuppressWarnings("unchecked") public Set keySet() { return (Set) map.keySet(); } /** Returns whether the keys contain the given one. */ public boolean containsKey(KeyType key) { return map.containsKey(key); } /** Removes the given key, returning its previous value (if any). */ List remove(KeyType key) { List retval = get(key); map.remove(key); return retval; } } ================================================ FILE: src/opennlp/ccg/util/MembershipFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.Set; /** * A filter that allows elements based on their membership in a set specified at creation. Later modifications * to this set will be reflected in the behavior of this filter's {@link #allows(Object)} method because * the set is not copied at creation. 
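* A minimal usage sketch (an editorial addition, not from the original source; it assumes the
* generic declarations Filter<E> and MembershipFilter<E> suggested by the stripped type
* parameters in this dump, plus the usual java.util imports; the example data is hypothetical):
* <pre>{@code
*   Set<String> closedClass = new HashSet<String>(Arrays.asList("the", "a", "of"));
*   Filter<String> inClosedClass = new MembershipFilter<String>(closedClass);
*   inClosedClass.allows("the");   // true
*   inClosedClass.allows("dog");   // false
*   closedClass.add("an");
*   inClosedClass.allows("an");    // true, since the backing set is not copied
* }</pre>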
* * @author Scott Martin */ public class MembershipFilter implements Filter { /** * The set to test for membership. */ protected Set members; /** * Creates a new membership filter based on the specified set of members. This set can later be modified * and have its new membership reflected by this filter's {@link #allows(Object)} method because the * specified set is not copied by this filter. * * @param members The set to test for membership. * @throws IllegalArgumentException If members is null. */ public MembershipFilter(Set members) { if(members == null) { throw new IllegalArgumentException("members is null"); } this.members = members; } /** * Tests whether this membership filter allows the specified element by testing whether the membership * set contains the element. * * @param e The element to test membership for. * @return true if the set of members specified at creation contains e. * @see #MembershipFilter(Set) */ @Override public boolean allows(E e) { return members.contains(e); } } ================================================ FILE: src/opennlp/ccg/util/Pair.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003 Jason Baldridge, Gann Bierner and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.io.Serializable; /** * Dinky class to package pairs of things. * * @author Jason Baldridge * @author Gann Bierner * @author Michael White * @version $Revision: 1.7 $, $Date: 2009/12/21 03:27:18 $ */ public final class Pair implements Serializable { private static final long serialVersionUID = 3626104184233533389L; /** The first element of the pair. */ public final TypeA a; /** The second element of the pair. */ public final TypeB b; /** Constructor. */ public Pair(TypeA a, TypeB b) { this.a = a; this.b = b; } /** Returns a hash code constructed from those of a and b. */ public int hashCode() { return a.hashCode() - b.hashCode(); } /** Returns true if the given object pairs the same elements. */ public boolean equals(Object obj) { if (this == obj) return true; if (!(obj instanceof Pair)) return false; Pair p = (Pair) obj; return a.equals(p.a) && b.equals(p.b); } /** Returns "[a/b]". 
*/ public String toString() { return "["+a+"/"+b+"]"; } } ================================================ FILE: src/opennlp/ccg/util/SingletonList.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.io.Serializable; import java.util.*; /** * An unmodifiable list of one element. * Equality is checked using == on the list element, which * is assumed to be canonical. * * @author Michael White * @version $Revision: 1.5 $, $Date: 2009/12/21 03:27:18 $ */ public class SingletonList extends AbstractList implements Serializable { private static final long serialVersionUID = -4340168177098319085L; /** The single list element. */ public final T elt; /** Constructor. */ public SingletonList(T elt) { this.elt = elt; } /** Returns the size of this list. */ public int size() { return 1; } /** Returns the ith element of the list. */ public T get(int i) { if (i == 0) return elt; else throw new IndexOutOfBoundsException("No element with index: " + i); } /** Returns a hash code for this list, using the identity hash code of the list element. */ public int hashCode() { return 31 + System.identityHashCode(elt); } /** Returns whether this list equals the given object, using identity tests on the list element. */ public boolean equals(Object obj) { if (this == obj) return true; if (!(obj instanceof List)) return false; List list = (List) obj; if (size() != list.size()) return false; return (get(0) == list.get(0)); } } ================================================ FILE: src/opennlp/ccg/util/StructureSharingList.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2003-4 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.io.Serializable; import java.util.*; /** * An unmodifiable list formed by sequencing two sublists. * The sublists are assumed to remain unchanged as well. * Equality is checked using == on the list elements, which * are assumed to be canonical. * * @author Michael White * @version $Revision: 1.7 $, $Date: 2009/12/21 03:27:18 $ */ public class StructureSharingList extends AbstractList implements Serializable { private static final long serialVersionUID = 6692080357319326492L; /** The first sublist. */ public final List first; /** The second sublist. */ public final List second; // cached hashcode private int hashcode = -1; // size private final int size; /** Constructor. */ public StructureSharingList(List first, List second) { this.first = first; this.second = second; this.size = first.size() + second.size(); } /** Returns the size of this list. */ public int size() { return size; } /** Returns the ith element of the list. */ public T get(int i) { if (i < first.size()) { return first.get(i); } else { return second.get(i - first.size()); } } /** Returns a hash code for this list, using identity hash codes of the list elements. */ public int hashCode() { // check whether already cached if (hashcode != -1) return hashcode; int hc = 1; for (int i = 0; i < size(); i++) { hc = 31*hc + System.identityHashCode(get(i)); } // cache then return hashcode = hc; return hc; } /** Returns whether this list equals the given object, using identity tests on the list elements. */ public boolean equals(Object obj) { if (this == obj) return true; if (!(obj instanceof List)) return false; List list = (List) obj; if (list instanceof StructureSharingList) { StructureSharingList ssl = (StructureSharingList) list; if (first == ssl.first && second == ssl.second) return true; } if (size() != list.size()) return false; for (int i = 0; i < size(); i++) { if (get(i) != list.get(i)) return false; } return true; } } ================================================ FILE: src/opennlp/ccg/util/TrieMap.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005 University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.*; /** * Implements a trie with a data object at each node. Keys are assumed to be * canonical, and thus checked using identity (==) rather than equality. For * efficient allocation, all children can be added at once. * * @author Michael White * @version $Revision: 1.6 $, $Date: 2011/03/20 20:11:58 $ */ public class TrieMap { /** Interface for methods returning an interned key. 
*/ public interface KeyExtractor { /** Returns an interned key. */ public KeyType getKey(); } /** The data object. */ public DataType data; /** * The mapping to the children. If there is just one child, it's stored in a * pair with its key. Otherwise, an IdentityHashMap is used. */ private Object childMap = null; /** The parent node. */ private TrieMap parent = null; /** Constructor with data object. */ public TrieMap(DataType data) { this.data = data; } /** Factory method, for adding empty child nodes. */ protected TrieMap createNode() { return new TrieMap(null); } /** Adds the given child with its key. */ @SuppressWarnings("unchecked") public void addChild(KeyType key, TrieMap child) { child.parent = this; if (childMap == null) { childMap = new Pair>(key, child); return; } Map> map; if (childMap instanceof Pair) { Pair> pair = (Pair>) childMap; map = new IdentityHashMap>(); map.put(pair.a, pair.b); childMap = map; } else { map = (Map>) childMap; } map.put(key, child); } /** Adds the given children with their keys. */ @SuppressWarnings("unchecked") public void addChildren(List keys, List> childNodes) { if (childMap == null && keys.size() == 1) { TrieMap child = childNodes.get(0); child.parent = this; childMap = new Pair>(keys.get(0), child); return; } Map> map; if (childMap == null) { map = new IdentityHashMap>(keys.size()); childMap = map; } else if (childMap instanceof Pair) { Pair> pair = (Pair>) childMap; map = new IdentityHashMap>(keys.size() + 1); map.put(pair.a, pair.b); childMap = map; } else { map = (Map>) childMap; } for (int i = 0; i < keys.size(); i++) { TrieMap child = childNodes.get(i); child.parent = this; map.put(keys.get(i), child); } } /** Gets the parent node, or null if none. */ public TrieMap getParent() { return parent; } /** Gets the child for the given key, or null if none. */ @SuppressWarnings("unchecked") public TrieMap getChild(KeyType key) { if (childMap == null) return null; if (childMap instanceof Pair) { Pair> pair = (Pair>) childMap; if (pair.a == key) return pair.b; else return null; } Map> map = (Map>) childMap; return map.get(key); } /** Gets the child for the given list of keys, or null if none. */ public TrieMap getChildFromList(List keys) { TrieMap next = this; for (int pos = 0; pos < keys.size(); pos++) { next = next.getChild(keys.get(pos)); if (next == null) return null; } return next; } /** Gets the child for the given list of keys extractors, or null if none. */ public TrieMap getChildFromLazyList(List> keyExtractors) { TrieMap next = this; for (int pos = 0; pos < keyExtractors.size(); pos++) { next = next.getChild(keyExtractors.get(pos).getKey()); if (next == null) return null; } return next; } /** * Finds the child for the given key, adding one (with a null data object) * if necessary. */ public TrieMap findChild(KeyType key) { TrieMap child = getChild(key); if (child == null) { child = createNode(); addChild(key, child); } return child; } /** * Finds the child for the given list of keys, adding one (with a null data * object) if necessary, along with any necessary intervening parents. */ public TrieMap findChildFromList(List keys) { TrieMap next = this; for (int pos=0; pos < keys.size(); pos++) { KeyType key = keys.get(pos); TrieMap child = next.getChild(key); if (child == null) { child = createNode(); next.addChild(key, child); } next = child; } return next; } /** * Finds the child for the given list of keys, adding one (with a null data * object) if necessary, along with any necessary intervening parents. 
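* For orientation, an illustrative sketch of the corresponding eager-list methods
* (an editorial addition, not part of the original file; it assumes the declaration
* TrieMap<KeyType, DataType> and canonical keys, here interned string literals,
* with hypothetical example data):
* <pre>{@code
*   TrieMap<String, Integer> root = new TrieMap<String, Integer>(null);
*   List<String> path = Arrays.asList("the", "big", "dog");
*   root.findChildFromList(path).data = 3;               // creates intervening nodes as needed
*   Integer n = root.getChildFromList(path).data;        // 3
*   TrieMap<String, Integer> missing =
*       root.getChildFromList(Arrays.asList("the", "cat"));  // null: no such path
* }</pre>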
*/ public TrieMap findChildFromLazyList(List> keyExtractors) { TrieMap next = this; for (int pos=0; pos < keyExtractors.size(); pos++) { KeyType key = keyExtractors.get(pos).getKey(); TrieMap child = next.getChild(key); if (child == null) { child = createNode(); next.addChild(key, child); } next = child; } return next; } /** * Gets the keys leading to this node. This requires a linear search at each * level. */ @SuppressWarnings("unchecked") public List traceKeys() { ArrayList retval = new ArrayList(); // collect keys up to root TrieMap currentNode = this; TrieMap currentParent = parent; while (currentParent != null) { if (currentParent.childMap instanceof Pair) { Pair> pair = (Pair>) currentParent.childMap; retval.add(pair.a); } else { Map> map = (Map>) currentParent.childMap; for (Map.Entry> entry : map.entrySet()) { if (entry.getValue() == currentNode) { retval.add(entry.getKey()); break; } } } currentNode = currentParent; currentParent = currentParent.parent; } // reverse and return Collections.reverse(retval); return retval; } /** Returns this trie map as a string, with indenting. */ public String toString() { StringBuffer sb = new StringBuffer(); toString(sb, ""); return sb.toString(); } // appends this trie map as a string, with the given indenting level, // to the given string buffer @SuppressWarnings("unchecked") private void toString(StringBuffer sb, String indent) { sb.append("node: " + data); if (childMap == null) return; indent += " "; if (childMap instanceof Pair) { Pair> pair = (Pair>) childMap; toString(sb, indent, pair.a, pair.b); } else { Map> map = (Map>) childMap; List keys = new ArrayList(map.keySet()); Comparator toStringComparator = new Comparator() { public int compare(KeyType o1, KeyType o2) { return o1.toString().compareTo(o2.toString()); } }; Collections.sort(keys, toStringComparator); for (KeyType key : keys) { toString(sb, indent, key, map.get(key)); } } } // appends the given key and child private void toString(StringBuffer sb, String indent, Object key, TrieMap child) { sb.append("\n").append(indent).append('[').append(key).append("] "); child.toString(sb, indent); } } ================================================ FILE: src/opennlp/ccg/util/VisitedFilter.java ================================================ ////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2012 Scott Martin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import java.util.HashSet; /** * A filter that tracks which elements have already been visited by some process or iteration, allowing only * those that have not yet been visited. An element is considered visited when this filter's * {@link #allows(Object)} method has been called with it as an argument. *
<p>
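* A minimal usage sketch (an editorial addition, not from the original source; it assumes the
* declaration VisitedFilter<E> suggested by the stripped type parameters, with hypothetical data):
* <pre>{@code
*   VisitedFilter<String> unseen = new VisitedFilter<String>();
*   unseen.allows("np");      // true: first visit, "np" is now marked as visited
*   unseen.allows("np");      // false: already visited
*   unseen.hasVisited("s");   // false: checking alone does not mark an element
* }</pre>
* <p>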
    * Internally, the visited elements are tracked by using a * {@link MembershipFilter} based on a {@link HashSet}. * * @author Scott Martin */ public class VisitedFilter extends MembershipFilter { /** * Creates a new visited filter with an empty set of visited elements. */ public VisitedFilter() { super(new HashSet()); } /** * Tests whether this filter allows e by testing whether or not it has been visited. An element * has been visited if this method has been previously called with it as an argument. * * @param e The element to test whether it has been visited or not. * @return true if e has not yet been visited. * * @see #hasVisited(Object) */ @Override public boolean allows(E e) { if(!hasVisited(e)) { members.add(e); return true; } return false; } /** * Tests whether the specified element has been visited or not. * @param e The element to test for visitation. * @return true if e is among the elements that have been previously visited. * * @see #allows(Object) * @see MembershipFilter#allows(Object) */ public boolean hasVisited(E e) { return super.allows(e); } } ================================================ FILE: src/opennlp/ccg/util/Visualizer.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2004 Alexandros Triantafyllidis and // University of Edinburgh (Michael White) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; import opennlp.ccg.lexicon.*; import opennlp.ccg.grammar.*; import opennlp.ccg.synsem.*; import java.io.*; import java.util.*; /** * Class for visualizing CCG derivations using latex. 
* * @author Alexandros Triantafyllidis * @author Michael White * @version $Revision: 1.9 $, $Date: 2009/12/21 03:27:18 $ */ public class Visualizer { private String ruleToTeX(String rule, int indent, int length) { StringBuffer sb = new StringBuffer(); int i=0; for (i=0; i < indent; i++) sb.append("&"); sb.append(" \\mc{" + Integer.toString(length) + "} "); if(rule == null || rule.length()==1) sb.append("{\\hrulefill_{"+ rule + "}} \\\\\n"); else sb.append("{\\hrulefill_{"+ rule.substring(0,1) + "\\mathbf{" + rule.substring(1) + "}}}\\\\\n"); return sb.toString(); } // Gets a file name for a temporary file, e.g.: ~tmp0 public String getTempFileName() { File f1 = null; File f2 = null; int i=0; f1 = new File("~tmp"+Integer.toString(i)+".tex"); f2 = new File("~tmp"+Integer.toString(i)+".div"); while(f1.exists() || f2.exists() ) { i++; f1 = null; f2 = null; f1 = new File("~tmp"+Integer.toString(i)+".tex"); f2 = new File("~tmp"+Integer.toString(i)+".div"); } f1 = null; f2 = null; return "~tmp"+Integer.toString(i); } public String getTempDirName() { File f1 = null; int i=0; f1 = new File("tmp"); while(f1.exists() ) { i++; f1 = null; f1 = new File("tmp"+Integer.toString(i) ); } f1 = null; return "tmp"+Integer.toString(i); } public int getTreeDepth(Sign sign, int level) { int max_depth = 0, depth = 0; Sign[] children = sign.getDerivationHistory().getInputs(); if (children != null && sign.getWords().size() > 1) for (int i=0; i < children.length; i++) { depth = getTreeDepth(children[i], level+1); if (depth > max_depth) max_depth = depth; } else max_depth = level; return max_depth; } private int numberOfLeaves(Sign results) { int totalLeaves = 0; Sign[] children = results.getDerivationHistory().getInputs(); if (children==null || results.getWords().size()==1) return 1; for(int i=0;i processSign(Sign results, int level, int identation) { List signList = new ArrayList(); Sign[] children = results.getDerivationHistory().getInputs(); int depth = getTreeDepth(results, 0); TeXSign ts = new TeXSign(); int offset=0; if (children != null && results.getWords().size() > 1) for (int i=0; i < children.length; i++) { if (i > 0) offset += numberOfLeaves(children[i-1]); signList.addAll( processSign(children[i], level + 1, identation + offset) ); } ts.identation = identation; ts.height = depth; ts.sign = results; signList.add(ts); return signList; } public boolean writeFooter(String fileName){ java.io.BufferedWriter bw = null; try{ bw = new java.io.BufferedWriter(new FileWriter(fileName,true) ); bw.write("\\end{document}\n"); bw.close(); } catch(Exception e){ return false; } return true; } public boolean writeHeader(String fileName) { java.io.BufferedWriter bw = null; try { bw = new java.io.BufferedWriter(new FileWriter(fileName) ); bw.write("\\documentclass{article}\n"); bw.write("\\usepackage[margin=0.5in]{geometry}\n"); bw.write("\\newcommand{\\deriv}[2]\n"); bw.write("{ \\renewcommand{\\arraystretch}{.5}\n"); bw.write("$\\begin{array}[t]{*{#1}{c}}\n"); bw.write(" #2\n"); bw.write(" \\end{array}$ }\n"); bw.write("\\newcommand{\\gf}[1]{\\textsf{\\textsl{#1}}}\n"); bw.write("\\newcommand{\\cf}[1]{\\mbox{\\ensuremath{\\cfont{#1}}}}\n"); bw.write("\\newcommand{\\uline}[1]\n"); bw.write("{\\mc{#1}{\\hrulefill} }\n"); bw.write("\\newcommand{\\mc}[2]\n"); bw.write(" {\\multicolumn{#1}{c}{#2}}\n"); bw.write("\\newcommand{\\cfont}{\\mathsf}\n"); bw.write("\\newcommand{\\bs}{\\backslash}\n"); bw.write("\\newcommand{\\subsa}[1]{\\hspace{-0.75mm}_{_{#1}}}\n"); bw.write("\\newcommand{\\subsb}[1]{\\hspace{-0.10mm}_{_{#1}}}\n"); 
bw.write("\\newcommand{\\subs}[1]{\\hspace{-0.40mm}_{#1}}\n"); bw.write("\\newcommand{\\subsf}[1]{\\hspace{-0.75mm}_{_{#1}}}\n"); bw.write("\\newcommand{\\supsa}[1]{\\hspace{-1.75mm}^{^{#1}} }\n"); bw.write("\\newcommand{\\supsb}[1]{\\hspace{-0.80mm}^{^{#1}} }\n"); bw.write("\\newcommand{\\sups}[1]{\\hspace{-0.40mm}^{#1}}\n"); bw.write("\\pagestyle{empty}\n"); bw.write("\\begin{document}\n"); bw.close(); } catch(Exception e){ return false; } return true; } /** * Shows the current derivation using YaP or xdvi. */ public boolean show(String fileName) { String viewerName = null; try { runCommand("latex " + fileName + ".tex"); //Process p = java.lang.Runtime.getRuntime().exec("latex " + fileName + ".tex"); if (System.getProperty("os.name").toUpperCase().startsWith("WINDOWS")) viewerName = "yap"; else viewerName = "xdvi"; System.out.println("Close " + viewerName + " to continue ..."); runCommand(viewerName + " " + fileName); // The process will wait indefinitely unless we close each of the related streams:/ //p.getInputStream().close(); //p.getOutputStream().close(); //p.getErrorStream().close(); //p.waitFor(); //p = null; //p = java.lang.Runtime.getRuntime().exec(viewerName + " " + fileName); //p.getInputStream().close(); //p.getOutputStream().close(); //p.getErrorStream().close(); //System.out.println("Close " + viewerName + " to continue ..."); //p.waitFor(); } catch(Exception e) { System.out.println("Error invoking latex/" + viewerName + " : " + e.toString()); return false; } return true; } class myFilter implements FileFilter { String baseFileName=null; public myFilter(String s) { baseFileName = s.toUpperCase(); } public boolean accept(File f) { System.out.println("checking: " + f.getName()); if(f.getName().toUpperCase().startsWith(baseFileName)) return true; else return false; } } public class myFileNameFilter implements FilenameFilter { public String fn=null; public myFileNameFilter(String s) { fn=s; } public boolean accept(File dir, String name) { return name.startsWith(fn); } } public boolean cleanFiles(String fileName) { try { File dir = new File(System.getProperty("user.dir")); myFileNameFilter filter = new myFileNameFilter( fileName) ; File[] allFiles = dir.listFiles(filter); for(int i=0;i signList = null; java.io.BufferedWriter bw = null; try { int i=0, numDerivs=0; TeXSign texSign = null; bw = new java.io.BufferedWriter(new FileWriter( fileName,true) ); signList = sortList( processSign(results,0, 0 ) ); numDerivs = results.getWords().size(); Tokenizer tokenizer = Grammar.theGrammar.lexicon.tokenizer; bw.write("\\deriv{" + Integer.toString(numDerivs) + "}{\n"); for (i=0; i < results.getWords().size(); i++) { if (i != 0) bw.write(" & "); String orth = tokenizer.getOrthography((Word)results.getWords().get(i), false); orth = orth.replaceAll("_", "\\\\_"); orth = orth.replaceAll("%", "\\\\%"); bw.write("\\gf{" + orth + "}"); } bw.write(" \\\\\n\\uline{1}"); for (i=1; i < results.getWords().size(); i++) bw.write(" & \\uline{1}"); bw.write(" \\\\\n"); texSign = (TeXSign)signList.get(0); bw.write("\\cf{"+ texSign.sign.getCategory().toTeX() + "}"); for (i=1; i < numDerivs; i++) { texSign = (TeXSign)signList.get(i); bw.write(" & \\cf{"+ texSign.sign.getCategory().toTeX() + "}"); } bw.write(" \\\\\n"); for (i=numDerivs; i < signList.size(); i++) { String ruleStr=null; texSign = (TeXSign)signList.get(i); ruleStr = ruleToTeX(texSign.sign.getDerivationHistory().getRule().name(), texSign.identation, texSign.sign.getWords().size() ); bw.write(ruleStr); for (int j=0; j < texSign.identation; j++) 
bw.write("&"); bw.write(" \\mc{" + texSign.sign.getWords().size() + "}{\\cf{"+ texSign.sign.getCategory().toTeX() +"}} \\\\\n"); } // Originally 1in, but that's too much when displayed onscreen bw.write("}\n\n\\vspace{5mm}\n\n"); bw.close(); } catch(Exception e) { System.out.println("Error while saving to TeX: " + e.toString()); e.printStackTrace(); return false; } return true; } private List sortList(List signList) { for (int i=0; i < signList.size(); i++) for(int j=i; j < signList.size(); j++) { TeXSign texSign1 = signList.get(i); TeXSign texSign2 = signList.get(j); if(texSign1.height > texSign2.height) { signList.set(i,texSign2); signList.set(j, texSign1); } if(texSign1.height == texSign2.height) if(texSign1.identation > texSign2.identation) { signList.set(i,texSign2); signList.set(j, texSign1); } } return signList; } private class TeXSign { Sign sign = null; int identation = 0; int height = 0; } /** * Calls runCommand/2 assuming that wait=true. * * @param cmd The string containing the command to execute */ public static void runCommand (String cmd) { runCommand(cmd, true); } /** * Run a command with the option of waiting for it to finish. * * @param cmd The string containing the command to execute * @param wait True if the caller should wait for this thread to * finish before continuing, false otherwise. */ public static void runCommand (String cmd, boolean wait) { try { //System.out.println("Running command: "+ cmd); Process proc = Runtime.getRuntime().exec(cmd); // This needs to be done, otherwise some processes fill up // some Java buffer and make it so the spawned process // doesn't complete. BufferedReader br = new BufferedReader(new InputStreamReader(proc.getInputStream())); //String line = null; //while ( (line = br.readLine()) != null) { while ( (br.readLine()) != null) { ; // just eat up the inputstream // Use this if you want to see the output from running // the command. //System.out.println(line); } if (wait) { try { proc.waitFor(); } catch (InterruptedException e) { Thread.currentThread().interrupt(); } } proc.getInputStream().close(); proc.getOutputStream().close(); proc.getErrorStream().close(); } catch (IOException e) { System.out.println("Unable to run command: "+cmd); } } } ================================================ FILE: src/opennlp/ccg/util/XmlScanner.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2006 Michael White (The Ohio State University) // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccg.util; //JAXP packages import javax.xml.parsers.*; import org.xml.sax.*; import org.xml.sax.helpers.*; // jdom import org.jdom.*; import java.io.*; import java.net.*; import java.util.*; /** * Quick-and-dirty class for using JDOM elements in parsing XML * without building a document for the whole XML file. * The parse method parses an XML file with a SAX * parser, constructing JDOM elements for root and the top-level elements, * without attaching the top-level elements to the root. * Along the way, the handleRoot and handleElement * methods are invoked, to handle these elements incrementally. * To use the class, just implement handleElement, and optionally * implement handleRoot, and call parse on the * input XML via its URL. Note that with Java's incremental garbage * collection, an XmlScanner should be reasonably efficient, but not * as efficient as a pure SAX parser (which however requires considerably * more work to implement). At present, the parser only handles elements * and attributes without namespaces; all attributes with namespaces, text nodes, * comments, etc. are ignored. * * @author Michael White * @version $Revision: 1.3 $, $Date: 2009/12/21 03:27:18 $ */ public abstract class XmlScanner extends DefaultHandler { /** * Method for handling the root element, sans children. * The default method does nothing. */ public void handleRoot(Element e) {} /** Method for handling top-level elements. */ public abstract void handleElement(Element e); /** * Method for parsing an XML document, handling the childless root element * and the top-level elements along the way. */ public void parse(URL url) throws IOException { try { // Create a JAXP SAXParserFactory and configure it SAXParserFactory spf = SAXParserFactory.newInstance(); // Create a JAXP SAXParser SAXParser saxParser = spf.newSAXParser(); // Get the encapsulated SAX XMLReader XMLReader xmlReader = saxParser.getXMLReader(); // Set the ContentHandler of the XMLReader xmlReader.setContentHandler(this); // Tell the XMLReader to parse the XML document xmlReader.parse(url.toString()); } catch (ParserConfigurationException exc) { throw (IOException) new IOException().initCause(exc); } catch (SAXException exc) { throw (IOException) new IOException().initCause(exc); } } // flag for whether the root element has been seen yet private boolean seenRoot = false; // the element that is currently being processed private Element current = null; // // ContentHandler methods (just elements) // /** * For the root, a childless element is created and handled, via handleRoot; * for all other elements, a new current element is created and added as a child of the * current element, if any. */ public void startElement(String uri, String localName, String qname, Attributes attributes) throws SAXException { if (!seenRoot) { seenRoot = true; Element root = createElement(uri, localName, qname, attributes); handleRoot(root); return; } Element parent = current; current = createElement(uri, localName, qname, attributes); if (parent != null) parent.addContent(current); } /** * Constructs and returns a new element from the given info. * This implementation ignores the uri and localName, * and filters out any attributes whose qname contains a colon. 
*/ protected Element createElement(String uri, String localName, String qname, Attributes attributes) { Element retval = new Element(qname); if (attributes != null) { int length = attributes.getLength(); for (int i = 0; i < length; i++) { String attrQName = attributes.getQName(i); if (attrQName.indexOf(':') >= 0) continue; String attrValue = attributes.getValue(i); retval.setAttribute(attrQName, attrValue); } } return retval; } /** * Resets the current element to its parent, after first invoking * handleElement on the element if it's a top-level one. */ public void endElement(String uri, String localName, String qname) throws SAXException { if (current == null) return; // for root Element parent = (Element) current.getParent(); if (parent == null) handleElement(current); current = parent; } /** * Example scanner: prints root, counts top- and second-level elements. */ public static class MyScanner extends XmlScanner { int count = 0; int second = 0; public void handleRoot(Element e) { System.out.println("root name: " + e.getName()); System.out.print("root attributes: "); @SuppressWarnings("unchecked") List attrs = (List) e.getAttributes(); for (Attribute attr : attrs) { System.out.print(attr.getName() + "=" + attr.getValue() + " "); } System.out.println(); } public void handleElement(Element e) { count++; second += e.getContentSize(); } } /** * The main method shows an example of using a scanner, by * invoking an instance of MyScanner on the file * whose name is given by the first arg. */ public static void main(String[] args) throws IOException { String filename = args[0]; MyScanner myScanner = new MyScanner(); myScanner.parse(new File(filename).toURI().toURL()); System.out.println("top-level elements: " + myScanner.count); System.out.println("second-level elements: " + myScanner.second); } } ================================================ FILE: src/opennlp/ccgbank/CCGBankConvert.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// /* * $Id: CCGBankConvert.java,v 1.8 2011/11/10 22:18:42 mwhite14850 Exp $ */ package opennlp.ccgbank; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.util.ArrayList; import java.util.List; import java.util.StringTokenizer; import javax.xml.transform.Source; import javax.xml.transform.sax.SAXSource; import opennlp.ccgbank.convert.InfoHelper; import opennlp.ccgbank.convert.XSLTTrueCaser; import opennlp.ccgbank.convert.MorphLookup; import opennlp.ccgbank.parse.CCGbankDerivation; import opennlp.ccgbank.parse.SimpleNode; import org.apache.tools.ant.BuildException; import org.apache.tools.ant.DefaultLogger; import org.apache.tools.ant.Project; import org.apache.tools.ant.ProjectHelper; import org.jdom.Document; import org.jdom.Element; import org.jdom.transform.JDOMSource; import org.xml.sax.InputSource; /** * Converts the CCGBank to a modified version for grammar extraction. *
<p>
    * Within this task, a series of FileLists is specified. These * files are the lists of xsltProcessors that should be used to transform the * CCGBank. These xsltProcessors are processed in the order they occur in the * FileList specified within this task. * @author Scott Martin * @author Rajakrishnan Rajkumar * @version $Revision: 1.8 $ * @see CCGBankExtract */ public class CCGBankConvert extends CCGBankTask { /** Flag for whether to keep case-marking preps in PP categories; defaults to false. */ boolean keepPPHeads = false; TreeWalker treeWalker = new TreeWalker(); CCGbankDerivation deriv = null; File auxFileDirectory, bbnAuxDirectory, wordsFile, stemsFile, currentDirectory = null; /* (non-Javadoc) * @see opennlp.ccgbank.CCGBankTask#start() */ @Override protected void start() throws BuildException { InfoHelper.init(auxFileDirectory, bbnAuxDirectory); String trueCaseListPath=auxFileDirectory.getAbsolutePath()+"/"+"truecase-list.gz"; XSLTTrueCaser.init(trueCaseListPath); try { MorphLookup.init(wordsFile, stemsFile); } catch(IOException io) { throw new BuildException("problem loading words or stems", io, getLocation()); } xsltProcessor = useXMLFilter ? new XMLFilterProcessor(this, this) : new TemplatesProcessor(this); // "prime" parser // TODO fix this hack!! try { File tmp = File.createTempFile(getClass().getName(), "prime"); tmp.deleteOnExit(); deriv = new CCGbankDerivation(new FileReader(tmp)); } catch(IOException e) { throw new BuildException("Problem priming parser: " + e.getMessage(), e, getLocation()); } } /** Read aux files for the next WSJ section **/ @Override protected void nextDirectory(File section) throws BuildException { currentDirectory = section; // only create if a numbered directory File d = new File(target, currentDirectory.getName()); if(!d.exists() && !d.mkdirs()) { throw new BuildException("unable to create directory " + d); } //Read in aux files try { InfoHelper.readBBNAuxfiles(section.getName()); InfoHelper.readQuoteAuxfiles(section.getName()); InfoHelper.readPTBAuxfiles(section.getName()); InfoHelper.readTreeAuxfiles(section.getName()); } catch(NumberFormatException nfe) { // not a numbered PTB directory } } @Override protected InputSource nextFile(File file) throws BuildException { try { Reader reader = new BufferedReader(new FileReader(file)); if(deriv == null) { deriv = new CCGbankDerivation(reader); } else { CCGbankDerivation.ReInit(reader); } SimpleNode root = CCGbankDerivation.start(); Element result = new Element("Derivation"); String fileName = file.getName(); int start = fileName.contains(File.separator) ? fileName.lastIndexOf(File.separatorChar) : 0; StringBuilder sb = new StringBuilder( fileName.substring(start, fileName.lastIndexOf('.'))); sb.append(".xml"); File targetDir = new File(target, currentDirectory.getName()); File targetFile = new File(targetDir, sb.toString()); xsltProcessor.resetSerializer(); xsltProcessor.setTarget(targetFile); Document doc = new Document(treeWalker.eval(root, result)); // TODO attempt to get error reporting for file / line !! 
Source s = new JDOMSource(doc); s.setSystemId(file.toURI().toString()); return SAXSource.sourceToInputSource(s); } catch(Exception e) { throw new BuildException(e, getLocation()); } } /** @param keepPPHeads the keepPPHeads value to set */ public void setKeepPPHeads(boolean keepPPHeads) { this.keepPPHeads = keepPPHeads; } /** * @param stemsFile the stemsFile to set */ public void setStemsFile(File stemsFile) { this.stemsFile = stemsFile; } /** * @param wordsFile the wordsFile to set */ public void setWordsFile(File wordsFile) { this.wordsFile = wordsFile; } /** * @param auxFileDirectory the auxFileDirectory to set */ public void setAuxFileDirectory(File auxFileDirectory) { this.auxFileDirectory = auxFileDirectory; } /** * @param bbnAuxDirectory the bbnAuxDirectory to set */ public void setBbnAuxDirectory(File bbnAuxDirectory) { this.bbnAuxDirectory = bbnAuxDirectory; } public static void main(String[] args) { File baseDir = new File(System.getProperty("user.dir")); File buildFile = new File(baseDir, "build.xml"); Project project = new Project(); project.init(); project.setBaseDir(baseDir); ProjectHelper helper = ProjectHelper.getProjectHelper(); project.setProjectReference(helper); helper.parse(project, buildFile); DefaultLogger logger = new DefaultLogger(); logger.setErrorPrintStream(System.err); logger.setOutputPrintStream(System.out); project.addBuildListener(logger); project.executeTarget("convert-base"); } class TreeWalker { // General purpose datastructure to store ccgbank indices of categories. // Refreshed after the lifespan of a node is over. public List idList = new ArrayList(); // flag for whether under a leaf node; // used to control whether to add fs id's private boolean underLeaf = false; public Element eval(SimpleNode node, Element root) throws Exception { // No:of children of any given node int numC = node.jjtGetNumChildren(); // Loop & flag variables int i = 0; SimpleNode child; // Processing the child nodes of the current node. for (i = 0; i < numC; i++) { child = (SimpleNode) node.jjtGetChild(i); // Cat spec without co-indexation info in the leafnodes if (child.type.equals("Redundant")) { if (node.type.equals("Leafnode")) { node.catRedundant = child.print(); continue; } // Processes treenode categories if (node.type.equals("Treenode")) child.type = "complexcat"; } // The header node is accessed and the CCGbankId is passed on to the // treenode root of the sentence which is processed next if (child.type.equals("Header")) { i++; String temp1 = child.getHeader(); int spacePos = temp1.indexOf(' '); if (spacePos > 0) temp1 = temp1.substring(0, spacePos); child = (SimpleNode) node.jjtGetChild(i); child.setHeader(temp1); // System.out.println(temp1); } // Xml element which is going to be generated. Element leaf = new Element(child.type); if (child.type.equals("complexcat") || child.type.equals("Treenode") || child.type.equals("Leafnode")) { // Atomic categories are represented in the javacc tree as // catSpec-aotmcat. So for such cases the catSpec child is // skipped and the next child is accessed. if (child.jjtGetNumChildren() == 1 && child.type.equals("complexcat")) { child = (SimpleNode) child.jjtGetChild(0); // The element which is to be added to the xml // representation leaf = new Element("atomcat"); // Extracting the content of the node and storing it. node.cat = child.print(); } else { // A complexcat element is created. leaf = ccinserter(child, leaf); // Extracting the content of the node and storing it. 
if (root.getName().equals("Leafnode")) { node.cat = child.print(); } if (root.getName().equals("Treenode") && leaf.getName().equals("complexcat")) { // Leafnode and treenode cat spec elements created node.cat = child.print(); } // Recursive processing of the children of the current node leaf = eval(child, leaf); // Sending Leaf,Tree nodes for to a function which inserts // the family (ie normalized cat spec) of its contents. if (!child.type.equals("complexcat")) leaf = catInserter(child, leaf); // Adding the current element to its parent in the xml tree. root.addContent(leaf); continue; } } // Slash elements added to the tree if (child.cat.equals("/") || child.cat.equals("\\")) { leaf = opinserter(child); root.addContent(leaf); continue; } // Atomcat elements added to the tree leaf = atomcatinserter(child); root.addContent(leaf); // if(!child.type.matches("\\p{Punct}")) } // The final result of the above operations returned. return root; } public Element ccinserter(SimpleNode node, Element leaf) { // This function produces complexcat/treenode/leafnode elements. // mww: the name of this function is not very helpful // The node can be of any of the above types. String name; name = node.type; // Set treebankId,parseNo at root of the sent String h = node.getHeader(); if (h != null) leaf.setAttribute("Header", h); // Treenode info ie head,daughter represented if (name.equals("Treenode")) { leaf.setAttribute("head", node.head); leaf.setAttribute("dtr", node.dtr); idList.clear(); underLeaf = false; } // Leafnode info represented if (name.equals("Leafnode")) { leaf.setAttribute("lexeme", node.lex); // nb: may be truecased later leaf.setAttribute("lexeme0", node.lex); leaf.setAttribute("pos", node.pos); idList.clear(); underLeaf = true; } // add propbank info here if (node.nodeRoles != null) { String roles = ""; for (SimpleNode.LexSenseRole lexSenseRole : node.nodeRoles) { if (lexSenseRole.role.equals("rel")) { leaf.setAttribute("rel", lexSenseRole.lex + "." + lexSenseRole.sense); } else { String role = adjustRole(lexSenseRole.role); roles += lexSenseRole.lex + "." + lexSenseRole.sense + ":" + role + " "; } } if (roles.length() > 0) leaf.setAttribute("roles", roles.trim()); } if (node.argRoles != null) { String args = ""; for (String role : node.argRoles) { role = adjustRole(role); args += role + " "; } leaf.setAttribute("argRoles", args.trim()); } // done return leaf; } public Element atomcatinserter(SimpleNode node) throws Exception { // Predicate for atomcat creation. // Flag signifies whether the elem is a single atomcat. // Relevant as if the present cat is an atomcat LF variable can be set // in the syntax here itself. StringTokenizer lex; Element atomcat = new Element("atomcat"); Element fs = new Element("fs"); Element feat = new Element("feat"); String id = "NotGiven"; String form; // Current element info extracted from the node String elem = node.print(); // Index extracted by sensing undescore delimiter if (elem.contains("_")) { // elem=elem.replaceAll(":[A-Z]",""); lex = new StringTokenizer(elem, "_"); elem = lex.nextToken(); id = lex.nextToken(); String x[] = id.split(":"); // System.out.println(id); if (x.length == 2) { // mww: moved this to atomcat //feat.setAttribute("attr", "dep"); //feat.setAttribute("val", x[1]); //fs.addContent(feat); atomcat.setAttribute("dep", x[1]); } id = id.replaceAll(":[A-Z]", ""); } else { // Default id 1 is alloted otherwise id = Integer.toString(idList.size() + 1); idList.add(id); } // Normalizing the id by comparing with previous indices. 
if (!idList.contains(id)) idList.add(id); id = Integer.toString(idList.indexOf(id) + 1); // Normalized index is set (leaves only) if (underLeaf) fs.setAttribute("id", id); // make lowercase elem = elem.toLowerCase(); // remove superfluous [nb] in np[nb] elem = elem.replaceAll("np\\[nb\\]", "np"); // also strip PP heads if apropos elem = stripPPHeads(elem); // Form attrtibute detected and set. if (elem.contains("[")) { lex = new StringTokenizer(elem, "[,]"); elem = lex.nextToken(); form = lex.nextToken(); feat = new Element("feat"); feat.setAttribute("attr", "form"); feat.setAttribute("val", form); fs.addContent(feat); } // add fs if non-empty if (underLeaf || fs.getContentSize() > 0) atomcat.addContent(fs); atomcat.setAttribute("type", elem); return atomcat; } public Element opinserter(SimpleNode node) { Element slash = new Element("slash"); String dir; String op = node.cat; // Slash direction sensed and set. if (op.equals("\\")) dir = "<"; else dir = ">"; slash.setAttribute("dir", op); slash.setAttribute("mode", dir); return slash; } public Element catInserter(SimpleNode node, Element leaf) { int i; // The normalization process. Relevant indices replaced by 1,2,3..n if (idList.size() > 0) { for (i = 0; i < idList.size(); i++) node.cat = node.cat.replaceAll(idList.get(i), Integer.toString(i + 1)); } String l = node.getLeftover(); if (l != null) node.cat = node.cat + l; // Purging the cat spec of indices outside brackets & colons ie )_2 ,:B int ind = node.cat.indexOf(")_"); while (ind != -1) { String str1 = node.cat.substring(0, ind + 1); String str2 = node.cat.substring(ind + 1, node.cat.length()); str2 = str2.replaceFirst("_(\\p{Digit})++", ""); // System.out.println(str1); // System.out.println(str2); node.cat = str1 + str2; ind = node.cat.indexOf(")_"); } node.cat = node.cat.replaceAll(":[A-Z]", ""); // Add categories with normalized indices, lowercased String cat = node.cat.toLowerCase(); cat = cat.replaceAll("np\\[nb\\]", "np"); // also strip PP heads if apropos cat = stripPPHeads(cat); leaf.setAttribute("cat", cat); String cat0 = ""; // Add the same category to the treenodes if (node.type.equals("Treenode")) cat0 = node.cat; else cat0 = node.catRedundant; // Add the bare category to the leafnodes leaf.setAttribute("cat0", cat0); // Refresh index list. idList.clear(); return leaf; } } // strips PP heads if apropos private String stripPPHeads(String cat) { if (keepPPHeads) return cat; return cat.replaceAll("pp\\[[a-z]+\\]", "pp"); } // adjusts role, stripping PP head if apropos private String adjustRole(String role) { role = role.replaceFirst("ARG", "Arg"); if (!keepPPHeads) { int hyph = role.indexOf('-'); if (hyph > 0) role = role.substring(0, hyph); } return role; } } ================================================ FILE: src/opennlp/ccgbank/CCGBankExtract.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: CCGBankExtract.java,v 1.5 2011/11/04 01:49:57 raja-asoka Exp $ */ package opennlp.ccgbank; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; import java.io.PrintWriter; import java.io.Writer; import java.util.EnumMap; import java.util.Map; import javax.xml.transform.Source; import javax.xml.transform.Templates; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.URIResolver; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.stream.StreamSource; import opennlp.ccgbank.CCGBankTaskTemplates.Type; import opennlp.ccgbank.extract.FreqTally; import opennlp.ccgbank.extract.RulesTally; import opennlp.ccgbank.extract.Testbed; import org.apache.tools.ant.BuildException; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Extracts a grammar from a converted version of the CCGBank. * @author Scott Martin * @version $Revision: 1.5 $ * @see CCGBankConvert */ public class CCGBankExtract extends CCGBankTask implements URIResolver { static String pkgPath = null; static final String LEXICON_TEMPLATE = "lexicon-base.xsl", RULES_TEMPLATE = "rules-base.xsl"; String grammarName = "ccgbankextract"; boolean pPheads = true, skipUnmatched = false; int catFreqCutoff = 1, lexFreqCutoff = 1, openFreqCutoff = 100, ruleFreqCutoff = 1; CCGBankTaskTestbed testbed = null; File lexiconTempFile, rulesTempFile; TemplatesProcessor ruleProcessor; Map xsltProcessors = new EnumMap(Type.class); public CCGBankExtract() { super(); if(pkgPath == null) { pkgPath = getClass().getPackage().getName().replace('.', '/'); } } /** * Sets the name of the generated grammar. * @param grammarName The name of the generated grammar. This is the string * that will appear in the "name" attribute of the root element of the * generated grammar's grammar.xml file. 
*/ public void setGrammarName(String grammarName) { this.grammarName = grammarName; } /** * @param tb the testbed to set */ public void addConfiguredTestbed(CCGBankTaskTestbed tb) { this.testbed = tb; } /** * @param catFreqCutoff the catFreqCutoff to set */ public void setCatFreqCutoff(int catFreqCutoff) { this.catFreqCutoff = catFreqCutoff; } /** * @param lexFreqCutoff the lexFreqCutoff to set */ public void setLexFreqCutoff(int lexFreqCutoff) { this.lexFreqCutoff = lexFreqCutoff; } /** * @param openFreqCutoff the openFreqCutoff to set */ public void setOpenFreqCutoff(int openFreqCutoff) { this.openFreqCutoff = openFreqCutoff; } /** * @param pPheads the ppheads to set */ public void setPPheads(boolean pPheads) { this.pPheads = pPheads; } /** * @param ruleFreqCutoff the ruleFreqCutoff to set */ public void setRuleFreqCutoff(int ruleFreqCutoff) { this.ruleFreqCutoff = ruleFreqCutoff; } /** * @param skipUnmatched the skipUnmatched to set */ public void setSkipUnmatched(boolean skipUnmatched) { this.skipUnmatched = skipUnmatched; } /* (non-Javadoc) * @see javax.xml.transform.URIResolver#resolve(java.lang.String, java.lang.String) */ public Source resolve(String href, String base) { if(href != null && href.length() > 0 && href.startsWith(pkgPath)) { String lastChunk = (href.contains("/") && !href.endsWith("/")) ? href.substring(href.lastIndexOf('/') + 1) : href; if(lastChunk.endsWith(CCGBankExtract.LEXICON_TEMPLATE) || lastChunk.endsWith(CCGBankExtract.RULES_TEMPLATE)) { return new StreamSource(getResource(href)); } } return new StreamSource(new File(href)); } /* (non-Javadoc) * @see opennlp.ccgbank.CCGBankTask#addConfiguredCCGBankTaskTemplates(opennlp.ccgbank.CCGBankTaskTemplates) */ @Override public void addConfiguredTemplates(CCGBankTaskTemplates taskTemplates) { if(xsltProcessors.containsKey(taskTemplates.type)) { throw new BuildException(taskTemplates.type + " extraction type is multiply defined"); } XSLTProcessor xp = useXMLFilter ? 
new XMLFilterProcessor(this, this) : new TemplatesProcessor(this); xp.addTemplates(taskTemplates); xp.transformerFactory.setURIResolver(this); xsltProcessors.put(taskTemplates.type, xp); } /* (non-Javadoc) * @see opennlp.ccgbank.CCGBankTask#start() */ @Override protected void start() throws BuildException { xsltProcessor = new TemplatesProcessor(this); ((TemplatesProcessor)xsltProcessor).addTemplates( loadTemplates(pkgPath + "/" + CCGBankExtract.LEXICON_TEMPLATE)); ruleProcessor = new TemplatesProcessor(this); ruleProcessor.addTemplates(loadTemplates(pkgPath + "/" + CCGBankExtract.RULES_TEMPLATE)); FreqTally.reset(); FreqTally.CAT_FREQ_CUTOFF = catFreqCutoff; FreqTally.LEX_FREQ_CUTOFF = lexFreqCutoff; FreqTally.OPEN_FREQ_CUTOFF = openFreqCutoff; RulesTally.reset(); RulesTally.RULE_FREQ_CUTOFF = ruleFreqCutoff; RulesTally.KEEP_UNMATCHED = !skipUnmatched; try { lexiconTempFile = File.createTempFile(grammarName, ".xml"); lexiconTempFile.deleteOnExit(); xsltProcessor.setTarget(lexiconTempFile); rulesTempFile = File.createTempFile(grammarName + "-rules", ".xml"); rulesTempFile.deleteOnExit(); ruleProcessor.setTarget(rulesTempFile); Writer w = xsltProcessor.serializer.getWriter(); w.write(""); w.flush(); Writer rw = ruleProcessor.serializer.getWriter(); rw.write(""); rw.flush(); } catch(IOException io) { throw new BuildException(io, getLocation()); } } /* (non-Javadoc) * @see opennlp.ccgbank.CCGBankTask#nextFile(java.io.File) */ @Override protected InputSource nextFile(File file) throws BuildException { try { ruleProcessor.process(super.nextFile(file)); } catch(IOException io) { throw new BuildException("I/O problem processing " + file + ": " + io.getMessage(), io, getLocation()); } catch(SAXException se) { throw new BuildException("Problem processing " + file + ": " + se.getMessage(), se, getLocation()); } catch(TransformerException te) { throw new BuildException("Problem processing " + file + ": " + te.getMessageAndLocation(), te, getLocation()); } return super.nextFile(file); // TODO is this right? } /* (non-Javadoc) * @see opennlp.ccgbank.CCGBankTask#finish() */ @Override protected void finish() throws BuildException { try { Writer w = xsltProcessor.serializer.getWriter(); w.write(""); w.close(); Writer rw = ruleProcessor.serializer.getWriter(); rw.write(""); rw.close(); } catch(IOException io) { throw new BuildException(io, getLocation()); } // generate lexicon, morph, rules for(Type t : xsltProcessors.keySet()) { if(t == Type.LEXICON) { try { FreqTally.printTally(target); } catch(FileNotFoundException fnfe) { throw new BuildException("problem generating frequencies", fnfe, getLocation()); } } else if(t == Type.RULES) { try { RulesTally.printTally(target); } catch(FileNotFoundException fnfe) { throw new BuildException( "problem generating rule frequencies", fnfe, getLocation()); } } String fileName = t.fileName(); log("Generating " + fileName); try { XSLTProcessor xp = xsltProcessors.get(t); xp.setTarget(new File(target, fileName)); xp.process(new InputSource( new BufferedInputStream(new FileInputStream( (t == Type.RULES) ? 
rulesTempFile : lexiconTempFile)))); } catch(IOException io) { throw new BuildException("I/O problem writing " + fileName, io, getLocation()); } catch(TransformerException te) { throw new BuildException("Problem transforming " + fileName + ": " + te.getMessageAndLocation(), te, getLocation()); } catch(SAXException se) { throw new BuildException("Problem transforming " + fileName + ": " + se.getMessage(), se, getLocation()); } } // generate grammar.xml, if it doesn't already exist // nb: should eventually make schema refs relative to OPENCCG_HOME try { File gramFile = new File(target, "grammar.xml"); if (!gramFile.exists()) { log("Generating grammar.xml"); PrintWriter gramOut = new PrintWriter(new FileWriter(gramFile)); gramOut.println(""); gramOut.println(""); gramOut.println(" "); gramOut.println(" "); gramOut.println(" "); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.println(""); gramOut.close(); } } catch(IOException io) { throw new BuildException("problem generating grammar.xml", io, getLocation()); } if(testbed != null) { log("Creating testbed ..."); try { Testbed ct = new Testbed(ccgBankTaskSources, target, testbed); ct.createTestFiles(); } catch(Exception e) {e.printStackTrace(); throw new BuildException("problem generating testbed: " + e.getMessage(), e, getLocation()); } } } Templates loadTemplates(String resourceName) throws BuildException { try { // XXX nb: no xsltc option this way //TransformerFactory tf = XSLTProcessor.newTransformerFactory(); SAXTransformerFactory tf = (SAXTransformerFactory)TransformerFactory.newInstance(); return tf.newTemplates(new StreamSource(new BufferedInputStream( getResource(resourceName)))); } catch(TransformerConfigurationException e) { throw new BuildException("Problem loading template " + resourceName + ": " + e.getMessage(), e, getLocation()); } } /** * Loads a resource using the fully qualified name with the current * class loader */ InputStream getResource(String resourceName) { return getClass().getClassLoader().getResourceAsStream(resourceName); } } ================================================ FILE: src/opennlp/ccgbank/CCGBankTask.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
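// CCGBankExtract is an Ant task (via CCGBankTask, defined next), so it is normally
// configured from a build file; reading its setters and addConfigured* methods, a
// programmatic invocation would look roughly like the sketch below. The directory
// names, the grammar name, and the omission of the <templates> filelists are
// illustrative assumptions, not the project's actual build configuration.
package opennlp.ccgbank;

import java.io.File;
import org.apache.tools.ant.Project;
import org.apache.tools.ant.types.FileSet;

public class ExtractDriver {
    public static void main(String[] args) {
        Project proj = new Project();
        proj.init();

        CCGBankExtract extract = new CCGBankExtract();
        extract.setProject(proj);
        extract.setTarget(new File("extract/grammar"));  // output directory for lexicon.xml etc.
        extract.setGrammarName("ccgbank");               // name attribute written into grammar.xml
        extract.setCatFreqCutoff(1);
        extract.setRuleFreqCutoff(1);

        // the converted corpus files to process
        CCGBankTaskSources sources = new CCGBankTaskSources();
        FileSet corpus = new FileSet();
        corpus.setProject(proj);
        corpus.setDir(new File("converted"));
        corpus.setIncludes("**/*.xml");
        sources.addConfiguredFileSet(corpus);
        extract.addConfiguredSources(sources);

        // in a real build, LEXICON/MORPH/RULES <templates> filelists would also be
        // added via addConfiguredTemplates before running the task
        extract.execute();
    }
}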
////////////////////////////////////////////////////////////////////////////// /* * $Id: CCGBankTask.java,v 1.5 2010/11/30 18:44:32 mwhite14850 Exp $ */ package opennlp.ccgbank; import java.io.BufferedInputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import javax.xml.transform.ErrorListener; import javax.xml.transform.TransformerException; import opennlp.ccgbank.parse.TokenMgrError; import org.apache.tools.ant.BuildException; import org.apache.tools.ant.Task; import org.apache.tools.ant.TaskContainer; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; /** * Abstract class to provide functionality for applying XSLT to an XML stream. *

    * This class is designed to be run as a task from within an Ant build file. * There is one concrete implementer for each of the separate tasks of * converting the CCGBank and extracting a grammar for the converted corpus. * @author Scott Martin * @version $Revision: 1.5 $ * @see CCGBankConvert * @see CCGBankExtract * @see Ant home page */ public abstract class CCGBankTask extends Task implements TaskContainer,ErrorHandler,ErrorListener { File target; Set ccgBankTaskSources = new HashSet(); List ccgBankTaskTemplates = new ArrayList(); XSLTProcessor xsltProcessor = null; boolean useXMLFilter = true; boolean terminateOnError = true, terminateOnWarning = false; /** * Counters */ int directoriesProcessed, filesProcessed, warnings, errors; /** * Adds a sub-task, but included here only for binary compatibility. * @throws BuildException Always throws an exception, only * sourcesSet or xsltProcessors can be added to this task. */ public void addTask(Task task) { throw new BuildException("nested task \"" + task.getTaskName() + "\" not supported, only sourcesSet or xsltProcessors"); } /** * Sets the target directory. * @param target The location of the result of the XSLT conversion (the * converted corpus, extracted grammar, etc.). * @throws BuildException If the target is null * or not a directory. */ public void setTarget(File target) { this.target = target; if(target == null) { throw new BuildException("no target specified"); } if(!target.exists()) { target.mkdirs(); } else if(!target.isDirectory()) { throw new BuildException("specified target is not a directory"); } } /** * @param terminateOnError the terminateOnError to set */ public void setTerminateOnError(boolean terminateOnError) { this.terminateOnError = terminateOnError; } /** * @param terminateOnWarning the terminateOnWarning to set */ public void setTerminateOnWarning(boolean terminateOnWarning) { this.terminateOnWarning = terminateOnWarning; } /** * @param useXMLFilter the useXMLFilter to set */ public void setUseXMLFilter(boolean useXMLFilter) { this.useXMLFilter = useXMLFilter; } /** * Adds a file set of source files. */ public void addConfiguredSources(CCGBankTaskSources sources) { ccgBankTaskSources.add(sources); } /** * Adds a series of xsltProcessors for XSLT transformation. */ public void addConfiguredTemplates(CCGBankTaskTemplates templates) { ccgBankTaskTemplates.add(templates); } /** * Hook to be overridden by subclasses that want notification of the start * of the transformation process. */ protected void start() throws BuildException { // to be overridden } /** * Hook to be overridden by subclasses that want notification of the end * of the transformation process. */ protected void finish() throws BuildException { // to be overridden } /** * Hook that lets subclasses be notified when processing starts on a new * directory. * @param section The file (directory) on which processing is starting. */ protected void nextDirectory(File section) throws BuildException { // to be overridden } /** * Hook that lets implementing subclasses know when processing starts on * a new file. * @param file The file on which processing is about to start. * @return The input source to process. */ protected InputSource nextFile(File file) throws BuildException { try { return new InputSource( new BufferedInputStream(new FileInputStream(file))); } catch(FileNotFoundException fnfe) { throw new BuildException("Unable to find file " + file, fnfe, getLocation()); } } /** * Required by {@link ErrorHandler}. 
Reports the specified error using the * Ant task {@link Task#log(String)} method. */ public void error(SAXParseException exception) { errors++; handleError("Error", exception, terminateOnError); } /** * Required by {@link ErrorHandler}. Reports the specified error using the * Ant task {@link Task#log(String)} method. */ public void fatalError(SAXParseException exception) { errors++; handleError("Fatal error", exception, terminateOnError); } /** * Required by {@link ErrorHandler}. Reports the specified error using the * Ant task {@link Task#log(String)} method. */ public void warning(SAXParseException exception) { warnings++; handleError("Warning", exception, terminateOnWarning); } /** * Required by {@link ErrorListener}. Reports the specified error using the * Ant task {@link Task#log(String)} method. */ public void error(TransformerException exception) { errors++; handleError("Error", exception, terminateOnError); } /** * Required by {@link ErrorListener}. Reports the specified error using the * Ant task {@link Task#log(String)} method. */ public void fatalError(TransformerException exception) { errors++; handleError("Fatal error", exception, terminateOnError); } /** * Required by {@link ErrorListener}. Reports the specified error using the * Ant task {@link Task#log(String)} method. */ public void warning(TransformerException exception) { warnings++; handleError("Warning", exception, terminateOnWarning); } /** * Helper method for the methods required by {@link ErrorHandler}. */ void handleError(String prefix, SAXParseException spe, boolean terminate) { StringBuilder sb = new StringBuilder(prefix); sb.append(": problem in parse: "); sb.append(spe.getSystemId()); sb.append(" on line "); sb.append(spe.getLineNumber()); sb.append(", column "); sb.append(spe.getColumnNumber()); sb.append(": "); sb.append(spe.getMessage()); if(!terminate) { log(sb.toString()); } else { throw new BuildException(sb.toString(), spe, getLocation()); } } /** * Helper method for the methods required by {@link ErrorListener}. */ void handleError(String prefix, TransformerException te, boolean terminate) { StringBuilder sb = new StringBuilder(prefix); sb.append(": problem in transform: "); sb.append(te.getMessageAndLocation()); if(!terminate) { log(sb.toString()); } else { throw new BuildException(sb.toString(), te, getLocation()); } } /** * Does the work of transforming the CCGBank and extracting grammars. * @throws BuildException In case no sourcesSet have been specified or an * error occurs during the transformation process. *

    * This method calls {@link #start()}, {@link #finish()}, * {@link #nextDirectory(File)}, and {@link #nextFile(File)} as required. */ @Override public void execute() throws BuildException { if(ccgBankTaskSources.isEmpty()) { throw new BuildException("no sourcesSet specified"); } filesProcessed = directoriesProcessed = warnings = errors = 0; start(); log("Target: " + target); if(xsltProcessor == null) { // should have been configured throw new BuildException("null XSLT processor"); } xsltProcessor.addAllTemplates(ccgBankTaskTemplates); try { for(CCGBankTaskSources sources : ccgBankTaskSources) { File prevDir = null; File currentDir = null; for(File file : sources) { currentDir = file.getParentFile(); if(!currentDir.equals(prevDir)) { log("Processing " + currentDir + " ..."); directoriesProcessed++; nextDirectory(currentDir); } prevDir = currentDir; log("Processing " + file); filesProcessed++; xsltProcessor.process(nextFile(file)); } } } catch(IOException io) { throw new BuildException("I/O problem during processing: " + io.getMessage(), io, getLocation()); } catch(SAXException se) { throw new BuildException("Problem during processing: " + se.getMessage(), se, getLocation()); } catch(TransformerException te) { throw new BuildException("I/O problem during processing: " + te.getMessageAndLocation(), te, getLocation()); } catch (TokenMgrError te) { throw new BuildException("I/O problem during processing: " + te.getMessage(), te, getLocation()); } finally { finish(); StringBuilder sb = new StringBuilder("Processed "); sb.append(filesProcessed); sb.append(" files in "); sb.append(directoriesProcessed); sb.append(" directories with "); sb.append(errors); sb.append(" error"); if(errors != 1) { sb.append('s'); } sb.append(" and "); sb.append(warnings); sb.append(" warning"); if(warnings != 1) { sb.append('s'); } log(sb.toString()); } } } ================================================ FILE: src/opennlp/ccgbank/CCGBankTaskFileGroup.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: CCGBankTaskFileGroup.java,v 1.1 2009/11/09 19:21:50 mwhite14850 Exp $ */ package opennlp.ccgbank; import java.io.File; import java.util.Collection; import java.util.Iterator; import java.util.NoSuchElementException; import org.apache.tools.ant.BuildException; import org.apache.tools.ant.Task; import org.apache.tools.ant.TaskContainer; import org.apache.tools.ant.types.FileList; import org.apache.tools.ant.types.FileSet; /** * Abstract class providing generalized functionality for file groups used by * {@link CCGBankTask}s. *

    * For convenience, this class implements {@link Iterable} over {@link File}s. * This allows instances to be used in standard iteration constructs while * abstracting away from implementation details such as multiple * {@link FileSet}s or {@link FileList}s. * @author Scott Martin * @version $Revision: 1.1 $ */ public abstract class CCGBankTaskFileGroup extends Task implements TaskContainer,Iterable { protected Collection subGroups; /** * Creates a file group over the specified collection of subgroups. * @param subGroups The collection over which this class is an abstracted * view. */ protected CCGBankTaskFileGroup(Collection subGroups) { this.subGroups = subGroups; } /** * Adds a subgroup to the collection of subgroups this class abstracts over. */ protected void addGroup(G group) { subGroups.add(group); } /** * Gets all the files in a group as an array. To be implemented by extending * classes, as {@link FileSet} and {@link FileList} represent files * differently. * @return The collection of files in group, as an array. */ protected abstract File[] getFiles(G group); /** * Creates an array of files given a directory and an array of file names * (specified relative to that directory). * @param directory The directory that the specified file names are relative * to. * @param fileNames The file names, relative to the specified directory. * @return An array containing all the files as specified relative to the * specified directory. */ protected File[] makeFiles(File directory, String[] fileNames) { File[] files = new File[fileNames.length]; for(int i = 0; i < fileNames.length; i++) { files[i] = new File(directory, fileNames[i]); } return files; } /** * Included for binary compatibility with {@link TaskContainer}. * @throws BuildException Always throws a build exception, as only the * parameterized type of this class's subgroups can be contained by this * task. */ public void addTask(Task task) { throw new BuildException("nested task \"" + task + "\" not supported, only " + subGroups.getClass().getTypeParameters()[0] .getGenericDeclaration()); } /** * Provides an * iterator over all the files in the collection of subgroups contained by * this instance. The iterator returned will iterate through files in all * the subgroups returned in the same order as the order returned by the * subgroups collection. */ public Iterator iterator() { return new AllFileView(); } /** * Implements an iterator over the files contained in the subgroups * collection. This class iterates over all the files contained in the * groups in the subgroups collection, in the order that they * are returned by the subgroups collection. * @author Scott Martin * @version $Revision: 1.1 $ */ class AllFileView implements Iterator { Iterator groupIterator = subGroups.iterator(); Iterator currentIterator; /** * Tests whether there is a next file. * @return true If the current subgroup contains a next file, or if * there is a next subgroup that is non-empty. */ public boolean hasNext() { while((currentIterator == null || !currentIterator.hasNext()) && groupIterator.hasNext()) { currentIterator = new FileArrayIterator( getFiles(groupIterator.next())); } // current may be empty return (currentIterator != null && currentIterator.hasNext()); } /** * Gets the next file in the series, as returned in order by the * subgroups collection. * @throws NoSuchElementException If the collection of subgroups is * exhausted. 
*/ public File next() { if(!hasNext()) { throw new NoSuchElementException("elements exhausted"); } return currentIterator.next(); } /** * Included only for binary compatibility with {@link Iterator}. * @throws UnsupportedOperationException Always, as this operation is * not supported. */ public void remove() { throw new UnsupportedOperationException("removed not supported"); } } /** * Implements an iterator view of an array of {@link File} objects. * @author Scott Martin * @version $Revision: 1.1 $ */ class FileArrayIterator implements Iterator { File[] array; int index = 0; /** * Creates a new iterator view over the specified array of files. * @param array The file array backing this iterator view. */ FileArrayIterator(File[] array) { this.array = array; } /** * Tests whether the array of files is exhausted. * @return true If the current index is less than the array length. */ public boolean hasNext() { return (index < array.length); } /** * Gets the next file in series, as specified by the array backing this * iterator view. * @throws NoSuchElementException If the array of files is exhausted. * @see #hasNext() */ public File next() { if(!hasNext()) { throw new NoSuchElementException("elements exhausted"); } return array[index++]; } /** * Included only for binary compatibility with {@link Iterator}. * @throws UnsupportedOperationException Always, as this operation is * not supported. */ public void remove() { throw new UnsupportedOperationException("remove not supported"); } } } ================================================ FILE: src/opennlp/ccgbank/CCGBankTaskSources.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: CCGBankTaskSources.java,v 1.1 2009/11/09 19:21:50 mwhite14850 Exp $ */ package opennlp.ccgbank; import java.io.File; import java.util.HashSet; import org.apache.tools.ant.BuildException; import org.apache.tools.ant.DirectoryScanner; import org.apache.tools.ant.Project; import org.apache.tools.ant.types.FileSet; /** * Represents a set of source files. This class encapsulates a set of * {@link FileSet}s. * @author Scott Martin * @version $Revision: 1.1 $ * @see Ant home page * @see FileSet */ public class CCGBankTaskSources extends CCGBankTaskFileGroup { /** * Creates a new sources object (required by Ant). */ public CCGBankTaskSources() { super(new HashSet()); } /** * Adds a file set to this sources object. */ public void addConfiguredFileSet(FileSet fileSet) { addGroup(fileSet); } /** * Gets the files in the specified group as an array of files. 
The files in * the returned array are in the order returned by group's * {@link FileSet#getDirectoryScanner(Project) directory scanner}. */ @Override protected File[] getFiles(FileSet group) { Project proj = getProject(); DirectoryScanner scanner = group.getDirectoryScanner(proj); scanner.scan(); String[] fileNames = scanner.getIncludedFiles(); if(fileNames.length == 0) { throw new BuildException("no source files included"); } return makeFiles(group.getDir(proj), fileNames); } } ================================================ FILE: src/opennlp/ccgbank/CCGBankTaskTemplates.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: CCGBankTaskTemplates.java,v 1.1 2009/11/09 19:21:50 mwhite14850 Exp $ */ package opennlp.ccgbank; import java.io.File; import java.util.ArrayList; import org.apache.tools.ant.Project; import org.apache.tools.ant.types.FileList; /** * Represents a series of templates. This class encapsulates a list of lists of * {@link FileList}s. * @author Scott Martin * @version $Revision: 1.1 $ * @see Ant home page * @see FileList */ public class CCGBankTaskTemplates extends CCGBankTaskFileGroup { /** * File types/names for the generated OpenCCG-format grammar files. * @author Scott Martin * @version $Revision: 1.1 $ */ enum Type { /** * The lexicon file. */ LEXICON, /** * The file containing morphological information. */ MORPH, /** * The file where the grammar rules are stored. */ RULES; /** * Gets a filename corresponding to a given file type. * @return The file type's name, lowercased, with the string * ".xml" appended. Example: for LEXICON, * returns the string lexicon.xml. */ String fileName() { StringBuilder sb = new StringBuilder(name().toLowerCase()); sb.append(".xml"); return sb.toString(); } } Type type = null; /** * Creates a new xsltProcessors object (no-arg constructor required by Ant). */ public CCGBankTaskTemplates() { super(new ArrayList()); } /** * Adds a file list to the list of transforms. * @param fileList The FileList object to add. */ public void addConfiguredFilelist(FileList fileList) { addGroup(fileList); } /** * Gets the list of files contained in group as an array. * The order of files in the returned array is the same as the order * of group's {@link FileList#getFiles(Project) files}. */ @Override protected File[] getFiles(FileList group) { Project proj = getProject(); return makeFiles(group.getDir(proj), group.getFiles(proj)); } /** * Sets the {@link CCGBankTaskTemplates#type file type}. * @param typeName The name of the type to set. 
The actual type is * coerced using {@link Enum#valueOf(Class, String)} using * typeName as an argument. */ public void setType(String typeName) { this.type = Type.valueOf(typeName.toUpperCase()); } } ================================================ FILE: src/opennlp/ccgbank/CCGBankTaskTestbed.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: CCGBankTaskTestbed.java,v 1.3 2010/12/09 04:58:12 mwhite14850 Exp $ */ package opennlp.ccgbank; import java.io.File; import org.apache.tools.ant.Task; /** * @author Scott Martin * @version $Revision: 1.3 $ * */ public class CCGBankTaskTestbed extends Task { boolean debugDerivations = false, showSem = false; File text, factors, combos, preds, treeAuxFile; /** * @return the combos */ public File getCombos() { return combos; } /** * @return the debugDerivations */ public boolean isDebugDerivations() { return debugDerivations; } /** * @return the showSem */ public boolean isShowsSem() { return showSem; } /** * @return the factors */ public File getFactors() { return factors; } /** * @return the preds */ public File getPreds() { return preds; } /** * @return the text */ public File getText() { return text; } //Get the file which stores info about the id info of treenodes public File getTree() { return treeAuxFile; } /** * @param combos the combos to set */ public void setCombos(File combos) { this.combos = combos; } /** * @param debugDerivations the debugDerivations to set */ public void setDebugDerivations(boolean debugDerivations) { this.debugDerivations = debugDerivations; } /** * @param showSem the showSem to set */ public void setShowSem(boolean showSem) { this.showSem = showSem; } /** * @param factors the factors to set */ public void setFactors(File factors) { this.factors = factors; } /** * @param preds the preds to set */ public void setPreds(File preds) { this.preds = preds; } /** * @param text the text to set */ public void setText(File text) { this.text = text; } public void setTree(File treeAuxFile) { this.treeAuxFile = treeAuxFile; } } ================================================ FILE: src/opennlp/ccgbank/InputSourceAdapter.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your 
option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: InputSourceAdapter.java,v 1.1 2009/11/09 19:21:50 mwhite14850 Exp $ * Copyright (C) 2009 Scott Martin (http://www.coffeeblack.org/contact/) */ package opennlp.ccgbank; import java.io.File; import java.io.InputStream; import java.io.Reader; import javax.xml.transform.Source; import javax.xml.transform.stream.StreamSource; import org.xml.sax.InputSource; /** * Turns an {@link InputSource} into a {@link StreamSource}. This class wraps * an input source for XSLT transformation routines that expect {@link Source} * objects. * @author Scott Martin * @version $Revision: 1.1 $ */ class InputSourceAdapter extends StreamSource { InputSource inputSource; InputSourceAdapter(InputSource inputSource) { this.inputSource = inputSource; } /** * @return * @see org.xml.sax.InputSource#getPublicId() */ @Override public String getPublicId() { return inputSource.getPublicId(); } /** * @return * @see org.xml.sax.InputSource#getSystemId() */ @Override public String getSystemId() { return inputSource.getSystemId(); } /** * @param publicId * @see org.xml.sax.InputSource#setPublicId(java.lang.String) */ @Override public void setPublicId(String publicId) { inputSource.setPublicId(publicId); } /** * @param systemId * @see org.xml.sax.InputSource#setSystemId(java.lang.String) */ @Override public void setSystemId(String systemId) { inputSource.setSystemId(systemId); } /* (non-Javadoc) * @see javax.xml.transform.stream.StreamSource#getInputStream() */ @Override public InputStream getInputStream() { return inputSource.getByteStream(); } /* (non-Javadoc) * @see javax.xml.transform.stream.StreamSource#getReader() */ @Override public Reader getReader() { return inputSource.getCharacterStream(); } /* (non-Javadoc) * @see javax.xml.transform.stream.StreamSource#setInputStream(java.io.InputStream) */ @Override public void setInputStream(InputStream inputStream) { inputSource.setByteStream(inputStream); } /* (non-Javadoc) * @see javax.xml.transform.stream.StreamSource#setReader(java.io.Reader) */ @Override public void setReader(Reader reader) { inputSource.setCharacterStream(reader); } /* (non-Javadoc) * @see javax.xml.transform.stream.StreamSource#setSystemId(java.io.File) */ @Override public void setSystemId(File f) { super.setSystemId(f); inputSource.setSystemId(super.getSystemId()); } } ================================================ FILE: src/opennlp/ccgbank/TemplatesProcessor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
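// InputSourceAdapter above simply re-exposes a SAX InputSource through the
// StreamSource interface so that JAXP transformers can consume it. A minimal
// identity-transform round trip; the file name input.xml is an illustrative
// assumption, and the demo sits in the same package because the adapter class
// is package-private.
package opennlp.ccgbank;

import java.io.File;
import java.io.FileInputStream;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import org.xml.sax.InputSource;

public class AdapterDemo {
    public static void main(String[] args) throws Exception {
        InputSource in = new InputSource(new FileInputStream(new File("input.xml")));
        // wrap the SAX InputSource so it can be handed to anything expecting a Source
        InputSourceAdapter source = new InputSourceAdapter(in);
        Transformer identity = TransformerFactory.newInstance().newTransformer();
        identity.transform(source, new StreamResult(System.out)); // echoes the document
    }
}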
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: TemplatesProcessor.java,v 1.2 2010/09/04 16:24:36 mwhite14850 Exp $ * Copyright (C) 2009 Scott Martin (http://www.coffeeblack.org/contact/) */ package opennlp.ccgbank; import java.io.BufferedOutputStream; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import javax.xml.transform.ErrorListener; import javax.xml.transform.Result; import javax.xml.transform.Source; import javax.xml.transform.Templates; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Implements an XSLT processor using {@link Templates}. This class processes * XSLT template objects successively with a given input, writing the output * of each successive transformation into memory, then feeding that output to * the next template in the chain. * @author Scott Martin * @version $Revision: 1.2 $ */ class TemplatesProcessor extends XSLTProcessor { List templates = null; TemplatesProcessor(ErrorListener errorListener) { super(errorListener); } void addTemplates(Templates t) { if(templates == null) { templates = new ArrayList(); } templates.add(t); } /* (non-Javadoc) * @see opennlp.ccgbank.XSLTProcessor#process(java.io.File) */ @Override void process(InputSource inputSource) throws IOException,SAXException, TransformerException { if(templates == null) { templates = makeTemplates(taskTemplatesList); } StreamSource input = new InputSourceAdapter(inputSource); ByteArrayOutputStream buffer = new ByteArrayOutputStream(); byte[] bytesIn = null; try { // transform input with each template successively, // writing the output of each to a memory buffer Iterator i = templates.iterator(); Source source; StreamSource memorySource = null; while(i.hasNext()) { if(bytesIn == null) { // first pass? source = input; // use source } else { // use buffer otherwise InputStream in = new ByteArrayInputStream(bytesIn); if(memorySource == null) { memorySource = new StreamSource(in); } else { memorySource.setInputStream(in); } source = memorySource; } // get and configure transformer for this template Templates template = i.next(); Transformer transformer = template.newTransformer(); transformer.setOutputProperties(xmlProperties); transformer.setErrorListener(errorListener); boolean ihn = i.hasNext(); // reuse Result result = ihn // last template? ? 
new StreamResult(buffer) // if it's the last, write output to file : new StreamResult(new BufferedOutputStream( serializer.getOutputStream())); transformer.transform(source, result); if(ihn) { bytesIn = buffer.toByteArray(); buffer.reset(); } } } finally { bytesIn = null; try { buffer.close(); } catch(IOException e) { // do nothing } } } } ================================================ FILE: src/opennlp/ccgbank/XMLFilterProcessor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: XMLFilterProcessor.java,v 1.4 2010/09/05 15:54:43 mwhite14850 Exp $ * Copyright (C) 2009 Scott Martin (http://www.coffeeblack.org/contact/) */ package opennlp.ccgbank; import java.io.FileNotFoundException; import java.io.IOException; import java.util.*; import javax.xml.transform.ErrorListener; import javax.xml.transform.Templates; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.sax.SAXTransformerFactory; import org.apache.tools.ant.BuildException; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLFilter; import org.xml.sax.helpers.XMLReaderFactory; /** * An implementation of {@link XSLTProcessor} that performs transformations * using an {@link XMLFilter}. This particular implementation uses the XSLTC * compiler distributed with Apache's Xalan in order to avoid the known * problems with re-using {@link XMLFilter}. 
* @see Apache Xalan * @author Scott Martin * @version $Revision: 1.4 $ */ class XMLFilterProcessor extends XSLTProcessor { // nb: this could be in the super class List templates = null; XMLFilter filter; ErrorHandler errorHandler; static final String XSLT_KEY = "javax.xml.transform.TransformerFactory", XSLTC_VALUE = "org.apache.xalan.xsltc.trax.TransformerFactoryImpl"; XMLFilterProcessor(ErrorListener errorListener, ErrorHandler errorHandler) { super(errorListener); this.errorHandler = errorHandler; } SAXTransformerFactory newTransformerFactory() { // TODO try using xsltc (seems to yield hard-to-trace bugs at the moment) //System.setProperty(XSLT_KEY, XSLTC_VALUE); return super.newTransformerFactory(); } /* (non-Javadoc) * @see opennlp.ccgbank.XSLTProcessor#process(java.io.File) */ @Override void process(InputSource inputSource) throws IOException,SAXException, TransformerException { // TODO figure out how to re-use filter without breaking :( // make new filter each time filter = makeFilter(taskTemplatesList); filter.setContentHandler(serializer.asContentHandler()); filter.parse(inputSource); } /** * Makes a filter from a single xsltProcessors object. * @see #makeFilter(List) */ XMLFilter makeFilter(CCGBankTaskTemplates templates) throws FileNotFoundException,SAXException, TransformerConfigurationException { return makeFilter(Collections.singletonList(templates)); } /** * Makes a filter from a series of xsltProcessors that applies those * templates in order. * @param templateList The series of xsltProcessors used to construct the * filter. * @throws BuildException If no xsltProcessors are specified. */ XMLFilter makeFilter(List templateList) throws FileNotFoundException,SAXException, TransformerConfigurationException { // make templates if(templates == null) { templates = makeTemplates(taskTemplatesList); } // assemble list of xslt templates into a filter XMLFilter currentFilter = null, previousFilter = null; for (Templates t : templates) { currentFilter = transformerFactory.newXMLFilter(t); currentFilter.setErrorHandler(errorHandler); if(previousFilter == null) { // it's the first one currentFilter.setParent( XMLReaderFactory.createXMLReader()); } else { currentFilter.setParent(previousFilter); } previousFilter = currentFilter; } if(currentFilter == null ) { throw new IllegalArgumentException("no templates specified"); } currentFilter.setErrorHandler(errorHandler); currentFilter.setFeature("http://xml.org/sax/features/namespace-prefixes", true); return currentFilter; } } ================================================ FILE: src/opennlp/ccgbank/XSLTProcessor.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. 
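// TemplatesProcessor and XMLFilterProcessor implement the same pipeline two ways:
// the former applies each stylesheet in turn, buffering intermediate output in
// memory, while the latter links one SAX XMLFilter per stylesheet via setParent,
// as in makeFilter above. A stripped-down sketch of that filter-chain idea using
// plain JAXP; the stylesheet names first.xsl/second.xsl, the input in.xml, and the
// use of an identity Transformer to drive the chain (instead of the serializer
// ContentHandler used above) are illustrative assumptions.
import java.io.File;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.xml.sax.InputSource;
import org.xml.sax.XMLFilter;
import org.xml.sax.helpers.XMLReaderFactory;

public class FilterChainSketch {
    public static void main(String[] args) throws Exception {
        SAXTransformerFactory stf =
                (SAXTransformerFactory) TransformerFactory.newInstance();
        // one filter per stylesheet; each filter receives SAX events from its parent
        XMLFilter first = stf.newXMLFilter(new StreamSource(new File("first.xsl")));
        first.setParent(XMLReaderFactory.createXMLReader());  // raw XML parser at the head
        XMLFilter second = stf.newXMLFilter(new StreamSource(new File("second.xsl")));
        second.setParent(first);                               // chain: parse -> first -> second
        // run the chain by transforming a SAXSource whose reader is the last filter
        Transformer runner = stf.newTransformer();             // identity transform
        runner.transform(new SAXSource(second, new InputSource("in.xml")),
                new StreamResult(System.out));
    }
}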
// // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// /* * $Id: XSLTProcessor.java,v 1.4 2010/09/05 15:54:43 mwhite14850 Exp $ * Copyright (C) 2009 Scott Martin (http://www.coffeeblack.org/contact/) */ package opennlp.ccgbank; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Properties; import javax.xml.transform.ErrorListener; import javax.xml.transform.OutputKeys; import javax.xml.transform.Templates; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.sax.SAXResult; import javax.xml.transform.sax.SAXSource; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.stream.StreamSource; import org.apache.tools.ant.BuildException; import org.apache.xml.serializer.OutputPropertiesFactory; import org.apache.xml.serializer.Serializer; import org.apache.xml.serializer.SerializerFactory; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Abstract base class for XSLT processing. Templates are added to a processor, * then {@link #process(InputSource)} is called for each input source. * Subclasses will implement different processing strategies for transforming * XML using a series of XSL templates. * @author Scott Martin * @version $Revision: 1.4 $ */ abstract class XSLTProcessor { SAXTransformerFactory transformerFactory = newTransformerFactory(); static final Properties xmlProperties = OutputPropertiesFactory.getDefaultMethodProperties("xml"); static { xmlProperties.setProperty(OutputKeys.INDENT, "yes"); xmlProperties.setProperty( "{http://xml.apache.org/xalan}indent-amount", "2"); } List taskTemplatesList = new ArrayList(); Serializer serializer = SerializerFactory.getSerializer( XSLTProcessor.xmlProperties); ErrorListener errorListener; XSLTProcessor(ErrorListener errorListener) { this.errorListener = errorListener; transformerFactory.setErrorListener(errorListener); } boolean addAllTemplates(List templateList) { boolean b = false; for(CCGBankTaskTemplates t : templateList) { b |= addTemplates(t); } return b; } boolean addTemplates(CCGBankTaskTemplates taskTemplates) { return taskTemplatesList.add(taskTemplates); } /** * Processes an input source, applying each of the templates specified * using {@link #addTemplates(CCGBankTaskTemplates)} or * {@link #addAllTemplates(List)}. Subclasses will actually implement this * method. * @param inputSource The input source to which the templates will be * applied. * @throws IOException If a problem reading or writing occurs. * @throws SAXException If a subclass uses a SAX processor and there is a * problem with it. * @throws TransformerException If a subclass uses a processor that causes * a transformer problem. 
*/ abstract void process(InputSource inputSource) throws IOException,SAXException,TransformerException; void setTarget(File file) throws FileNotFoundException { serializer.setOutputStream( new BufferedOutputStream(new FileOutputStream(file))); // ensure output properties set (shouldn't really be nec!) serializer.setOutputFormat(xmlProperties); } /** * Resets the serializer, if resetting is possible. If not, re-creates the * serializer. */ void resetSerializer() { if(!serializer.reset()) { serializer // create new unless re-useable = SerializerFactory.getSerializer(xmlProperties); } } SAXTransformerFactory newTransformerFactory() { SAXTransformerFactory tf = (SAXTransformerFactory)TransformerFactory.newInstance(); if(!tf.getFeature(SAXSource.FEATURE)) { throw new IllegalStateException( "SAX transformer factory does not support SAXSource"); } if(!tf.getFeature(SAXResult.FEATURE)) { throw new IllegalStateException( "SAX transformer factory does not support SAXResult"); } return tf; } /** * Makes a list of templates from a single xsltProcessors object. * @see #makeTemplates(List) */ List makeTemplates(CCGBankTaskTemplates taskTemplates) throws FileNotFoundException,TransformerConfigurationException { return makeTemplates(Collections.singletonList(taskTemplates)); } /** * Makes a list of templates from a series of xsltProcessors that applies those xsltProcessors * in order. * @param templateList The series of xsltProcessors used to construct the * filter. * @throws BuildException If no xsltProcessors are specified. */ List makeTemplates(List templateList) throws FileNotFoundException,TransformerConfigurationException { List l = new ArrayList(); for(CCGBankTaskTemplates taskTemplates : templateList) { for(File f : taskTemplates) { StreamSource ss = new StreamSource( new BufferedInputStream(new FileInputStream(f))); ss.setSystemId(f); l.add(transformerFactory.newTemplates(ss)); } } return Collections.unmodifiableList(l); } } ================================================ FILE: src/opennlp/ccgbank/ccgbank.properties ================================================ ccgbanktask=opennlp.ccgbank.CCGBankTask sources=opennlp.ccgbank.CCGBankTaskSources templates=opennlp.ccgbank.CCGBankTaskTemplates convert=opennlp.ccgbank.CCGBankConvert extract=opennlp.ccgbank.CCGBankExtract testbed=opennlp.ccgbank.CCGBankTaskTestbed ================================================ FILE: src/opennlp/ccgbank/convert/ApposTally.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
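// The abstract XSLTProcessor is driven from CCGBankTask.execute(): templates are
// registered once, a target file is set, and process() is then called per input
// file. A condensed sketch of that lifecycle; the anonymous ErrorListener, the
// empty CCGBankTaskTemplates (in Ant it would be populated from a <filelist>),
// and the file paths are illustrative assumptions.
package opennlp.ccgbank;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import javax.xml.transform.ErrorListener;
import javax.xml.transform.TransformerException;
import org.xml.sax.InputSource;

public class ProcessorLifecycleSketch {
    public static void main(String[] args) throws Exception {
        // stand-in for the Ant task, which itself implements ErrorListener
        ErrorListener listener = new ErrorListener() {
            public void warning(TransformerException e) { System.err.println(e.getMessage()); }
            public void error(TransformerException e) { System.err.println(e.getMessage()); }
            public void fatalError(TransformerException e) throws TransformerException { throw e; }
        };
        XSLTProcessor proc = new TemplatesProcessor(listener);
        CCGBankTaskTemplates stylesheets = new CCGBankTaskTemplates();
        proc.addTemplates(stylesheets);               // register the stylesheet chain
        proc.setTarget(new File("out/lexicon.xml"));  // where the serializer writes
        File input = new File("converted/wsj_0001.xml");
        proc.process(new InputSource(new BufferedInputStream(new FileInputStream(input))));
    }
}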
////////////////////////////////////////////////////////////////////////////// //Class which stores info about punctuations package opennlp.ccgbank.convert; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.util.ArrayList; public class ApposTally { //Operation performed //private String oper = ""; //Dest dir //private String destDir = ""; //Sentence id private String id = ""; //Context of a comma //private String cont = ""; //Status whether comma is balanced or not private String balStatus = ""; //Intervening lexical mtl private String lexMtl = ""; //Cue analysis result //private String cueResult = ""; //Features for appos vs conj identification private ArrayList featInfo = new ArrayList(); //Heads of np private ArrayList headInfo1 = new ArrayList(); //Heads of appositive private ArrayList headInfo2 = new ArrayList(); //Cues private static ArrayList cueList = new ArrayList(); //Proc which opens the cue file public void openCueFile(String fileName) throws IOException { BufferedReader br = new BufferedReader(new FileReader(fileName)); String line = ""; while ((line = br.readLine()) != null) { if (!cueList.contains(line)) cueList.add(line); //System.out.println(line); } br.close(); } //Proc which traps, stores id of each sentence in a global var public String storeId(String x) { id = ""; id = x; //System.out.println(id); return null; } //Proc which reinitializes all vars when a new comma is encountered public void flushVars() { balStatus = "conj"; lexMtl = ""; headInfo1.clear(); headInfo2.clear(); //cueResult = ""; featInfo.clear(); } //Proc which stores whether comma is balanced public void storeBalance(String status) { balStatus = status; } //Proc which returns balance status public String getBalance() { return balStatus; } //Proc which returns capitalized string of balance status public String getCaps(String x) { return x.toUpperCase(); } //Proc which stores intervening lexical mtl public void storeLex(String word, String pos) { String info = ""; if (pos.equals("X")) info = word; else info = word + "/" + pos; lexMtl = lexMtl + " " + info; } //Proc which prints intervening lexical mtl public void printLex() { lexMtl = id + " " + lexMtl; //System.out.println(lexMtl+'\n'); lexMtl = ""; } //Proc which stores head of np1 public void storeHead(String word, String pos, int npNo) { String info = ""; //Eliding the distinction b/w sing&plural nouns if (pos.equals("NNS")) pos = "NN"; if (pos.equals("NNPS")) pos = "NNP"; info = word + "/" + pos; if (npNo == 1) headInfo1.add(info); else headInfo2.add(info); } //Heuristic2: Cue based analysis public String cueAnalysis() { //String[] np = lexMtl.split("X"); String[] sent = lexMtl.split("X"); //Sift out possessed in genitive constr if (sent[0].contains("'s/POS") && headInfo1.size() > 0) headInfo1.remove(0); if (sent[1].contains("'s/POS") && headInfo2.size() > 0) headInfo2.remove(0); String res = ""; int flag = 0; String np1head[] = new String[2]; if (headInfo1.size() == 0 || headInfo2.size() == 0) { //System.out.println(id+": "+lexMtl+'\n'); headInfo1.add("X1/ECK"); np1head = (headInfo1.get(headInfo1.size() - 1)).split("/"); } else np1head = (headInfo1.get(headInfo1.size() - 1)).split("/"); //Comparing the heads of np1 & np2 for (String x : headInfo2) { String[] np2head = x.split("/"); if (np2head[1].equals(np1head[1])) flag = 1; else { flag = 0; break; } } //Avoid place names: ie like c if (flag == 1 && headInfo1.size() == 1 && headInfo2.size() == 1) flag = 0; if (flag == 1 && sent[0].contains("/CC") && 
!sent[0].contains(",")) flag = 0; //Like nps together smacks of a conjunction if (flag == 0 && sent[1].contains("/CC")) { headInfo2.add("X2/ECK"); String np2head[] = (headInfo2.get(0)).split("/"); if (np2head[1].equals(np1head[1])) flag = 1; } //if(flag==0 && sent[1].contains(",/, and/CC"))flag=1; /*Stub to print a particular sentence if(id.equals("ID=wsj_0012.3")){ System.out.println("Flag: "+flag); System.out.println(headInfo1); System.out.println(headInfo2); }*/ if (flag == 1) res = "conj"; else res = "appos"; //if(unit.contains("/CD") || unit.contains("/POS") || unit.contains("/IN") || unit.contains("/DT")|| unit.contains("PRP$") || head1==true || cue==true)featInfo.add("appos"); return res; } } ================================================ FILE: src/opennlp/ccgbank/convert/DiscrCheck.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Class which stores the context of punctuations package opennlp.ccgbank.convert; public class DiscrCheck{ //Sentence id private String id=""; //Proc which traps, stores id of each sentence in a global var public String storeId(String x){ id=""; id=x; return null; } public void checkCatDiscr(String cat,String cat0,String lex){ //Comparing discrepancies between cat0 & cat1 //Just a check. Should be commented out in the final version String catA=cat0; String catB=cat; catA=catA.toLowerCase(); catB=catB.replaceAll("[0-9]",""); catB=catB.replaceAll("_",""); if(!catA.equals(catB) && !cat0.contains("nb")) System.out.println(id+": "+cat0+"***"+cat+" - "+lex); } } ================================================ FILE: src/opennlp/ccgbank/convert/GenChal11Adjuster.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2011 Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccgbank.convert; import java.util.*; /** * Utility class for adjusting Generation Challenges 2011 outputs. * Strings are lowercased, named entities and hyphenated words are split, * and dollar sign and numbers are transposed. */ public class GenChal11Adjuster { /** Returns the adjusted text string . */ public String getAdjustedString(String text) { // lowercase and split String[] tokens = text.toLowerCase().split("\\s+"); // swap dollar signs for (int i=0; i < tokens.length-1; i++) { if (tokens[i+1].equals("$")) { try { // check for preceding number token Double.parseDouble(tokens[i]); // swap, skip String num = tokens[i]; tokens[i] = tokens[i+1]; tokens[i+1] = num; i++; } catch (NumberFormatException e) {} } } // split NEs and hyphenated words List splitTokens = new ArrayList(tokens.length*2); for (String token : tokens) { String[] tokenSplits = token.replace("-"," - ").split("[_ ]"); for (String s : tokenSplits) splitTokens.add(s); } // join StringBuffer retval = new StringBuffer(); for (int i=0; i < splitTokens.size()-1; i++) { retval.append(splitTokens.get(i)); retval.append(' '); } retval.append(splitTokens.get(splitTokens.size()-1)); // done return retval.toString(); } } ================================================ FILE: src/opennlp/ccgbank/convert/GenConjRule.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// //Java class which adds brackets, stores arg1 position for inferConjRules.xsl,normConjRulesId, normTreenodeId.xsl package opennlp.ccgbank.convert; import java.util.ArrayList; import java.util.Hashtable; //import java.util.StringTokenizer; public class GenConjRule { //The largest current id private static int globalId = 0; //The store of ids private static Hashtable idTally = new Hashtable(); //Dollar status of res, arg1 & arg2 private static ArrayList dollarStatus = new ArrayList(); //Final result of dollar status calcs private static String ds = "No_Dollar"; //Add brackets to complex categories public String addParen(String str) { if (str.contains("\\") || str.contains("/")) str = "(" + str + ")"; return str; } //Add _conj to the result category public String modRes(String str) { //StringTokenizer st = new StringTokenizer(str, "[]"); str = str + "_conj"; return str; } //Procedure which cleans the unary rule result public String getConjRes(String x) { x = x.replaceAll("_conj", ""); x = x.replaceAll("_[0-9]", ""); //System.out.println(x); return x; } //Normalizes id of input category public String normId(String oldId, String oldInhId, String cat) { String newId = ""; //int choice = 0; //Switch for usage between Treenode & Leafnode Id normalization if (oldId.length() > 0) cat = cat + "_" + oldId; if (oldInhId.length() > 0) cat = cat + "_" + oldInhId; cat = cat.trim(); //Normalization condition if (!idTally.containsKey(cat)) { globalId++; newId = Integer.toString(globalId); idTally.put(cat, newId); } newId = (String) idTally.get(cat); return newId; } //Initialization of idTally & globalId before start of a new conj rule public String globalInit() { globalId = 0; idTally.clear(); return null; } //Initialization of idTally before each of Result,arg1 & arg2 is added public String localInit() { idTally.clear(); return null; } //Calculation of dollarStatus before start of a new conj rule //Initialization of dollarStatus before start of a new conj rule public String dsInit() { //System.out.println(dollarStatus); dollarStatus.clear(); ds = "No_Dollar"; return null; } //Store dollar status of res, arg1 & arg2 public String storeDollarStatus(String type) { type = type.trim(); dollarStatus.add(type); /*System.out.println('\n'); System.out.println("Insertion of: "+type); System.out.println(dollarStatus); System.out.println('\n');*/ return "null"; } public String dsCalc() { //System.out.println(dollarStatus); if (dollarStatus.size() == 3) ds = "Dollar"; /*System.out.println('\n'); System.out.println("Retrieval"); System.out.println(dollarStatus); System.out.println('\n');*/ return null; } //Get dollar status of conjunct public String getDollarStatus() { return ds; } //Function invoked by invertedDirSpComma.xsl public String getglobalId() { globalId++; return Integer.toString(globalId); } } ================================================ FILE: src/opennlp/ccgbank/convert/InfoHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // //This library is free software; you can redistribute it and/or //modify it under the terms of the GNU Lesser General Public //License as published by the Free Software Foundation; either //version 2.1 of the License, or (at your option) any later version. 
// //This library is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Lesser General Public License for more details. // //You should have received a copy of the GNU Lesser General Public //License along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Java class which helps info extr from PTB package opennlp.ccgbank.convert; //import opennlp.ccg.lexicon.*; import java.util.*; import java.io.*; public class InfoHelper{ /** CCG terminal & non-terminal nos.. */ private int termNo=0;private int ntNo=500; /** BBN-info. */ private static Hashtable bbnInfo=new Hashtable(); private static Hashtable> bbnSpans=new Hashtable>(); private static ArrayListbbnClasses=new ArrayList(); /** Quote info. */ private static Hashtable quoteInfo=new Hashtable(); /** PTB aux info viz. SBJ, FN_Tag & TPC annotation **/ private static Hashtable sbjInfo=new Hashtable(); private static Hashtable fntagInfo=new Hashtable(); private static Hashtable tpcInfo=new Hashtable(); /** Treenode info. */ private static Hashtable treeInfo=new Hashtable(); /** Directory where aux file and BBN NE info is stored. */ static File auxFileDirectory = null, bbnAuxDirectory = null; /** Store aux file directories. */ public static void init(File auxDir, File bbnAuxDir) { InfoHelper.auxFileDirectory = auxDir; InfoHelper.bbnAuxDirectory = bbnAuxDir; } /** Read BBN NE aux file corresponding to the WSJ Section provided as argument. */ public static void readBBNAuxfiles(String sect){ String bbnAuxFile=bbnAuxDirectory+"/"+"bbn-ccg"+sect+".aux"; try{ if(new File(bbnAuxFile).exists()){ BufferedReader inp= new BufferedReader(new FileReader(bbnAuxFile)); System.out.println("Reading in BBN aux file: "+bbnAuxFile); String line=""; while((line=inp.readLine())!=null ){ if(line.length()==0)continue; String bbn[]=line.trim().split(" "); String sentId=bbn[0]; String span=bbn[1]+","+bbn[2]; String key=sentId+" "+span; String bbnData=""; for(int i=3;i spanList=new ArrayList(); if(!bbnSpans.containsKey(sentId)) bbnSpans.put(sentId,spanList); spanList=bbnSpans.get(sentId); spanList.add(span); bbnSpans.put(sentId,spanList); } inp.close(); } } catch(IOException e){ System.out.println("Error reading: "+bbnAuxFile); } } /** Read quotes aux file corresponding to the WSJ Section provided as argument. */ public static void readQuoteAuxfiles(String sect){ String quoteAuxFile=auxFileDirectory+"/"+"aux-quotes-"+sect+".txt"; try{ if(new File(quoteAuxFile).exists()){ System.out.println("Reading in quotes aux File: "+quoteAuxFile); String line=""; BufferedReader inp= new BufferedReader(new FileReader(quoteAuxFile)); while((line=inp.readLine())!=null ){ if(line.length()==0)continue; String quoteData[]=line.trim().split(" "); String sentId=quoteData[0]; String span=quoteData[1]+","+quoteData[2]; String key=sentId+" "+span; String quotedText=""; for(int i=3;i treeCont=new ArrayList(); public boolean checkTreeInfo(String header,String ntId,int numCats){ String key=header+" "+ntId; if(treeInfo.containsKey(key)){ String treeCats=treeInfo.get(key); String[] x=treeCats.split(","); if(numCats==x.length) return true; else return false; } else return false; } /** Retrieve treenode info. 
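 * A hypothetical usage sketch; the "header ntId" key format is taken from
 * checkTreeInfo above, and the concrete id values are illustrative only:
 * <pre>
 *   InfoHelper helper = new InfoHelper();
 *   // comma-separated cat ids for this treenode, or "" if the key is unknown;
 *   // also primes the internal list later read back by getTreeId() and getTreeSlash()
 *   String treeCats = helper.getTreeInfo("ID=wsj_0001.1 501");
 * </pre>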
*/ public String getTreeInfo(String key){ String retVal=""; treeCont=new ArrayList(); if(treeInfo.containsKey(key)){ retVal=treeInfo.get(key); String[]temp=retVal.split(","); for(String x:temp) treeCont.add(x); } return retVal; } /** Procedure which gives back id of a particular cat. */ public String getTreeId(){ String retVal=""; if(treeCont.size()>0){ retVal=treeCont.get(0); treeCont.remove(0); String x[]=retVal.split("_"); retVal=x[1]; if(x.length==3) retVal="M_"+retVal; } return retVal; } /** Procedure which gives slash of combination. */ public String getTreeSlash(){ String retVal=""; if(treeCont.size()>0){ String slash=treeCont.get(0); String mode=""; if(slash.length()>1) mode=Character.toString(slash.charAt(1)); else if(slash.startsWith("/")) mode=">"; else if(slash.startsWith("\\")) mode="<"; treeCont.remove(0); retVal=Character.toString(slash.charAt(0)); retVal=retVal+"_"+mode; } return retVal; } /** Retrieve BBN class for lexical items for use in the Leafnodes. */ public String getBBNClass(String header,String lex,String pos,String cat,int nodeInd){ String retVal=""; String semClass=""; //Check and exit if the pos is not relevant boolean relFlag=false; if(cat.matches("pp\\[[a-z]+\\]_~2/np_2")) return ""; if (pos.startsWith("NN") || pos.startsWith("RB") || pos.startsWith("JJ") || pos.startsWith("VB") ||pos.equals("CD") || lex.equals("%") || pos.equals("$")) relFlag=true; if(!relFlag) return ""; String sentId=header.replaceFirst("ID=",""); ArrayList spanList=new ArrayList(); String key=sentId+" "+Integer.toString(nodeInd)+","+Integer.toString(nodeInd); if(bbnInfo.containsKey(key)){ String bbnData=bbnInfo.get(key); String info[]=bbnData.split(" "); if(info.length>=2){ semClass=getCleanClass(info[0]); retVal=(classReplace(semClass,lex,info[1])).trim(); if(retVal.length()>0 && !bbnClasses.contains(retVal)) bbnClasses.add(retVal); } return retVal; } else if(bbnSpans.containsKey(sentId)) spanList=bbnSpans.get(sentId); for(String span: spanList){ String inds[]=span.split(","); if(inds.length!=2)continue; if(!inds[0].matches("[0-9]+") || !inds[1].matches("[0-9]+") || inds[0].equals("NA") || inds[1].equals("NA")) continue; int ind1=Integer.parseInt(inds[0]); int ind2=Integer.parseInt(inds[1]); if(nodeInd >=ind1 && nodeInd <=ind2){ key=sentId+" "+span; String bbnData=bbnInfo.get(key); String info[]=bbnData.split(" "); if(info.length==0) continue; semClass=getCleanClass(info[0]); //Compile a list of acceptable classes boolean accClasses=false; if (!semClass.startsWith("DATE") && !semClass.startsWith("TIME") && !semClass.startsWith("ORDINAL") && !semClass.startsWith("QUANTITY") && !semClass.startsWith("PERCENT") && !semClass.startsWith("MONEY")) accClasses=true; if (accClasses || pos.startsWith("NN") || pos.equals("CD") || lex.equals("%") || pos.equals("$")){ String wordBit=""; int relInd=nodeInd-ind1+1; if(relInd=0) wordBit=info[(nodeInd-ind1+1)]; retVal=classReplace(semClass,lex,wordBit); if(retVal.length()>0 && !bbnClasses.contains(retVal)) bbnClasses.add(retVal); break; } } } return retVal; } /** Perform semantic replacement over relevant parts of the part. */ public String classReplace(String semClass,String lex,String wordBit){ String retVal=""; /*CITY-based classes if(wordBit.equals(lex)) retVal=semClass; else if(lex.contains(wordBit)) retVal=lex.replaceFirst(wordBit,semClass);*/ //Ignore CITY-based classes if(wordBit.equals(lex)) retVal=semClass; return retVal; } /** Strip off label like ENAMEX,TIMEX,NUMEX. 
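 * For example, reading the behavior off the implementation below,
 * <pre>
 *   new InfoHelper().getCleanClass("ENAMEX=PERSON")   // yields "PERSON"
 * </pre>
 * keeps only the part after the '='; an input without an '=' yields the empty string.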
*/ public String getCleanClass(String semClass){ String retVal=""; String x[]=semClass.split("="); if(x.length>=2) retVal=x[1]; return retVal; } /** Retrieve stored bbn-info for use in the Treenodes. */ public String getBBNInfo(String header,String span,String words){ String sentId=header.replaceFirst("ID=",""); String bbnData=""; String key=sentId+" "+span; boolean legitPhr=false; //Checking stored BBN-data with actual words if(bbnInfo.containsKey(key)){ String[] ccgWords=words.split("_"); bbnData=bbnInfo.get(key); String[] bbnWords=bbnData.split(" "); if(ccgWords.length>0 && bbnWords.length>1 && ccgWords.length==bbnWords.length-1){ for(int i=0;i0) retVal=span1+" "+qInfo[0]; } else if(quoteInfo.containsKey(key2)){ quotedText=quoteInfo.get(key2); String []qInfo=quotedText.split(" "); if(qInfo.length>0 && qInfo[qInfo.length-1].matches("\\p{Punct}")){ retVal=span2+" "+qInfo[0]+" "+qInfo[qInfo.length-1]; } } return retVal; } /** Store result cat. */ String res=""; public void storeRes(String str){ this.res=str.replaceAll("\\[.*",""); } /** Retrieve result cat. */ public String getRes(){ String retVal=this.res; return retVal; } /** The store of ids. */ private Hashtable idTally=new Hashtable(); public String id(String cat){ String retVal=""; int idNum=idTally.size()+2; if(!idTally.containsKey(cat)) idTally.put(cat,idNum); else idNum=idTally.get(cat); if(idTally.size()==1) retVal="first"+"_"+Integer.toString(idNum); else retVal="later"+"_"+Integer.toString(idNum); return retVal; } public void id(){ idTally=new Hashtable(); } /** Retrieve terminal no. */ public String getTermNo(){ String tn=Integer.toString(this.termNo); this.termNo++; return tn; } /** Retrieve non-terminal no. */ public String getNonTermNo(){ String ntNo=Integer.toString(this.ntNo); this.ntNo++; return ntNo; } /** Get punctless index. */ int plessInd=0; public String getPunctlessIndex(String word){ boolean isCCGWord=this.isCCGWord(word); int retval=-1; if(isCCGWord){ retval=this.plessInd; this.plessInd++; } return Integer.toString(retval); } /** Given a lexical item, ascertain whether it is a legit original CCGbank word **/ public boolean isCCGWord(String word){ boolean retval=true; if(word.matches("\\p{Punct}|[\\.]+|(-lrb-)|(-rrb-)|(-lcb-)|(-rcb-)|(--)|(`)|(')|(``)|('')") && !word.equals("$") && !word.equals("%")){ retval=false; } return retval; } /** Init terminal nos. */ public String initId(){ this.termNo=0; this.ntNo=500; this.plessInd=0; return null; } /** Extract PTB SBJ,FN-TAG & TPC annotation. */ public String getPTBInfo(String label,String sentId,String head,String lexInd){ String retval=""; sentId=sentId.replaceFirst("ID=",""); String key=sentId+" "+head+"_"+lexInd; String rel=null; if(label.equals("SBJ"))rel=sbjInfo.get(key); else if(label.equals("FNT"))rel=fntagInfo.get(key); else if(label.equals("TPC"))rel=tpcInfo.get(key); if(rel!=null){ retval=rel; } return retval; } /** Print out BBN classes (for use in grammar.xml). */ public void printBBNClasses(){ try{ System.out.println("Printing BBN classes used in the corpus to bbn-types.txt (for use in grammar.xml)"); // Create a FileWriter stream to the file FileWriter file_writer = new FileWriter ("bbn-types.txt"); BufferedWriter buf_writer = new BufferedWriter (file_writer); PrintWriter print_writer = new PrintWriter (buf_writer,true); print_writer.print(""); print_writer.flush(); } print_writer.close(); } catch (Exception e){ System.err.println ("Error writing info to file"); } } /** Input a string which contains a ':' and replace it by '|'. 
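 * For example,
 * <pre>
 *   new InfoHelper().replaceColon("s[dcl]\\np:Arg0")   // yields "s[dcl]\\np|Arg0"
 * </pre>
 * (an illustrative cat-plus-roles string; every ':' in the input is replaced).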
*/ public String replaceColon(String str) { String replacedStr=str.replace(":","|"); return replacedStr; } } ================================================ FILE: src/opennlp/ccgbank/convert/Javafns.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Java Program invoked by pp-head extraction routines package opennlp.ccgbank.convert; import java.util.ArrayList; import java.util.List; // NB: addRes and changeCat probably made superfluous by computeCats.xsl public class Javafns{ //Prep heads storage public List heads = new ArrayList(); //Temp id of pp heads with args String tempId=""; //Insert the prep-head into a result category by string replacement public String addRes(String cat,String ppHead){ //Xsl spl char detected and escaped if(ppHead.equals("$")) ppHead="\\$"; //Head inserted into first PP category cat=cat.replaceFirst("pp","pp["+ppHead+"]"); return cat; } //A safety hatch to elide extra heads detected - For leaf nodes public String elimRedun(int headCount){ //Calculating redundant Headcount int j=0; //All heads following the last PP-head in a cat spec are redundant int redun=heads.size()-headCount; //Redundant heads removed for(j=0;j-1;i--){ String ppHead = heads.get(i); //Escaping dollar signs for xsl if(ppHead.equals("$")) ppHead="\\$"; //Simple head insertion by replacement on the string cat=cat.replaceFirst("pp_","pp["+ppHead+"]_"); } return cat; } public String flush(){ heads.clear();tempId=""; return null; } public String setHead(String head){ heads.add(head); return null; } public String getHead(){ String head=""; if(heads.size()==0) head="WrongHead"; else{ head = heads.get(heads.size()-1); heads.remove(heads.size()-1); } return head; } public String peekHead(){ String head="WrongHead"; if(heads.size()>0) head = heads.get(heads.size()-1); return head; } public String printCat(String cat){ System.out.println(cat); return cat; } } ================================================ FILE: src/opennlp/ccgbank/convert/MWHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Bkgrnd java class which helps with operations reltd to multi-word units //As a first stab, I combine multi-word conjns like "As well as" package opennlp.ccgbank.convert; import java.util.*; public class MWHelper { // The largest current id private String lex = ""; private String pos = ""; private String term_no = ""; // A list of the particle terminal nos private ArrayList prtTally = new ArrayList(); // Index of terminal nos and the lexical items they correspond to private Hashtable prtIndex = new Hashtable(); public void initSettings() { prtTally = new ArrayList(); prtIndex = new Hashtable(); } // Concat lex,pos & term_nos of multi-word units public void concatWords(String lex, String pos, String term_no) { // System.out.println(lex); this.lex = this.lex + "_" + lex; this.pos = this.pos + " " + pos; this.term_no = this.term_no + " " + term_no; } // Retrieve stored info public String getInfo(int choice) { String retVal = ""; switch (choice) { case 1: retVal = lex.trim().replaceFirst("_", ""); this.lex = ""; break; case 2: retVal = pos.trim(); this.pos = ""; break; case 3: retVal = term_no.trim(); this.term_no = ""; break; } return retVal; } // Store particle ids public void storePrt(String prt_term_no, String prt) { prtTally.add(Integer.parseInt(prt_term_no)); prtIndex.put(Integer.parseInt(prt_term_no), prt); } public String peekPrt(int nextPrnNo) { String retVal = ""; if (prtIndex.containsKey(nextPrnNo)) { retVal = prtIndex.get(nextPrnNo); } return retVal; } public String getPrt() { // System.out.println(prtTally); // prtTally=new ArrayList(); Collections.sort(prtTally); String retVal = ""; if (prtTally.size() > 0) { retVal = (prtTally.get(prtTally.size() - 1)).toString(); // System.out.println(retVal); prtTally.remove(prtTally.size() - 1); } return retVal; } } ================================================ FILE: src/opennlp/ccgbank/convert/MorphLookup.java ================================================ package opennlp.ccgbank.convert; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.util.HashMap; import java.util.Map; /** * Utility class for looking up stems as determined by the morpha utility. 
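 *
 * A hypothetical usage sketch (the file names are illustrative; init expects a
 * words file with one "word POS" pair per line and a stems file with the
 * corresponding stems, read in parallel):
 * <pre>
 *   MorphLookup.init(new File("wsj-nns-vb"), new File("wsj-nns-vb-stems"));
 *   String stem = new MorphLookup().getStem("dogs", "NNS");  // "" if no stem is known
 * </pre>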
*/ public class MorphLookup { // map from word_pos to stem private static Map stemMap = null; static File words, stems; public static void init(File wordsFile, File stemsFile) throws IOException { MorphLookup.words = wordsFile; MorphLookup.stems = stemsFile; if(MorphLookup.words == null) { throw new IllegalArgumentException("words file not specified"); } if(MorphLookup.stems == null) { throw new IllegalArgumentException("stems file not specified"); } stemMap = new HashMap(); BufferedReader wordsReader = new BufferedReader( new FileReader(words)); BufferedReader stemsReader = new BufferedReader( new FileReader(stems)); String wordsLine, stemsLine; // read lines in parallel while ( (wordsLine = wordsReader.readLine()) != null ) { stemsLine = stemsReader.readLine(); //System.out.println(stemsLine); // wordsLine has a word and a POS String[] tokens = wordsLine.split("\\s+"); String word = tokens[0]; String pos = tokens[1]; // stemsLine just has a stem; lowercase it, for good measure String stem = stemsLine.trim().toLowerCase(); // add word_POS -> stem to map, also with word lowercased String key = word + "_" + pos; String key2 = word.toLowerCase() + "_" + pos; stemMap.put(key, stem); stemMap.put(key2, stem); } wordsReader.close(); stemsReader.close(); } /** Returns the stem for the given word and pos, or the empty string if none. */ public String getStem(String word, String pos) { String retval="";String key=word + "_" + pos; if (MorphLookup.stemMap.containsKey(key)) retval = stemMap.get(word + "_" + pos); /*if (retval == null) retval = ""; //System.out.println(key+" "+retval);*/ if(retval.length()==0) System.out.println("addStems: No stem for: "+key); return retval; } } ================================================ FILE: src/opennlp/ccgbank/convert/OrigPunctRules.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// //Class which stores the context of punctuations package opennlp.ccgbank.convert; import java.util.Hashtable; public class OrigPunctRules { //Sentence id private String id = ""; //The largest current id private static int globalId = 0; //ccgbank section private String sect=""; //Label for a punct mark private String label = ""; //The store of ids private static Hashtable idTally = new Hashtable(); //Proc which traps, stores id of each sentence in a global var public String storeId(String x) { id = ""; sect = ""; //System.out.println(x+" "+"Raja"); String expId[] = x.split("_"); id = expId[1]; sect = id.substring(0, 2); if (sect.length() < 0) sect += "just avoiding a warning here"; //System.out.println(sect+" "+"Raja"); return null; } //Initialization before start of a new binary rule public String initId() { globalId = 0; idTally.clear(); return null; } //Id allotment public String allotId(String cat) { String newId = ""; //Id allotment if (!idTally.containsKey(cat)) { globalId++; newId = Integer.toString(globalId); idTally.put(cat, newId); } newId = (String) idTally.get(cat); return newId; } public String storeLabel(String x) { label = x; return null; } public String getLabel() { return label; } public String initLabel() { label = ""; return null; } } ================================================ FILE: src/opennlp/ccgbank/convert/PunctHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// //Java class which adds brackets, stores arg1 position for inferConjRules.xsl,normConjRulesId, normTreenodeId.xsl package opennlp.ccgbank.convert; import java.util.ArrayList; import java.util.Hashtable; public class PunctHelper { //The largest current id private int globalId = 0; //The store of ids //private Hashtable idTally = new Hashtable(); private String feat = ""; private String pos = ""; private String balCom = ""; //The pos-indexRel tally private static Hashtable> indexRel = new Hashtable>(); //Calculate & store the indexRel public String calcIndexRel(String cat, String pos) { ArrayList temp = new ArrayList(); pos = pos.replaceAll("[0-9]", ""); //System.out.println(cat); if (!pos.equals("PUNCT_LPAREN")) { cat = cat.replaceAll("/\\*punct\\[,\\]_[0-9]", ""); cat = cat.replaceAll("/\\*punct\\[--\\]_[0-9]", ""); cat = cat.replaceAll("/\\*punct\\[-rrb-\\]_[0-9]", ""); cat = cat.replaceAll("/\\*punct\\[-rcb-\\]_[0-9]", ""); } //System.out.println(cat); if (!indexRel.containsKey(pos)) { temp.add(cat); indexRel.put(pos, temp); } temp = indexRel.get(pos); if (!temp.contains(cat)) temp.add(cat); String ind = Integer.toString(temp.indexOf(cat) + 1); return ind; } //Initialization before start of a new conj rule public String globalInit() { globalId = 0; //idTally.clear(); return null; } //Function invoked by invertedDirSpComma.xsl public String getglobalId() { globalId++; return Integer.toString(globalId); } public String setglobalId(int x) { globalId = x; return null; } public String storePOS(String x) { pos = x; return null; } public String getPOS() { String retVal = pos; return retVal; } public String initPOS() { pos = ""; return null; } public String balInit() { balCom = ""; return null; } public String storeBal(String x) { balCom = x; return null; } public String getBal() { return balCom; } public String storeFeat(String x) { feat = x; return null; } public String getFeat() { return feat; } public String featInit() { feat = ""; return null; } public String debugPrint(String x, String y) { System.out.println("Debug: " + x + " at " + y); return null; } public String removeFeats(String cat) { cat = cat.replaceAll("\\[[a-zA-Z]+\\]", ""); //System.out.println("Debug: "+cat); return cat; } public String purgeCat(String cat) { cat = cat.replaceAll("\\[[a-zA-Z]+\\]", ""); cat = cat.replaceAll("~", ""); cat = cat.replaceAll("_[0-9]+", ""); //System.out.println("Debug: "+cat); return cat; } public String purgeCat1(String cat) { cat = cat.toLowerCase(); cat = cat.replaceAll("\\[[a-zA-Z]+\\]", ""); cat = cat.replaceAll("~", ""); cat = cat.replaceAll("_[0-9]+", ""); //System.out.println("Debug: "+cat); return cat; } //Replace pp[] by pp public String cleanPP(String cat) { cat = cat.replaceAll("pp\\[\\]", "pp"); return cat; } } ================================================ FILE: src/opennlp/ccgbank/convert/RoleAdjuster.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccgbank.convert; /** * Utility class for adjusting roles and augmenting cat names with roles. */ public class RoleAdjuster { /** Returns the adjusted argument roles, making guesses at the missing roles. */ public String getAdjustedRoles(String cat, String roles) { // aux things like "have to" if (roles.equals("null e") || roles.equals("e e")) { if (cat.indexOf("np[thr]") < 0) return "Arg0 Arg1"; else return "e Arg1"; } // vp mods if (roles.startsWith("null e")) { return "null ArgM" + roles.substring("null e".length()); } // missing subjects, mostly if (roles.startsWith("null")) { String rest = roles.substring("null".length()); return addMissingArg(cat, rest); } // various if (roles.startsWith("e")) { String rest = roles.substring("e".length()); // mods if (cat.indexOf("_~") > 0) return "ArgM" + rest; // leave expletives unchanged if (cat.indexOf("np[expl]") > 0 || cat.indexOf("np[thr]") > 0) return roles; // otherwise add standard guess return addMissingArg(cat, rest); } // otherwise unchanged return roles; } // add guess at missing arg private String addMissingArg(String cat, String rest) { // distinguish passive if (cat.startsWith("s[pss]")) { if (rest.indexOf("Arg1") < 0) return "Arg1" + rest; else return "Arg2" + rest; } // otherwise Arg0 or Arg1 if (rest.indexOf("Arg0") < 0) return "Arg0" + rest; else return "Arg1" + rest; } /** Returns the cat name augmented with the given argument roles. */ public String getCatPlusRoles(String cat, String roles) { return cat + ":" + roles.replaceAll(" ", "+"); } } ================================================ FILE: src/opennlp/ccgbank/convert/XSLTTrueCaser.java ================================================ /////////////////////////////////////////////////////////////////////////////// //Copyright (C) 2011 Dennis N. Mehay // //This library is free software; you can redistribute it and/or //modify it under the terms of the GNU Lesser General Public //License as published by the Free Software Foundation; either //version 2.1 of the License, or (at your option) any later version. // //This library is distributed in the hope that it will be useful, //but WITHOUT ANY WARRANTY; without even the implied warranty of //MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the //GNU Lesser General Public License for more details. // //You should have received a copy of the GNU Lesser General Public //License along with this program; if not, write to the Free Software //Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// package opennlp.ccgbank.convert; /** * A class that has a static constructor to create a TrueCaser so that XSLT (which requires such a * set-up) can call the TrueCaser. * * @author Dennis N. Mehay * */ import opennlp.ccg.lexicon.TrueCaser; public class XSLTTrueCaser { static TrueCaser tc = null; /** * Static constructor that creates a true-caser. 
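 * A hypothetical calling sequence (the path is illustrative; init must be
 * called before trueCase, since the true-caser is held in a static field):
 * <pre>
 *   XSLTTrueCaser.init("models/truecase-list.txt");
 *   String cased = new XSLTTrueCaser().trueCase("nasa", "ORGANIZATION", "NNP", "5");
 * </pre>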
See the TrueCaser.java doc's for more info. */ public static void init(String pathToTrueCaseList) { XSLTTrueCaser.tc = new TrueCaser(pathToTrueCaseList, 0.5); } /** Function invoked from the XSLT transform trueCaser.xsl to true case words in a derivation .*/ public String trueCase(String theWord, String neClass, String pos,String wordPosition) { return tc.trueCase(theWord, true, true); } } ================================================ FILE: src/opennlp/ccgbank/extract/CatNode.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Java class to store all the info associated with a category package opennlp.ccgbank.extract; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.StringTokenizer; public class CatNode { // Category name, POS public final String cat; public final String pos; //Lexeme frequency private Map lexFreq = new HashMap(); //Sentence id of lexemes private Map> lexOccur = new HashMap>(); //Constructor to initialize the cat name public CatNode(String cat, String pos){ this.cat=cat; this.pos = pos; } //Procedure to insert info associated with all lexemes of a category public void lexInsert(String lex, String id){ //When first occurences of lexemes detected, entries opened if (!lexFreq.containsKey(lex)){ lexFreq.put(lex, 1); List idList = new ArrayList(4); idList.add(id); lexOccur.put(lex, idList); } else { //Subsequent occurences of lexemes updated lexFreq.put(lex, lexFreq.get(lex)+1); List idList = lexOccur.get(lex); //Ids of up to four sentences stored if(!idList.contains(id) && idList.size() < 4){ idList.add(id); } } } // returns frequence of lex with this cat and pos public int getLexFreq(String lex) { Integer retval = lexFreq.get(lex); if (retval == null) return 0; else return retval; } //Proc which prints out the lexical info of a category public void printTally(PrintWriter output) { //Lexemes sorted in descending order of freq List sortedLex = FreqTally.sortTally(lexFreq); String ccgbankHome = System.getProperty("CCGBANK_HOME", "/home/corpora/EN/ccgbank"); //Sorted list processed for (int i=0; i<3 && i"); output.println("

  • ");
      output.println(lex+" "+freq);
      //Sentence ids also printed
      List temp = lexOccur.get(lex);
      for (String id: temp) {
        //System.out.println(id);
        String[] idInfo = id.split("\\.");
        //System.out.println(idInfo[0]);
        StringTokenizer st = new StringTokenizer(id, ".");
        output.println("
      ");
        String idLink = "";
        String sentNo = "";
        String dir = "";
        //2 courses of action depending on whether input is gold std .auto parses or C&C .auto parses
        if (idInfo.length == 2) {
          idLink = st.nextToken() + ".html";
          sentNo = "#Sentence " + st.nextToken();
          dir = id.substring(4, 6);
        } else {
          idLink = idInfo[0];
          sentNo = "#Sentence " + idInfo[0];
          dir = idInfo[0];
        }
        output.println("
    • ");
        output.println(id);
        output.println(" ");
        output.println(idLink);
        output.println("");
        output.println("
    • ");
        output.println("
    ");
      }
      output.println("
  • "); output.println(""); output.println(""); output.flush(); } } } ================================================ FILE: src/opennlp/ccgbank/extract/DebugHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Bkgrnd java class which helps with operations for debugging LFs package opennlp.ccgbank.extract; import java.util.*; import java.io.*; public class DebugHelper { public static boolean init = true; public static ArrayList unmatCats = new ArrayList(); public static int nsrCount = 0; public static int unmatCount = 0; public boolean getInit() { return init; } //Read in bkgrnd info public void readInfo() { try { BufferedReader inp = new BufferedReader(new FileReader( "/scratch/propgrammar/unmat.txt")); String line = ""; while ((line = inp.readLine()) != null) { if (line.length() == 0) continue; String parts[] = line.split(" "); String name = parts[parts.length - 2]; String pos = parts[parts.length - 1]; String x[] = name.split("="); String y[] = pos.split("="); name = purgeCat(x[1]); pos = purgeCat(y[1]); String unmat = name + " " + pos; unmatCats.add(unmat); //System.out.println(name+" "+pos); } //System.out.println(tagInfo); //System.out.println(tagInfo.size()); init = false; inp.close(); } catch (IOException e) { System.out.println("Error reading input file"); } } public String purgeCat(String cat) { cat = cat.replaceAll("\"", ""); cat = cat.replaceAll("~", ""); cat = cat.replaceAll("_[0-9]+", ""); cat = cat.replaceAll(">", ""); //System.out.println("Debug: "+cat); return cat; } public void recordInfo(String sentId, String pred, String misc) { nsrCount++; if (unmatCats.contains(misc)) unmatCount++; } public void printInfo() { System.out.println(unmatCats); System.out.println("No:of nsr LFs: " + nsrCount); System.out.println("No:of unmatched that a NSR LF contains: " + unmatCount); } } ================================================ FILE: src/opennlp/ccgbank/extract/DefaultLFHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. 
// // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Bkgrnd java class which helps with operations for debugging LFs package opennlp.ccgbank.extract; import java.util.*; //import java.io.*; public class DefaultLFHelper { private ArrayList idTally = new ArrayList(); private Hashtable freqTable = new Hashtable(); private String lfType = "ord"; public void init() { lfType = "ord"; idTally = new ArrayList(); freqTable = new Hashtable(); argCounter = 0; } public void storeCat(String cat, String id, String idType) { int freq = 1; if (id.length() > 0) { //System.out.println(cat+" "+id+" "+idType); if (!freqTable.containsKey(id)) freqTable.put(id, freq); else { freq = freqTable.get(id) + 1; freqTable.put(id, freq); } if (id.equals("1") && idType.equals("inherits")) lfType = "mod-mod"; else if (!lfType.equals("mod-mod") && id.equals("1") && idTally.size() > 0 && idTally.get(0).equals("1")) lfType = "mod"; idTally.add(id); } } public String getType() { String retVal = lfType; lfType = ""; argCounter = 0; return retVal; } private int argCounter = 0; public String getArgNo(int argCount) { int argNo = argCount - argCounter; argCounter++; return Integer.toString(argNo); } public boolean isArg(String id) { //System.out.println(id); //System.out.println(freqTable); int freq = 0; if (freqTable.containsKey(id)) freq = freqTable.get(id); boolean retVal = false; if (freq == 1) retVal = true; return retVal; } public String purgeCat(String cat) { cat = cat.replaceAll("\"", ""); cat = cat.replaceAll("~", ""); cat = cat.replaceAll("_[0-9]+", ""); cat = cat.replaceAll(">", ""); //System.out.println("Debug: "+cat); return cat; } } ================================================ FILE: src/opennlp/ccgbank/extract/ExtractGrammar.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// package opennlp.ccgbank.extract; import java.io.File; import java.io.FileWriter; import java.io.PrintWriter; import java.util.Arrays; import java.util.Iterator; import java.util.List; import javax.xml.transform.stream.StreamSource; public class ExtractGrammar { /** Class for holding properties of desired grammar extraction. */ public static class ExtractionProperties { /** Whether to use the PP head augmented corpus. */ public boolean ppHeads = true; /** Source directory. */ public String srcDir = "/scratch/ccgbank/converted"; /** Destination directory. */ public String destDir = "/scratch/grammars/protogrammar"; /** Temp directory. */ public String tempDir = "/tmp/ccgbankextract"; /** Start section. */ public int startSection = 0; /** End section. */ public int endSection = 24; /** Selected file (-1 if none). */ public int fileNum = -1; /** Frequency cutoff for including an extracted cat. */ public int catFreqCutoff = 1; /** Frequency cutoff for including an extracted lex, cat, pos triple. */ public int lexFreqCutoff = 1; /** Frequency cutoff for making a family (ie, cat & pos) open. */ public int openFreqCutoff = 100; /** Rule frequency cutoff. */ public int ruleFreqCutoff = 1; /** Flag for whether to skip unmatched rules. */ public boolean skipUnmatched = false; /** Whether to show debug info for failed derivations. */ public boolean debugDerivs = false; /** File name for text only output. */ public String textfile = null; /** File name for text factors output. */ public String factorsfile = null; /** File name for observed supertag-rule combos.. */ public String combosfile = null; // Flag to add feats excl to the lexicon public boolean lexF = false; // Flag to adjust lfs of orig puncts ie those corrs to extant corp // binary rules public boolean origPuncts = false; // String taking names of macros as input. Expects a dash separated list public String macroSpecs = ""; // String taking names of LF specificity condtions as input. Expects a // dash separated list // Overt wh pronouns: wh public String lfSpecs = ""; } /** Processes args and invokes extraction steps. 
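 * A hypothetical invocation, using flags from the usage message below and the
 * default-style directories (both directory paths are illustrative):
 * <pre>
 *   java opennlp.ccgbank.extract.ExtractGrammar -s 00 --skipUnmatched \
 *        /scratch/ccgbank/converted /scratch/grammars/protogrammar
 * </pre>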
*/ public static void main(String args[]) throws Exception { List arguments = Arrays.asList(args); ExtractionProperties extractProps = new ExtractionProperties(); // flags for each extraction step boolean doLex = true; boolean doMorph = true; boolean doRules = true; boolean doTestbed = true; if (arguments.contains("-h") || arguments.contains("--help")) { System.out.println("usage: extractGrammar \n" + "\t[-noPPs|--noPPHeads] \n" + "\t[-lexF] \n" + "\t[-origPuncts] \n" + "\t[-s|--section sectnum] [-ss|--startSection sectnum] [-es|endSection sectnum] \n" + "\t[-f|--file filenum] \n" + "\t[--lexOnly|--morpOnly|--rulesOnly|--testbedOnly] [--skipLex] [--skipMorph] [--skipRules] [--skipTestbed]\n" + "\t[-tmp|--tempDir tempDir] \n" + "\t[-cfc|--catFreqCutoff num] \n" + "\t[-lfc|--lexFreqCutoff num] \n" + "\t[-ofc|--openFreqCutoff num] \n" + "\t[-rfc|--ruleFreqCutoff num] \n" + "\t[--skipUnmatched] \n" + "\t[-dd|--debugDerivs] \n" + "\t[--text textfile] [--textf factorsfile] \n" + "\t[--combos combosfile] \n" + "\t[srcDir] [destDir]"); System.exit(0); } if (arguments.contains("-noPPs") || arguments.contains("--noPPHeads")) { extractProps.ppHeads = false; } // process args Iterator it = arguments.iterator(); String s; boolean seenSrc = false; while (it.hasNext()) { s = it.next(); if (s.equals("-s") || s.equals("--section") || s.equals("-ss") || s.equals("--startSection") || s.equals("-es") || s.equals("--endSection")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no sectnum specified"); } int sectNum = Integer.parseInt(it.next()); if (s.equals("-s") || s.equals("--section") || s.equals("-ss") || s.equals("--startSection")) extractProps.startSection = sectNum; if (s.equals("-s") || s.equals("--section") || s.equals("-es") || s.equals("--endSection")) extractProps.endSection = sectNum; } else if (s.equals("-f") || s.equals("--filenum")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no filenum specified"); } extractProps.fileNum = Integer.parseInt(it.next()); } else if (s.equals("-lexF")) { System.out .println("Inserting lexicon specific feats - Punct filter placeholder feats now"); extractProps.lexF = true; } else if (s.equals("-origPuncts")) { extractProps.origPuncts = true; } else if (s.equals("--lexOnly")) { doMorph = false; doRules = false; doTestbed = false; } else if (s.equals("--morphOnly")) { doLex = false; doRules = false; doTestbed = false; } else if (s.equals("--rulesOnly")) { doLex = false; doMorph = false; doTestbed = false; } else if (s.equals("--testbedOnly")) { doLex = false; doMorph = false; doRules = false; } else if (s.equals("--skipLex")) { doLex = false; } else if (s.equals("--skipMorph")) { doMorph = false; } else if (s.equals("--skipRules")) { doRules = false; } else if (s.equals("--skipTestbed")) { doTestbed = false; } else if (s.equals("-tmp") || s.equals("--tempDir")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no temp dir specified"); } extractProps.tempDir = it.next(); } else if (s.equals("-cfc") || s.equals("--catFreqCutoff")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no num specified"); } int num = Integer.parseInt(it.next()); extractProps.catFreqCutoff = num; } else if (s.equals("-lfc") || s.equals("--lexFreqCutoff")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no num specified"); } int num = Integer.parseInt(it.next()); 
extractProps.lexFreqCutoff = num; } else if (s.equals("-ofc") || s.equals("--openFreqCutoff")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no num specified"); } int num = Integer.parseInt(it.next()); extractProps.openFreqCutoff = num; } else if (s.equals("-rfc") || s.equals("--ruleFreqCutoff")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no num specified"); } int num = Integer.parseInt(it.next()); extractProps.ruleFreqCutoff = num; } else if (s.equals("--skipUnmatched")) extractProps.skipUnmatched = true; else if (s.equals("-dd") || s.equals("--debugDerivs")) extractProps.debugDerivs = true; else if (s.equals("--text")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no file name specified"); } extractProps.textfile = it.next(); } else if (s.equals("--textf")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no file name specified"); } extractProps.factorsfile = it.next(); } else if (s.equals("--combos")) { if (!it.hasNext()) { throw new IllegalArgumentException("encountered flag " + s + ", but no file name specified"); } extractProps.combosfile = it.next(); } else if (!seenSrc) { extractProps.srcDir = s; seenSrc = true; } else { extractProps.destDir = s; } } // ensure directories exist or can be made File tempDir = new File(extractProps.tempDir); if (!tempDir.exists() && !tempDir.mkdirs()) throw new IllegalArgumentException( "could not create temp directory: " + extractProps.tempDir); File srcDir = new File(extractProps.srcDir); if (!srcDir.exists() || !srcDir.isDirectory()) throw new IllegalArgumentException( "source directory does not exist: " + extractProps.srcDir); File destDir = new File(extractProps.destDir); if (!destDir.exists() && !destDir.mkdirs()) throw new IllegalArgumentException( "could not create destination directory: " + extractProps.destDir); // log params System.out.println("Extracting Grammar"); System.out.println("Reading from: " + srcDir); System.out.println("Writing to: " + destDir); System.out.println("Temp dir: " + tempDir); System.out.println("Start section: " + extractProps.startSection); System.out.println("End section: " + extractProps.endSection); if (extractProps.fileNum >= 0) System.out.println("File: " + extractProps.fileNum); // do extraction steps if (doLex) LexExtract.extractLex(extractProps); if (doMorph) MorphExtract.extractMorph(extractProps); if (doRules) RulesExtract.extractRules(extractProps); // generate grammar.xml, if it doesn't already exist // nb: should eventually make schema refs relative to OPENCCG_HOME File gramFile = new File(destDir, "grammar.xml"); if (!gramFile.exists()) { System.out.println("Generating grammar.xml"); PrintWriter gramOut = new PrintWriter(new FileWriter(gramFile)); gramOut.println(""); gramOut.println(""); gramOut.println(" "); gramOut.println(" "); gramOut.println(" "); gramOut.println(""); gramOut.close(); } // do testbed if (doTestbed && !doTestbed) ; // nb: just avoiding a warning here // TODO if (doTestbed) Testbed.createTestFiles(extractProps); } /* Returns a stream source for the given resource from the class loader. 
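 * A hypothetical call (the resource name is illustrative and must resolve on
 * the classpath used to run the extraction):
 * <pre>
 *   StreamSource lexExtr = ExtractGrammar.getSource("lexExtr.xsl");
 * </pre>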
*/ public static StreamSource getSource(String resourceName) { ClassLoader cl = ExtractGrammar.class.getClassLoader(); return new StreamSource(cl.getResourceAsStream(resourceName)); } } ================================================ FILE: src/opennlp/ccgbank/extract/FreqTally.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Class which stores lexical info and associated frequencies. //This class is invoked by LexExtr.xsl and StemInsert.xsl transforms package opennlp.ccgbank.extract; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import opennlp.ccgbank.extract.ExtractGrammar.ExtractionProperties; //import javax.xml.transform.TransformerConfigurationException; //import javax.xml.transform.TransformerException; //import org.jdom.JDOMException; //import org.xml.sax.SAXException; public class FreqTally{ // Frequency cutoff for including an extracted cat public static int CAT_FREQ_CUTOFF = 1; // Frequency cutoff for including an extracted lex, cat, pos triple public static int LEX_FREQ_CUTOFF = 1; // Frequency cutoff for making a family (ie, cat & pos) open public static int OPEN_FREQ_CUTOFF = 100; //The object where lexical info of each category has been stored private static Map catInfo = new HashMap(); //Freq of cat specs private static Map catFreq = new HashMap(); //Sentence id private static String id=""; // Observed lex combos private static Set observedLexCombos = new HashSet(); /** Resets the statically held tallies. */ public static void reset() { catInfo = new HashMap(); catFreq = new HashMap(); id=""; observedLexCombos = new HashSet(); } //Proc which traps and stores id of each sentence public String storeId(String x) { if(x.length()>0){ id = x; int posEquals = x.indexOf('='); if (posEquals > 0) { id = x.substring(posEquals+1); } } return id; } //Changes case of proper nouns public String changeCase(String lex,String pos){ //if(!pos.equals("NNP") && !pos.equals("NNPS") && !lex.equals("I"))lex=lex.toLowerCase(); return lex; } // Creating a freq tally using hashtables. Invoked by LexExtr.xsl. // Returns whether cat+pos is seen for the first time. public boolean loadTally(String lex, String cat, String pos) { String key = catPosKey(cat, pos); CatNode cn; boolean retval; //First occurence of cat+pos. 
Entry made if(!catFreq.containsKey(key)){ cn = new CatNode(cat, pos); catInfo.put(key, cn); catFreq.put(key,1); retval = true; } else { // otherwise inc count cn = catInfo.get(key); catFreq.put(key, catFreq.get(key)+1); retval = false; } // store lex info cn.lexInsert(lex,id); return retval; } /** Returns a string key for a cat and pos. */ public static String catPosKey(String cat, String pos) { return cat+"-"+pos; } //Proc which outputs list of map keys in descending order of frequencies public static List sortTally(Map x) { // retval List sortedList = new ArrayList(); //Sorting by freq ArrayList vals1 = new ArrayList(x.values()); Collections.sort(vals1); //Removing unique frequencies to a new arraylist ArrayList vals = new ArrayList(vals1.size()); int prev = -1; for (Integer freq : vals1) { if (freq != prev) vals.add(freq); prev = freq; } //Finding all the keys corresponding to a particular freq for (int i=vals.size()-1; i >=0; i--) { int sortedFreq = vals.get(i); for (String key : x.keySet()) { int freq = x.get(key); if(freq==sortedFreq) sortedList.add(key); } } return sortedList; } public static void printTally(ExtractionProperties extractProps) throws FileNotFoundException { FreqTally.printTally(new File(extractProps.tempDir)); } public static void printTally(File directory) throws FileNotFoundException { System.out.println("Generating CorpFreq.html"); //Freq Output file File freqFile = new File(directory, "CorpFreq.html"); PrintWriter output=new PrintWriter(new FileOutputStream(freqFile)); List sortedCatKeys = sortTally(catFreq); //Printing the final ouput in html form output.println(""); output.println(""); output.println(""); output.println("Lexical Info"); output.println(""); output.println(""); output.println(""); output.flush(); for (int i=0; i < sortedCatKeys.size(); i++) { String key = sortedCatKeys.get(i); CatNode cn = catInfo.get(key); String cat = cn.cat; String pos = cn.pos; int freq = catFreq.get(key); output.println("
    "); output.println(i+1+" Category: "+cat+" POS: "+pos+" Freq: "+freq); output.println("
    "); output.println(); cn.printTally(output); output.flush(); } output.println(""); output.println(""); } /** Returns whether this lex combo has been seen for the first time. */ public boolean firstLexCombo(String lex, String stem, String rel, String cat, String pos,String semClass) { String key = lex + "_" + stem + "_" + rel + "_" + cat + "_" + pos + "_" + semClass; //String key = lex + "_" + stem + "_" + rel + "_" + cat + "_" + pos; if (observedLexCombos.contains(key)) return false; observedLexCombos.add(key); return true; } // returns the freq for the given key, or 0 if not present private int getFreq(String key) { Integer freq = catFreq.get(key); return (freq != null) ? freq : 0; } /** Returns the frequency of the cat and pos. */ public int getFreq(String cat, String pos) { String key = catPosKey(cat, pos); return getFreq(key); } /** Returns whether the cat and pos pass the frequency cutoff. */ public boolean checkFreqStatus(String cat, String pos) { /*if(cat.contains("Arg") || cat.startsWith("pp[")) return true;*/ /*if(id.contains("wsj_00")) return true;*/ return getFreq(cat, pos) >= CAT_FREQ_CUTOFF; } /** Returns whether the lex, cat and pos pass the frequency cutoffs. */ public boolean checkFreqStatus(String lex, String cat, String pos) { String key = catPosKey(cat, pos); //System.out.println(cat); if(cat.contains("pp[")) return true; /*if(id.contains("wsj_00")) return true;*/ if (getFreq(key) < CAT_FREQ_CUTOFF) return false; CatNode cn = catInfo.get(key); return cn.getLexFreq(lex) >= LEX_FREQ_CUTOFF; } /** Returns whether the cat and pos are for an open family. */ public boolean isOpen(String cat, String pos) { if (getFreq(cat, pos) < OPEN_FREQ_CUTOFF) return false; if (pos.startsWith("NN") || pos.equals("CD")) return true; else if (pos.startsWith("JJ") && (cat.equals("n_~1/n_1") || cat.equals("s[adj]_1\np_2"))) return true; else return false; } } ================================================ FILE: src/opennlp/ccgbank/extract/InsertLFHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// //Helper Class for insertLF.xsl //This class is invoked by MorphExtr.xsl package opennlp.ccgbank.extract; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; public class InsertLFHelper{ private List featTally=new ArrayList(); //Flush feat tally public String initFeat(){ featTally.clear(); return null; } public String putFeat(String feat){ featTally.add(feat); return null; } public String getFeat(){ String feat=""; if (featTally.size() > 0) { feat = featTally.get(0); featTally.remove(0); } else feat="xxx"; return feat; } // for ensuring uniqueness of stem/rel pairs private Set stemRelPairs = new HashSet(); // reset public String resetStemRelPairs() { stemRelPairs.clear(); return null; } // contains, updating public boolean containsStemRelPair(String stem, String rel) { String key = stem + "_" + rel; if (stemRelPairs.contains(key)) return true; stemRelPairs.add(key); return false; } private String[] rolesArray = {}; // sets the roles public boolean setRoles(String roles) { rolesArray = roles.split("\\s+"); return true; } // returns the nth role public String getRole(int n) { return (n < rolesArray.length) ? rolesArray[n] : "null"; } } ================================================ FILE: src/opennlp/ccgbank/extract/LexExtract.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Program which creates a temp.xml file from the bareparse. 
temp.xml serves are the input for creating lexicon.xml & morph.xml package opennlp.ccgbank.extract; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.util.Arrays; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.sax.SAXResult; import javax.xml.transform.sax.SAXSource; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import opennlp.ccgbank.extract.ExtractGrammar.ExtractionProperties; import org.apache.xml.serializer.OutputPropertiesFactory; import org.apache.xml.serializer.Serializer; import org.apache.xml.serializer.SerializerFactory; import org.jdom.JDOMException; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLFilter; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; /** * Program which reads in each file of the bare parse xml rep and generates a lexicon, * a freq tally of the lexical info and a list of ccgbank sentences. */ public class LexExtract{ public static void extractLex(ExtractionProperties extractProps) throws TransformerException,TransformerConfigurationException,SAXException,IOException,JDOMException { System.out.println("Extracting lexicon info:"); File lexFile = new File(new File(extractProps.destDir), "lexicon.xml"); File tempFile = new File(new File(extractProps.tempDir), "temp.xml"); PrintWriter tempOut = new PrintWriter(new FileOutputStream(tempFile),true); File ccgbankDir = new File(extractProps.srcDir); File[] ccgbankSections=ccgbankDir.listFiles(); Arrays.sort(ccgbankSections); FreqTally.CAT_FREQ_CUTOFF = extractProps.catFreqCutoff; FreqTally.LEX_FREQ_CUTOFF = extractProps.lexFreqCutoff; FreqTally.OPEN_FREQ_CUTOFF = extractProps.openFreqCutoff; //temp.xml creation TransformerFactory tFactory = TransformerFactory.newInstance(); Transformer lexExtrTransformer = tFactory.newTransformer(ExtractGrammar.getSource("opennlp.ccgbank/transform/lexExtr.xsl")); // add root tempOut.println(""); for (int i=extractProps.startSection; i<=extractProps.endSection; i++){ System.out.println("Section " + ccgbankSections[i].getName()); File[] files=ccgbankSections[i].listFiles(); Arrays.sort(files); int fileStart = 0; int fileLimit = files.length; if (extractProps.fileNum >= 0) { fileStart = extractProps.fileNum; fileLimit = extractProps.fileNum + 1; } for (int j=fileStart; j"); tempOut.flush(); tempOut.close(); //Generating a freq tally from static datastructures FreqTally.printTally(extractProps); System.out.println("Generating lexicon.xml"); if (tFactory.getFeature(SAXSource.FEATURE) && tFactory.getFeature(SAXResult.FEATURE)) { SAXTransformerFactory saxTFactory = ((SAXTransformerFactory) tFactory); // Create an XMLFilter for each stylesheet. 
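// The lexicon is built by chaining SAX filters, one per stylesheet; each setParent() call below feeds one
// filter's output into the next. By default events flow reader -> filterLex -> closedCatInsert -> insertLF ->
// insertPunctLF -> insertSemFeats -> markUnmatched; with -lexF, addFilterLexFeats is spliced in after
// insertPunctLF, and with -origPuncts, insertOrigPunctsLF replaces insertPunctLF. The last filter's output
// is serialized to lexicon.xml.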
// Extract lexicon from temp.xml XMLFilter xmlFilter0 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/filterLex.xsl")); XMLFilter xmlFilter1 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/closedCatInsert.xsl")); XMLFilter xmlFilter2 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertLF.xsl")); XMLFilter xmlFilter3 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertPunctLF.xsl")); XMLFilter xmlFilter4 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertOrigPunctsLF.xsl")); XMLFilter xmlFilter5 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/addFilterLexFeats.xsl")); XMLFilter xmlFilter6 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/insertSemFeats.xsl")); XMLFilter xmlFilter7 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/markUnmatched.xsl")); // Create an XMLReader. XMLReader reader = XMLReaderFactory.createXMLReader(); // xmlFilter0 uses the XMLReader as its reader. xmlFilter0.setParent(reader); xmlFilter1.setParent(xmlFilter0); xmlFilter2.setParent(xmlFilter1); xmlFilter3.setParent(xmlFilter2); if (extractProps.lexF) { xmlFilter5.setParent(xmlFilter3); xmlFilter6.setParent(xmlFilter5); } else if (extractProps.origPuncts) { xmlFilter4.setParent(xmlFilter2); xmlFilter6.setParent(xmlFilter4); } else xmlFilter6.setParent(xmlFilter3); xmlFilter7.setParent(xmlFilter6); XMLFilter xmlFilter = xmlFilter7; java.util.Properties xmlProps = OutputPropertiesFactory.getDefaultMethodProperties("xml"); xmlProps.setProperty("indent", "yes"); xmlProps.setProperty("standalone", "no"); xmlProps.setProperty("{http://xml.apache.org/xalan}indent-amount", "2"); Serializer serializer = SerializerFactory.getSerializer(xmlProps); serializer.setOutputStream(new FileOutputStream(lexFile)); xmlFilter.setContentHandler(serializer.asContentHandler()); xmlFilter.parse(new InputSource(tempFile.getPath())); } } } ================================================ FILE: src/opennlp/ccgbank/extract/MorphExtrHelper.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //This class is invoked by MorphExtr.xsl package opennlp.ccgbank.extract; import java.util.HashSet; import java.util.Set; public class MorphExtrHelper { private FreqTally aFreqTally = new FreqTally(); private Set seenLexPos = new HashSet(); /** Returns whether the lex, cat and pos pass the frequency cutoffs, * and the lex-stem-pos combo is new. 
*/ public boolean checkFreqAndNoveltyStatus(String lex, String stem, String cat, String pos,String semClass) { if (!aFreqTally.checkFreqStatus(lex, cat, pos)) return false; String key = lex + "_" + stem + "_" + "_" + pos + "_"+ semClass; //String key = lex + "_" + stem + "_" + "_" + pos; if (seenLexPos.contains(key)) return false; seenLexPos.add(key); return true; } //Applies rules to discern whether noun is animate or not public String macroNamer(String macro, String semClass,String pos,String lex){ if((semClass.startsWith("PER") && pos.startsWith("N"))||pos.equals("DT")){ macro=macro+" "+"@anim-nom"; } else if(pos.startsWith("PP") || lex.equals("those") || (pos.startsWith("PRP") && !lex.startsWith("it"))) macro=macro+" "+"@anim-nom"; else macro=macro+" "+"@non-anim-nom"; macro=macro.trim(); //Skip date time entities from animacy classification if(semClass.contains("STATE") || semClass.contains("NATION") || semClass.startsWith("ORG_DESC") || semClass.contains("DATE")||semClass.contains("TIME")||semClass.contains("QUANTITY")||semClass.contains("CARDINAL") || semClass.contains("PERCENT")) macro=""; //Eliminate collective nouns if(lex.equals("audience") || lex.equals("band") || lex.equals("group") || lex.equals("team") || lex.equals("club") || lex.equals("congregation")) macro=""; return macro; } //Applies rules to discern whether noun should have number agreement for the copula macro public String agrMacroDecider(String macro,String semClass,String pos,String lex){ if(pos.equals("NN")){ //if(lex.equals("couple") || lex.equals("following") ||lex.equals("rest") || semClass.contains("STATE") || semClass.contains("NATION") || semClass.startsWith("ORG_DESC") || semClass.contains("DATE")||semClass.contains("TIME")||semClass.contains("QUANTITY")||semClass.contains("CARDINAL") ||semClass.endsWith("'S")) if(lex.equals("couple") || semClass.startsWith("ORG_DESC") || lex.equals("following") ||lex.equals("rest") || semClass.contains("STATE") || semClass.contains("NATION") || semClass.contains("DATE")||semClass.contains("TIME")||semClass.contains("QUANTITY")||semClass.contains("CARDINAL") || semClass.contains("PERCENT") || semClass.endsWith("'S")) macro=""; if(semClass.length()==0) macro=""; } else{ } //System.out.println(macro); return macro; } public String whLex=""; public void storeWHLex(String whLex){ //System.out.println("Raja: "+whLex); this.whLex=whLex; } public String getWHLex(){ return this.whLex; } } ================================================ FILE: src/opennlp/ccgbank/extract/MorphExtract.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// //Program which takes in the /tmp/temp.xml file generated and forms a morph.xml file package opennlp.ccgbank.extract; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.sax.SAXResult; import javax.xml.transform.sax.SAXSource; import javax.xml.transform.sax.SAXTransformerFactory; import opennlp.ccgbank.extract.ExtractGrammar.ExtractionProperties; import org.apache.xml.serializer.OutputPropertiesFactory; import org.apache.xml.serializer.Serializer; import org.apache.xml.serializer.SerializerFactory; import org.jdom.JDOMException; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLFilter; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; public class MorphExtract { public static void extractMorph(ExtractionProperties extractProps) throws TransformerException, TransformerConfigurationException, SAXException, IOException, JDOMException { System.out.println("Extracting morph:"); System.out.println("Generating morph.xml"); TransformerFactory tFactory = TransformerFactory.newInstance(); File morphFile = new File(new File(extractProps.destDir), "morph.xml"); File tempFile = new File(new File(extractProps.tempDir), "temp.xml"); if (tFactory.getFeature(SAXSource.FEATURE) && tFactory.getFeature(SAXResult.FEATURE)) { SAXTransformerFactory saxTFactory = ((SAXTransformerFactory) tFactory); ArrayList filterChain = new ArrayList(); ArrayList xslChain = new ArrayList(); if (extractProps.macroSpecs.length() > 0) { } addTransforms(xslChain, extractProps.macroSpecs); for (String xslFile : xslChain) filterChain.add(saxTFactory.newXMLFilter(ExtractGrammar .getSource(xslFile))); // Create an XMLReader and set first xsl transform to that. XMLReader reader = XMLReaderFactory.createXMLReader(); XMLFilter xmlFilter0 = filterChain.get(0); xmlFilter0.setParent(reader); //Create chain of xsl transforms // Create an XMLFilter for each stylesheet. 
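// Wire each filter's parent to the previous one so that SAX events flow from the XMLReader through
// morphExtr.xsl and any macro-insertion stylesheets before being serialized to morph.xml below.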
for (int i = 1; i < filterChain.size(); i++) { XMLFilter xmlFilterPrev = filterChain.get(i - 1); XMLFilter xmlFilterCurr = filterChain.get(i); xmlFilterCurr.setParent(xmlFilterPrev); } XMLFilter xmlFilter = filterChain.get(filterChain.size() - 1); java.util.Properties xmlProps = OutputPropertiesFactory .getDefaultMethodProperties("xml"); xmlProps.setProperty("indent", "yes"); xmlProps.setProperty("standalone", "no"); xmlProps.setProperty("{http://xml.apache.org/xalan}indent-amount", "2"); Serializer serializer = SerializerFactory.getSerializer(xmlProps); serializer.setOutputStream(new FileOutputStream(morphFile)); //XMLFilter xmlFilter = xmlFilter2; //XMLFilter xmlFilter = xmlFilter3; xmlFilter.setContentHandler(serializer.asContentHandler()); xmlFilter.parse(new InputSource(tempFile.getPath())); } //Deleting the temporary lex file //tempFile.delete(); } public static void addTransforms(ArrayList xslChain, String macroSpecs) { xslChain.add("opennlp.ccgbank/transform/morphExtr.xsl"); if (macroSpecs.length() == 0) xslChain.add("opennlp.ccgbank/transform/macroInsert.xsl"); if (macroSpecs.contains("agr")) { System.out .println("Inserting a macro to check agreement in the copula"); xslChain.add("opennlp.ccgbank/transform/agr-macroInsert.xsl"); } if (macroSpecs.contains("anim")) { System.out .println("Inserting a macro to check animacy constraints"); xslChain.add("opennlp.ccgbank/transform/anim-macroInsert.xsl"); } } } ================================================ FILE: src/opennlp/ccgbank/extract/RulesExtract.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
////////////////////////////////////////////////////////////////////////////// //Program which extracts unary rules and their frequencies and finally outputs the rules.xml file package opennlp.ccgbank.extract; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.util.Arrays; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.sax.SAXResult; import javax.xml.transform.sax.SAXSource; import javax.xml.transform.sax.SAXTransformerFactory; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; import opennlp.ccgbank.extract.ExtractGrammar.ExtractionProperties; import org.apache.xml.serializer.OutputPropertiesFactory; import org.apache.xml.serializer.Serializer; import org.apache.xml.serializer.SerializerFactory; import org.jdom.JDOMException; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLFilter; import org.xml.sax.XMLReader; import org.xml.sax.helpers.XMLReaderFactory; public class RulesExtract { public static void extractRules(ExtractionProperties extractProps) throws TransformerException, TransformerConfigurationException,SAXException, IOException,JDOMException{ System.out.println("Extracting rule info:"); File rulesFile = new File(new File(extractProps.destDir), "rules.xml"); File tempFile = new File(new File(extractProps.tempDir), "temp-rules.xml"); PrintWriter tempOut=new PrintWriter(new FileOutputStream(tempFile),true); File ccgbankDir = new File(extractProps.srcDir); File[] ccgbankSections=ccgbankDir.listFiles(); Arrays.sort(ccgbankSections); RulesTally.RULE_FREQ_CUTOFF = extractProps.ruleFreqCutoff; RulesTally.KEEP_UNMATCHED = !extractProps.skipUnmatched; // add root tempOut.println(""); TransformerFactory tFactory = TransformerFactory.newInstance(); Transformer transformer = tFactory.newTransformer(ExtractGrammar.getSource("opennlp.ccgbank/transform/rulesExtr.xsl")); for (int i=extractProps.startSection; i<=extractProps.endSection; i++){ File[] files=ccgbankSections[i].listFiles(); Arrays.sort(files); int fileStart = 0; int fileLimit = files.length; if (extractProps.fileNum >= 0) { fileStart = extractProps.fileNum; fileLimit = extractProps.fileNum + 1; } for (int j=fileStart; j"); tempOut.close(); RulesTally.printTally(extractProps); System.out.println("Generating rules.xml"); if (tFactory.getFeature(SAXSource.FEATURE) && tFactory.getFeature(SAXResult.FEATURE)){ SAXTransformerFactory saxTFactory = ((SAXTransformerFactory) tFactory); // Create an XMLFilter for each stylesheet. XMLFilter xmlFilter1 = saxTFactory.newXMLFilter(ExtractGrammar.getSource("opennlp.ccgbank/transform/ccgRules.xsl")); //XMLFilter xmlFilter3 = saxTFactory.newXMLFilter(new StreamSource("foo3.xsl")); // Create an XMLReader. XMLReader reader = XMLReaderFactory.createXMLReader(); // xmlFilter1 uses the XMLReader as its reader. 
xmlFilter1.setParent(reader); java.util.Properties xmlProps = OutputPropertiesFactory.getDefaultMethodProperties("xml"); xmlProps.setProperty("indent", "yes"); xmlProps.setProperty("standalone", "no"); xmlProps.setProperty("{http://xml.apache.org/xalan}indent-amount", "2"); Serializer serializer = SerializerFactory.getSerializer(xmlProps); serializer.setOutputStream(new FileOutputStream(rulesFile)); XMLFilter xmlFilter = xmlFilter1; xmlFilter.setContentHandler(serializer.asContentHandler()); xmlFilter.parse(new InputSource(tempFile.getPath())); } //Deleting the temporory lex file //lexiconTempFile.delete(); } } ================================================ FILE: src/opennlp/ccgbank/extract/RulesTally.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Class which stores unary rule freqs //This class is invoked by the RulesExtr.xsl transform package opennlp.ccgbank.extract; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.PrintWriter; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.StringTokenizer; import opennlp.ccgbank.extract.ExtractGrammar.ExtractionProperties; public class RulesTally { // Frequency cutoff for including an extracted rule public static int RULE_FREQ_CUTOFF = 1; // Flag for whether to keep unmatched rules in extracted grammar public static boolean KEEP_UNMATCHED = true; //Unary Rule Freq private static Map ruleFreq = new HashMap(); //Rule Occurrence private static Map> ruleOccur = new HashMap>(); //Sentence id private static String id=""; /** Resets the statically held tallies. 
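 * Because ruleFreq, ruleOccur and the sentence id are static, counts would otherwise carry over
 * between extraction runs within the same JVM.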
*/ public static void reset() { ruleFreq = new HashMap(); ruleOccur = new HashMap>(); id=""; } //Proc which traps and stores id of each sentence public String storeId(String x){ if(x.length()>0){ id = x; int posEquals = x.indexOf('='); if (posEquals > 0) { id = x.substring(posEquals+1); } } return id; } //Loads freq tables, returns rule name public String loadTally(String res, String arg) throws FileNotFoundException{ arg = arg.replaceAll("_\\d", ""); res = res.replaceAll("_\\d", ""); String rule = arg+"_to_"+res; List temp; //Freq table entry opened at first instance of rule if (!ruleFreq.containsKey(rule)) { ruleFreq.put(rule,1); temp = new ArrayList(4); temp.add(id); ruleOccur.put(rule,temp); } else { int freq = ruleFreq.get(rule)+1; ruleFreq.put(rule,freq); temp = ruleOccur.get(rule); } //First 4 instances of rules stored if(temp.size()<4 && !temp.contains(id)){ temp.add(id); ruleOccur.put(rule,temp); } return rule; } public static void printTally(ExtractionProperties extractProps) throws FileNotFoundException { RulesTally.printTally(new File(extractProps.tempDir)); } //Rule Frequencies printed to file public static void printTally(File directory) throws FileNotFoundException{ System.out.println("Generating RuleFreq.html"); //Freq Output file File freqFile = new File(directory, "RuleFreq.html"); PrintWriter output=new PrintWriter(new FileOutputStream(freqFile)); List ruleList = FreqTally.sortTally(ruleFreq); //Printing the final ouput in html form output.flush(); output.println(""); output.println(""); output.println("");output.println("Unary Rule Info");output.println(""); output.println(""); output.println(""); output.flush(); String ccgbankHome = System.getProperty("CCGBANK_HOME", "/home/corpora/EN/ccgbank"); for (int i=0; i"); output.println(i+1+" Rule: "+rule+" Freq: "+freq); output.println("
    "); output.flush(); List rules = ruleOccur.get(rule); output.flush(); output.println("
      "); output.println("
    • "); output.flush(); for (int j=0; j"); id=rules.get(j); String[]idInfo=id.split("\\."); StringTokenizer st=new StringTokenizer(id,"."); String idLink=""; String sentNo=""; String dir=""; //2 courses of action depending on whether input is gold std .auto parses or C&C .auto parses if(idInfo.length==2){ idLink=st.nextToken()+".html"; sentNo="#Sentence "+st.nextToken(); dir=id.substring(4,6); } else { idLink=idInfo[0]; sentNo="#Sentence "+idInfo[0]; dir=idInfo[0]; } //System.out.println(idLink); output.println("
    • "); output.println(id); output.println(" "); output.println(idLink); output.println(""); output.println("
    • "); output.println("
    "); } output.flush(); output.println(""); output.println(""); output.println(""); output.flush(); } output.flush(); output.println(""); output.println(""); output.flush(); output.close(); } //Invoked by RulesExtr.xsl to check repetition of categories public boolean checkRuleStatus(String rule) { Integer freq = ruleFreq.get(rule); return (freq != null && freq == 1); } //Checks the freq of a rule public boolean checkRuleFreqStatus(String rule){ int freq = ruleFreq.get(rule); //Freqs >= cutoff accepted return (freq >= RULE_FREQ_CUTOFF); } // returns flag public boolean keepUnmatched() { return KEEP_UNMATCHED; } } ================================================ FILE: src/opennlp/ccgbank/extract/Testbed.java ================================================ /////////////////////////////////////////////////////////////////////////////// // Copyright (C) 2005-2009 Scott Martin, Rajakrishan Rajkumar and Michael White // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public // License as published by the Free Software Foundation; either // version 2.1 of the License, or (at your option) any later version. // // This library is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU Lesser General Public License for more details. // // You should have received a copy of the GNU Lesser General Public // License along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ////////////////////////////////////////////////////////////////////////////// //Program which creates a temp.xml file from the bareparse. temp.xml serves are the input for creating lexicon.xml & morph.xml package opennlp.ccgbank.extract; import java.io.*; import java.util.*; import opennlp.ccgbank.CCGBankTaskSources; import opennlp.ccgbank.CCGBankTaskTestbed; import opennlp.ccg.grammar.Grammar; import opennlp.ccg.grammar.RuleGroup; import opennlp.ccg.hylo.*; import opennlp.ccg.lexicon.*; import opennlp.ccg.parse.ParseException; import opennlp.ccg.synsem.*; import opennlp.ccg.test.*; import opennlp.ccg.unify.*; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMException; import org.jdom.input.SAXBuilder; import org.jdom.output.Format; import org.jdom.output.XMLOutputter; /** * Creates test files under in 'test' dir under extracted grammar. 
*/ public class Testbed { // the grammar private Grammar grammar; private Lexicon lexicon; private RuleGroup rules; // supertagger stand-in private SupertaggerStandIn supertaggerStandIn = new SupertaggerStandIn(); // results of following deriv private Sign sign = null; private LF lf = null; private String str = ""; private int numParses = 0; private String header = ""; // Store info related to treenodes in the xml deriv private static ArrayList treeInfo = new ArrayList(); private static boolean treeInfoFlag = false; // Store details of preds (nomId key: pos,stag, pos and pred name) private static Hashtable predInfo = new Hashtable(); // supertag-rule combos private Set combos = null; Set sourcesSet; CCGBankTaskTestbed ccgBankTaskTestbed; File grammarFile, targetDirectory; // constructor public Testbed(Set sourcesSet, File targetDirectory, CCGBankTaskTestbed testbed) throws IOException { grammarFile = new File(targetDirectory, "grammar.xml"); this.grammar = new Grammar(grammarFile.toURI().toURL(), true); this.lexicon = grammar.lexicon; this.rules = grammar.rules; this.sourcesSet = sourcesSet; this.targetDirectory = targetDirectory; this.ccgBankTaskTestbed = testbed; } // main method for creating test files @SuppressWarnings("rawtypes") public void createTestFiles() throws IOException, JDOMException { ccgBankTaskTestbed.log("Creating test files:"); // config grammar Tokenizer tokenizer = grammar.lexicon.tokenizer; grammar.prefs.showFeats = true; grammar.prefs.showSem = ccgBankTaskTestbed.isShowsSem(); // ensure test dir exists File testDir = new File(targetDirectory, "test"); testDir.mkdirs(); ccgBankTaskTestbed.log("Writing test files to: " + testDir.getPath()); // text, class-replaced text factors etc. output PrintWriter textPW = null; PrintWriter textscPW = null; PrintWriter factorsPW = null; PrintWriter combosPW = null; PrintWriter predsPW = null; PrintWriter treePW = null; File textFile = ccgBankTaskTestbed.getText(); File factorsFile = ccgBankTaskTestbed.getFactors(); File combosFile = ccgBankTaskTestbed.getCombos(); File predsFile = ccgBankTaskTestbed.getPreds(); File treeFile = ccgBankTaskTestbed.getTree(); if (textFile != null) { File textscFile=new File(textFile.getParent()+"/"+textFile.getName().replaceFirst("text-","textsc-")); ccgBankTaskTestbed.log("Writing text to: " + textFile); ccgBankTaskTestbed.log("Writing class-replaced text to: " + textscFile); textFile.getParentFile().mkdirs(); textPW = new PrintWriter(new BufferedWriter(new FileWriter(textFile))); textscPW = new PrintWriter(new BufferedWriter(new FileWriter(textscFile))); } if (factorsFile != null) { ccgBankTaskTestbed.log("Writing factors to: " + factorsFile); factorsFile.getParentFile().mkdirs(); factorsPW = new PrintWriter(new BufferedWriter(new FileWriter(factorsFile))); } if (combosFile != null) { ccgBankTaskTestbed.log("Writing supertag-rule combos to: " + combosFile); combosFile.getParentFile().mkdirs(); combos = new HashSet(); combosPW = new PrintWriter(new BufferedWriter(new FileWriter(combosFile))); } if (predsFile != null) { ccgBankTaskTestbed.log("Writing preds to: " + predsFile); predsFile.getParentFile().mkdirs(); predsPW = new PrintWriter(new BufferedWriter(new FileWriter(predsFile))); } if (treeFile != null) { ccgBankTaskTestbed.log("Writing tree node info to: " + treeFile); treeFile.getParentFile().mkdirs(); treePW = new PrintWriter(new BufferedWriter(new FileWriter(treeFile))); treeInfoFlag = true; } // jdom stuff SAXBuilder builder = new SAXBuilder(); XMLOutputter outputter = new 
XMLOutputter(Format.getPrettyFormat()); // counters int numWithLFs = 0; int numSingleRootLFs = 0; int numWithoutLFs = 0; for (CCGBankTaskSources sources : sourcesSet) { for (File file : sources) { File testSectDir = new File(testDir, file.getParentFile().getName()); testSectDir.mkdir(); ccgBankTaskTestbed.log("Debug Print: " + testSectDir.getAbsolutePath()); // parse derivations Document inDoc = builder.build(file); Element inRoot = inDoc.getRootElement(); // make test doc, sign map Document outDoc = new Document(); Element outRoot = new Element("regression"); outDoc.setRootElement(outRoot); Map signMap = new HashMap(); // loop through derivations, making test items List derivElts = inRoot.getChildren(); for (Object derivObj : derivElts) { Element derivElt = (Element) derivObj; followDeriv(derivElt); if (lf != null) { numWithLFs++; // check for single root if (lf instanceof SatOp) numSingleRootLFs++; // add test item, sign Element item = RegressionInfo.makeTestItem(grammar, str, numParses, lf); if (header == null) { header = "missing"; ccgBankTaskTestbed.log("Warning: missing header in " + file); } item.setAttribute("info", header); if (header != null) signMap.put(header, sign); // Add parsed words as a separate LF element Element fullWordsElt = new Element("full-words"); fullWordsElt.addContent(tokenizer.format(sign.getWords())); // Add info about LF lexical preds as a separate element Element predInfoElt = new Element("pred-info"); String predInfoText = collectPredInfo(header); predInfoElt.setAttribute("data", predInfoText); item.addContent(fullWordsElt); item.addContent(predInfoElt); outRoot.addContent(item); // append to text, factors files if (textPW != null) textPW.println(str); if (textscPW != null) { textscPW.flush(); String textsc=""; //Note sem class replacement works only for NE classes spec in the grammar file textsc=tokenizer.getOrthography((List)sign.getWords(),true); textscPW.println(textsc); textscPW.flush(); } if (factorsPW != null) factorsPW.println(tokenizer.format(sign.getWords())); // append new combos to combos file if (combosPW != null) { for (String combo : newCombos()) combosPW.println(combo); } // also to preds if (predsPW != null) predsPW.println(predInfoText); if (treePW != null) { for (String info : treeInfo) { treePW.println(info); treePW.flush(); } } treeInfo = new ArrayList(); } else numWithoutLFs++; } // write test doc, saved signs File regressionFile = new File(testSectDir, file.getName()); outputter.output(outDoc, new FileOutputStream(regressionFile)); RegressionInfo.writeSerFile(signMap, regressionFile); } } // flush text, factors, combos, preds, tree files if (textPW != null) { textPW.flush(); textPW.close(); } if (factorsPW != null) { factorsPW.flush(); factorsPW.close(); } if (combosPW != null) { combosPW.flush(); combosPW.close(); } if (predsPW != null) { predsPW.flush(); predsPW.close(); } if (treePW != null) { treePW.flush(); treePW.close(); } // summary ccgBankTaskTestbed.log("numWithLFs: " + numWithLFs); ccgBankTaskTestbed.log("numSingleRootLFs: " + numSingleRootLFs); ccgBankTaskTestbed.log("numWithoutLFs: " + numWithoutLFs); ccgBankTaskTestbed.log("total: " + (numWithLFs + numWithoutLFs)); } private void followDeriv(Element derivElt) { // reset sign = null; lf = null; str = ""; header = derivElt.getAttributeValue("Header"); // bookkeeping UnifyControl.startUnifySequence(); try { Category cat = null; Nominal index = null; LF flatLF = null; // recurse through deriv SignHash signs = followDerivR(derivElt); // set results, using first available sign 
(ie some arbitrary one) if (!signs.isEmpty()) { Iterator iter = signs.asSignSet().iterator(); // System.out.println("Processing file no: "+header); // Count of single rooted LFs produced by the constrained parser int matchSRLF = 0; // Check whether any of the signs have a single rooted LF while (iter.hasNext()) { // System.out.println("Found LF"); sign = iter.next(); cat = sign.getCategory(); index = cat.getIndexNominal(); flatLF = cat.getLF(); if (flatLF != null) { lf = HyloHelper.compactAndConvertNominals(flatLF, index, sign); // Break when the first single rooted LF is encountered if (lf instanceof SatOp) { matchSRLF++; // System.out.println("Single root LF found"); break; } } } // If no single rooted LF is there, using first available sign // (ie some arbitrary one) if (matchSRLF == 0) { sign = signs.asSignSet().iterator().next(); cat = sign.getCategory(); index = cat.getIndexNominal(); flatLF = cat.getLF(); if (flatLF != null) lf = HyloHelper.compactAndConvertNominals(flatLF, index, sign); } if (flatLF != null) { extrPredInfo(flatLF, ""); } numParses = signs.size(); str = str.trim(); } } catch (ParseException exc) { ccgBankTaskTestbed.log("Warning for " + header + ": " + exc.toString()); } } // recurse through deriv, returning signs @SuppressWarnings({ "rawtypes", "unchecked" }) private SignHash followDerivR(Element derivElt) throws ParseException { String eltName = derivElt.getName(); // follow deriv, applying combinatory rules // nb: no checks made for intended deriv! if (eltName.equals("Treenode")) { String cat = derivElt.getAttributeValue("cat"); String ntId = derivElt.getAttributeValue("nt_id"); String simpleCat = derivElt.getAttributeValue("stag"); List childElts = derivElt.getChildren(); int numChildren = childElts.size(); if (numChildren == 0) throw new ParseException(header + ": no child elements for TreeNode for cat: " + cat); // if no cat element present, adjust list with an initial dummy node, // to avoid code changes in what follows Element elt0 = (Element) childElts.get(0); String elt0name = elt0.getName(); if (elt0name.equals("Treenode") || elt0name.equals("Leafnode")) { childElts.add(0, new Element("dummy")); numChildren++; } if (numChildren != 2 && numChildren != 3) throw new ParseException(header + ": wrong number of child elements: " + numChildren + " for cat: " + cat); Element firstInputElt = (Element) childElts.get(1); SignHash firstSigns = followDerivR(firstInputElt); SignHash retval = new SignHash(); // unary case if (numChildren == 2) { // apply rules for (Sign s : firstSigns.asSignSet()) { List results = rules.applyUnaryRules(s); for (Sign rSign : results) retval.insert(rSign); } // caution/warn upon failure if (!containsCat(retval, simpleCat)) { boolean noResults = retval.isEmpty(); String inCat = firstInputElt.getAttributeValue("cat"); String msg = "Unable to derive: " + cat + " from: " + inCat; if (!noResults) ccgBankTaskTestbed.log("Caution for " + header + ": " + msg); if (ccgBankTaskTestbed.isDebugDerivations()) { ccgBankTaskTestbed.log(header + ": derivation stymied; inputs: "); for (Sign s : firstSigns.asSignSet()) { ccgBankTaskTestbed.log(s.toString()); } if (!noResults) { ccgBankTaskTestbed.log("Outputs: "); for (Sign s : retval.asSignSet()) ccgBankTaskTestbed.log(s.toString()); } } if (noResults) throw new ParseException("Derivation blocked: " + msg); } } // binary case else if (numChildren == 3) { Element secondInputElt = (Element) childElts.get(2); SignHash secondSigns = followDerivR(secondInputElt); // apply rules for (Sign sign1 : 
firstSigns.asSignSet()) { for (Sign sign2 : secondSigns.asSignSet()) { List results = rules.applyBinaryRules(sign1, sign2); for (Sign rSign : results) retval.insert(rSign); } } // if no results, propagate one input if the other is // internal punct if (retval.isEmpty()) { if (isPunct(secondInputElt)) return firstSigns; else if (isPunct(firstInputElt)) return secondSigns; } // caution/warn upon failure if (!containsCat(retval, simpleCat)) { boolean noResults = retval.isEmpty(); String inCat1 = firstInputElt.getAttributeValue("cat"); String inCat2 = secondInputElt.getAttributeValue("cat"); String msg = "Unable to derive: " + cat + " from: " + inCat1 + " and: " + inCat2; if (!noResults) ccgBankTaskTestbed.log("Caution for " + header + ": " + msg); if (ccgBankTaskTestbed.isDebugDerivations()) { ccgBankTaskTestbed.log(header + ": derivation stymied; first inputs: "); for (Sign sign1 : firstSigns.asSignSet()) { ccgBankTaskTestbed.log(sign1.toString()); } ccgBankTaskTestbed.log("Second inputs: "); for (Sign sign2 : secondSigns.asSignSet()) { ccgBankTaskTestbed.log(sign2.toString()); } if (!noResults) { ccgBankTaskTestbed.log("Outputs: "); for (Sign s : retval.asSignSet()) ccgBankTaskTestbed.log(s.toString()); } } if (noResults) throw new ParseException("Derivation blocked: " + msg); } } // Store cat ids of tree nodes for printing to aux files if (treeInfoFlag) { for (Sign s : retval.asSignSet()) { Hashtable idConvTally = new Hashtable(); Hashtable freqTally = new Hashtable(); ArrayList fullCat = new ArrayList(); String catId = ""; Category treeCat = s.getCategory(); // System.out.println(header+" "+ntId+" "+treeCat); recurseCat(treeCat, fullCat, idConvTally, freqTally); /* * System.out.println(freqTally); * System.out.println(fullCat); System.out.println('\n'); */ if (fullCat.size() > 1) { for (String x : fullCat) { String y[] = x.split("_"); if (y.length == 1) { catId = catId + "," + y[0]; continue; } int freq = freqTally.get(y[1]); freqTally.put(y[1], freq - 1); if (x.endsWith("_M") && freq <= 1) x = x.replaceFirst("_M", ""); catId = catId + "," + x; } catId = catId.replaceFirst(",", ""); treeInfo.add(header + " " + ntId + " " + catId); } /* * System.out.println(idConvTally); * System.out.println(fullCat); System.out.println('\n'); */ } } // done return retval; } // lex lookup // nb: not always insisting on right POS, b/c hashing strategy uses // surface words, // thus doesn't distinguish lex signs based solely on POS // nb: might make sense to warn on lex cats with missing semantics else if (eltName.equals("Leafnode")) { try { String lex = derivElt.getAttributeValue("lexeme"); Word w = lexicon.tokenizer.parseToken(lex); str += w.getForm() + " "; String cat = derivElt.getAttributeValue("cat"); String simpleCat = derivElt.getAttributeValue("stag"); String rel = derivElt.getAttributeValue("rel"); String indexRel = derivElt.getAttributeValue("indexRel"); String semClass = ""; semClass = derivElt.getAttributeValue("class"); String roles = derivElt.getAttributeValue("argRoles"); String pos = derivElt.getAttributeValue("pos"); // nb: for now, need to ignore rel for non-VB pos if (!pos.startsWith("VB")) rel = null; // lex lookup with required supertag // NB: there's no guarantee of getting the right arg roles if the word-cat pair is observed lexicon.setSupertagger(supertaggerStandIn); supertaggerStandIn.setTag(simpleCat); SignHash lexSigns = lexicon.getSignsFromWord(w); if (semClass == null || semClass.length() == 0) semClass = "NoClass"; // add lex signs, filtered by rel, reindexed // also check 
number with matching pos, match on no class int matchPOS = 0; boolean matchNoClass = false; for (Iterator it = lexSigns.asSignSet().iterator(); it.hasNext();) { Sign s = it.next(); Word wTemp = s.getWords().get(0); String morphClass = wTemp.getSemClass(); if (morphClass == null || morphClass.length() == 0) morphClass = "NoClass"; Category lexcat = s.getCategory(); LF lexLF = lexcat.getLF(); // allow any class if no sem class given if (!(semClass.equals("NoClass") || semClass.equals(morphClass)) || !containsPred(lexLF, rel) || !containsRoles(lexLF, roles) || !containsRel(lexLF, indexRel, s)) { it.remove(); } else { UnifyControl.reindex(lexcat); if (wTemp.getPOS().equals(pos)) { matchPOS++; if (semClass.equals("NoClass") && morphClass.equals("NoClass")) matchNoClass = true; } } } // filter by pos unless none match if (matchPOS > 0) { for (Iterator it = lexSigns.asSignSet().iterator(); it.hasNext();) { Sign s = it.next(); Word wTemp = s.getWords().get(0); if (!wTemp.getPOS().equals(pos)) { it.remove(); continue; } // filter by mismatched class if apropos if (matchNoClass) { String morphClass = wTemp.getSemClass(); if (morphClass != null && morphClass.length() != 0) it.remove(); } } } if (lexSigns.isEmpty()) throw new LexException("No matching category " + cat + " for: " + w); return lexSigns; } catch (LexException exc) { // try continuing derivations without lex signs for punctuation, // otherwise throw parse exception if (isPunct(derivElt)) { if (ccgBankTaskTestbed.isDebugDerivations()) { ccgBankTaskTestbed.log(header + ": " + exc.toString()); } return new SignHash(); } throw new ParseException(exc.toString()); } catch (RuntimeException exc) { // for other exceptions, throw parse exception throw new ParseException(exc.toString()); } } else throw new RuntimeException(header + ": unrecognized element in derivation: " + eltName); } // Recurse through a CCG cat and print out the atomcats and their ids private static void recurseCat(Category cat, ArrayList fullCat, Hashtable idConvTally, Hashtable freqTally) { if (cat instanceof ComplexCat) { ComplexCat cc = (ComplexCat) cat.copy(); Category resCat = cc.getResult(); recurseCat(resCat, fullCat, idConvTally, freqTally); int argStart = 0; if (resCat instanceof ComplexCat) { ComplexCat temp = (ComplexCat) resCat.copy(); argStart = temp.getArgStack().size(); } ArgStack argStack = cc.getArgStack(argStart); for (int i = 0; i < argStack.size(); i++) { if (argStack.get(i) instanceof BasicArg) { BasicArg bArg = (BasicArg) argStack.get(i); Category argCat = (Category) bArg.getCat(); Slash argSlash = (Slash) bArg.getSlash(); // System.out.println(argSlash.toString()+'\n'); fullCat.add(argSlash.toString()); recurseCat(argCat, fullCat, idConvTally, freqTally); } } } else if (cat instanceof AtomCat) { AtomCat ac = (AtomCat) cat.copy(); FeatureStructure fs = ac.getFeatureStructure(); if (fs.hasAttribute("index")) { String index = fs.getValue("index").toString(); // System.out.println(index); String id[] = index.split(":"); if (!idConvTally.containsKey(id[0])) idConvTally.put(id[0], Integer.toString(idConvTally.size() + 1)); String numId = idConvTally.get(id[0]); String catId = ac.getType() + "_" + numId; if (!freqTally.containsKey(numId)) freqTally.put(numId, 0); int freq = freqTally.get(numId); freqTally.put(numId, freq + 1); if (fs.hasAttribute("mod-index")) catId = catId + "_" + "M"; // System.out.println('\n'); fullCat.add(catId); } } } // returns whether the given LF contains the given the lexical predicate private static boolean containsPred(LF lf, String 
pred) { if (pred == null) return true; if (lf == null) return false; for (SatOp satOp : HyloHelper.getPreds(lf)) { if (HyloHelper.isLexPred(satOp)) { if (HyloHelper.getLexPred(satOp).equals(pred)) return true; } } return false; } // roles in a given LF private static Set rolesSet = new HashSet(); // returns whether the given LF contains the given the lexical predicate private static boolean containsRoles(LF lf, String roles) { if (roles == null) return true; if (lf == null) return false; String[] rolesArray = roles.split("\\s+"); // get roles in LF rolesSet.clear(); for (SatOp satOp : HyloHelper.getPreds(lf)) { if (HyloHelper.isRelPred(satOp)) { rolesSet.add(HyloHelper.getRel(satOp)); } } // check presence of roles in LF for (String role : rolesArray) { if (role.equals("null") || role.equals("e")) continue; if (!rolesSet.contains(role)) return false; } return true; } // returns whether the given LF contains the given indexRel private static boolean containsRel(LF lf, String indexRel, Sign sign) { if (indexRel == null) return true; if (lf == null) return false; indexRel = "<" + indexRel + ">"; /* * System.out.println(sign.getSupertag()+" "+sign.getPOS()); * System.out.println(indexRel); System.out.println(rolesSet); * System.out.println(lf); System.out.println('\n'); */ // check presence of that rel/feat in LF if (!lf.toString().contains(indexRel)) return false; else return true; } // identifies punctuation private static boolean isPunct(Element elt) { String pos = elt.getAttributeValue("pos"); if (pos == null) return false; return (pos.equals("|") || pos.equals(".") || pos.equals(",") || pos.equals(";") || pos.equals(":") || pos.equals("LRB") || pos.equals("RRB") || pos.equals("``") || pos.equals("''")); } // return whether signs contains cat; filter if so private static boolean containsCat(SignHash signs, String cat) { // special case: give free pass to cats with dollars if (!signs.isEmpty() && cat.indexOf('$') >= 0) return true; // check for cat boolean retval = false; for (Sign sign : signs.asSignSet()) { String supertag = sign.getCategory().getSupertag(); // again, give free pass to cats with dollars if (supertag.indexOf('$') >= 0 || cat.equals(supertag)) { retval = true; break; } } // filter if found if (retval) { for (Iterator it = signs.asSignSet().iterator(); it.hasNext();) { Sign sign = it.next(); String supertag = sign.getCategory().getSupertag(); if (supertag.indexOf('$') >= 0 || cat.equals(supertag)) continue; else it.remove(); } } return retval; } // returns new combos for current sign private List newCombos() { List retval = new ArrayList(); newCombos(sign, retval); return retval; } // recursively adds new combos for given sign private void newCombos(Sign s, List retval) { Sign[] inputs = s.getDerivationHistory().getInputs(); if (inputs != null) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < inputs.length; i++) { sb.append(inputs[i].getCategory().getSupertag()).append(' '); } sb.append(s.getDerivationHistory().getRule().name()); String combo = sb.toString(); if (!combos.contains(combo)) { retval.add(combo); combos.add(combo); } for (int i = 0; i < inputs.length; i++) { newCombos(inputs[i], retval); } } } // Extracts nom-id,pos,supertag info related to LF lexical preds private void extrPredInfo(LF lf, String sentId) { // System.out.println(sentId); extractPredInfo(lf, predInfo); } /** * Extracts the nom id, pos, and supertag info related to LF lexical preds, * and puts it in the given map keyed off the nom id. * Note that the map should be cleared for each new LF. 
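 * Each entry maps a nominal id (the part of the nominal index before any ':') to a string of the
 * form supertag:pos:lex, with the individual fields escaped via DefaultTokenizer.escape.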
*/ public static void extractPredInfo(LF lf, Map predInfoMap) { String predData = ""; List preds = HyloHelper.getPreds(lf); for (SatOp pred : preds) { String lexPred = HyloHelper.getLexPred(pred); if (lexPred == null) continue; if (!(pred.getArg() instanceof Proposition)) continue; Proposition p = (Proposition) pred.getArg(); String lex = (p.getName()).toString(); // Get supertag & pos tag info and store that String stag = pred.getOrigin().getSupertag(); String pos = pred.getOrigin().getPOS(); Nominal nom = pred.getNominal(); String nomInd = nom.toString(); String nomIndParts[] = nomInd.split(":"); if (stag == null || pos == null || lex == null) continue; predData = escape(stag) + ":" + escape(pos) + ":" + escape(lex); predInfoMap.put(nomIndParts[0], predData); } } // Collects nom-id,pos,supertag info related to LF lexical preds for this // particular LF private static String collectPredInfo(String sentId) { String predData = ""; for (Enumeration e = predInfo.keys(); e.hasMoreElements();) { String nomId = e.nextElement(); predData = predData + " " + nomId + ":" + predInfo.get(nomId); } predInfo = new Hashtable(); return predData.trim(); } /** * Returns the pred info string for the given pred info map (see extractPredInfo). */ public static String getPredInfo(Map predInfoMap) { String predData = ""; for (String nomId : predInfoMap.keySet()) { predData = predData + " " + nomId + ":" + predInfoMap.get(nomId); } return predData.trim(); } // escapes a string using DefaultTokenizer private static String escape(String s) { return DefaultTokenizer.escape(s); } // stands in for a supertagger during lex lookup private static class SupertaggerStandIn implements SupertaggerAdapter { // map for a single key private Map map = new HashMap(2); public Map getSupertags() { return map; } // set tag void setTag(String tag) { map.clear(); map.put(tag, 1.0); } // dummy implementations public void setIncludeGold(boolean includeGold) {} public void resetBeta() {} public void resetBetaToMax() {} public void nextBeta() {} public void previousBeta() {} public boolean hasMoreBetas() { return false; } public boolean hasLessBetas() { return false; } public double[] getBetas() { return new double[]{1.0}; } public void setBetas(double[] betas) {} public double getCurrentBetaValue() { return 1.0; } } } ================================================ FILE: src/opennlp/ccgbank/lexicon-base.xsl ================================================ ================================================ FILE: src/opennlp/ccgbank/parse/CCGbankDerivation.jjt ================================================ // Grammar to parse all the ccgbank derivations in a given ccgbank file options { MULTI=true; NODE_DEFAULT_VOID=false; NODE_SCOPE_HOOK=true; STATIC=true; USER_TOKEN_MANAGER=false; NODE_PREFIX=""; NODE_USES_PARSER=true; } PARSER_BEGIN(CCGbankDerivation) package opennlp.ccgbank.parse; import java.util.*; import org.apache.tools.ant.Task; //import java.io.*; public class CCGbankDerivation { // The java code to operate the node scope hook static void jjtreeOpenNodeScope(Node n) { ((SimpleNode)n).first_token = getToken(1); } static void jjtreeCloseNodeScope(Node n) { ((SimpleNode)n).last_token = getToken(0); } } PARSER_END(CCGbankDerivation) // enter PROPINFO state (for extra Propbank-derived info) on "{" // nb: this is a workaround for ATOMCAT being perhaps too broadly defined TOKEN: { : PROPINFO } // return to normal on ">" at end of node <*> TOKEN: { "> : DEFAULT } // tokens in PROPINFO state TOKEN: { } TOKEN: { } TOKEN: { } TOKEN: { } // 
for stems and sem roles TOKEN: { } TOKEN: { } TOKEN: { } // regular tokens // mww: ATOMCAT is really too broad, b/c it's used for words too ... TOKEN: { | | ("\\/")*)+|("\\*")+> //The SPL token is for treebank words like "1\/2" or "bassoonist\/pianist\/composer" or "\*" where the middle tokens stand elsewhere for ccg operations. //So the SPL prevents such words from interfering with the tokenization } <*> SKIP: { " " | "\t" | "\n" | "\r" } SimpleNode start(): {} { // Every file consists of one or more treebank entries // Each treebank entry consists of 1 header and 1 or more treenodes or leafnodes ( header() ( LOOKAHEAD(2) leafnode() | treenode()) )+ { return jjtThis; } } void header(): { Token t1 = new Token(); Token t2 = new Token(); Token t3 = new Token(); jjtThis.type="Header"; } { // Header, eg: ID=wsj_2300.1 PARSER=GOLD NUMPARSE=1 t1= [t2= t3=] { jjtThis.header = t1.image + " " + t2.image + " " + t3.image; } } void treenode(): { Token leftover = new Token(); Token head = new Token(); Token dtr = new Token(); Token lex = new Token(); Token sense = new Token(); Token role = new Token(); jjtThis.type="Treenode"; } { // Storing the ccgID. (mww: huh?) // Sample Treenode: // With Propbank roles: "(" " head= dtr= { jjtThis.leftover=leftover.image; } | head= dtr= ) // Headedness, number of dtrs info stored in the SimpleNode data structure { jjtThis.head=head.image; jjtThis.dtr=dtr.image; } // optional propbank info // nb: need to use eg instead of "." once in PROPINFO lex state [ "{" lex= sense= role= { jjtThis.nodeRoles = new ArrayList(3); jjtThis.nodeRoles.add(new SimpleNode.LexSenseRole(lex.image, sense.image, role.image)); } ( lex= sense= role= { jjtThis.nodeRoles.add(new SimpleNode.LexSenseRole(lex.image, sense.image, role.image)); } )* ] // closure of one the root of a treenode // A treenode consists of 1 or more treenodes or leafnodes. ( LOOKAHEAD(2) leafnode() | treenode() )+ ")" // Close of a treenode } void leafnode(): { Token t = new Token(); Token lex = new Token(); Token sense = new Token(); Token role = new Token(); jjtThis.type="Leafnode"; } { /* Sample Leafnode structures: () () () */ "(" " { jjtThis.pos = t.image; } // The pos tag has been repeated in the ccgbank. // For numerals (null) is an entry. // Hence the optional brackets in the bnf below. ( ["("] [")"]) ( t= | t= ) { jjtThis.lex = t.image; } catSpec() // optional propbank info [ "{" lex= sense= role= // role or 'rel' { jjtThis.nodeRoles = new ArrayList(3); jjtThis.nodeRoles.add(new SimpleNode.LexSenseRole(lex.image, sense.image, role.image)); } ( lex= sense= role= { jjtThis.nodeRoles.add(new SimpleNode.LexSenseRole(lex.image, sense.image, role.image)); } )* ] [ role= { jjtThis.argRoles = new ArrayList(4); jjtThis.argRoles.add(role.image); } ( role= { jjtThis.argRoles.add(role.image); } )* ] //">" ")" } void catSpec(): { jjtThis.type="complexcat"; } { // The result category (atomic or complex) ( ( "(" catSpec() ")" [] ) // mww: the extra "atomcat" is really for an index on the complex cat | atomcat() ) // The argument of the function (atomic or complex). // Note: The following could be processed as a separate production "Embedded category" , but this would produce a node of that name. // To avoid that, the preceding BNF notation has been repeated here. 
[ op() ( ( "(" catSpec() ")" [] ) | atomcat() ) ] } void catSpecRedundant(): { jjtThis.type="Redundant"; } { ( ( "(" catSpec() ")" ) | atomcat() ) [ op() ( ( "(" catSpec() ")" ) | atomcat() ) ] } void atomcat(): { jjtThis.type="atomcat"; } { } void op(): { Token t = new Token(); jjtThis.type="op"; } { t= { jjtThis.cat=t.image; } } ================================================ FILE: src/opennlp/ccgbank/parse/SimpleNode.java ================================================ /* Generated By:JJTree: Do not edit this line. SimpleNode.java */ package opennlp.ccgbank.parse; import java.util.List; import opennlp.ccgbank.parse.CCGbankDerivation; import opennlp.ccgbank.parse.CCGbankDerivationTreeConstants; public class SimpleNode implements Node { //Javacc generated variables protected Node parent; protected Node[] children; protected int id; protected CCGbankDerivation parser; //User defined variables // lex, sense, role triples public static class LexSenseRole { public String lex, sense, role; public LexSenseRole(String lex, String sense, String role) { this.lex = lex; this.sense = sense; this.role = role; } } //CCGbank id public String header; //The serial no of the gold standard parse String parseNo = ""; //Node type eg:-Treenode,Leafnode,atomcat etc public String type = ""; //Traps any feature which is leftover public String leftover; //Headedness info 0 or 1 public String head = ""; //No:of daughters of a node public String dtr = ""; //Category Specification public String cat = ""; //Category Specification without co-indexation info in leafnodes public String catRedundant = ""; //Lexical information public String lex = ""; //Part of speech info. eg: RB, IN etc public String pos = ""; // The roles (or rel) that the node plays public List nodeRoles = null; // The arg roles of a verbal cat public List argRoles = null; //First token in the node scope Token first_token; //Final token in the node scope Token last_token; //Function which produces the content of the node. public String print() throws Exception { Token p = first_token; while (p != last_token) { cat = cat + p.image; p = p.next; } return cat + last_token.image; } //The remaining part incl comments is Javacc generated. public SimpleNode(int i) { id = i; } public SimpleNode(CCGbankDerivation p, int i) { this(i); parser = p; } /** * @return the header */ public String getHeader() { return header; } /** * @param header the header to set */ public void setHeader(String header) { this.header = header; } /** * @return the leftover */ public String getLeftover() { return leftover; } public int getId(){ return id; } public void jjtOpen() { } public void jjtClose() { } public void jjtSetParent(Node n) { parent = n; } public Node jjtGetParent() { return parent; } public void jjtAddChild(Node n, int i) { if (children == null) { children = new Node[i + 1]; } else if (i >= children.length) { Node c[] = new Node[i + 1]; System.arraycopy(children, 0, c, 0, children.length); children = c; } children[i] = n; } public Node jjtGetChild(int i) { return children[i]; } public int jjtGetNumChildren() { return (children == null) ? 0 : children.length; } /* You can override these two methods in subclasses of SimpleNode to customize the way the node appears when the tree is dumped. If your output uses more than one line you should override toString(String), otherwise overriding toString() is probably all you need to do. 
*/ @Override public String toString() { return CCGbankDerivationTreeConstants.jjtNodeName[id]; } public String toString(String prefix) { return prefix + toString(); } /* Override this method if you want to customize how the node dumps out its children. */ public void dump(String prefix) { System.out.println(toString(prefix)); if (children != null) { for (int i = 0; i < children.length; ++i) { SimpleNode n = (SimpleNode) children[i]; if (n != null) { n.dump(prefix + " "); } } } } } ================================================ FILE: src/opennlp/ccgbank/parse/grammarInsert ================================================ static void jjtreeOpenNodeScope(Node n) { ((SimpleNode)n).first_token = getToken(1); } static void jjtreeCloseNodeScope(Node n) { ((SimpleNode)n).last_token = getToken(0); } ================================================ FILE: src/opennlp/ccgbank/rules-base.xsl ================================================ ================================================ FILE: src/pom.xml ================================================ 4.0.0 opennlp openccg 0.10.0 1.8 1.8 ${project.build.directory}/generated-sources jdom jdom 1.1 trove trove 1.0.2 org.apache.ant ant 1.9.0 net.sf.jgrapht jgrapht 0.8.3 net.sf.jopt-simple jopt-simple 3.1 junit junit 4.12 jline jline 1.0 org.eclipse.birt.runtime.3_7_1 org.apache.xml.serializer 2.7.1 openccg . maven-compiler-plugin 3.7.0 **/.backup.orig/** srilmbridge/ kenlm/ org.codehaus.mojo javacc-maven-plugin 2.6 jjt generate-sources jjtree-javacc ${basedir}/opennlp/ccgbank/parse/ ${project.build.gen}/jjtree/ jj generate-sources javacc ${project.build.gen}/jjtree/opennlp/ccgbank/parse/ ${project.build.gen}/jjtree/ org.codehaus.mojo build-helper-maven-plugin generate-sources add-source ${project.build.gen} ================================================ FILE: src/srilmbridge/Makefile ================================================ compile: javah -d . -classpath ${OPENCCG_HOME}/classes \ opennlp.ccg.ngrams.SRILMNgramModel g++ -o ${OPENCCG_HOME}/lib/libsrilmbridge.so \ -Wl,-soname,srilmbridge.so \ -I${JAVA_HOME}/include \ -I${JAVA_HOME}/include/linux \ -I${SRILM}/include \ -L${SRILM}/lib/i686 \ srilmbridge.cpp ${SRILM}/lm/src/LM.cc \ -shared -lc -ldstruct -lflm -llattice -lmisc -loolm \ -Wno-deprecated clean: rm ${OPENCCG_HOME}/lib/libsrilmbridge.so rm opennlp_ccg_ngrams_SRILMNgramModel.h ================================================ FILE: src/srilmbridge/srilmbridge.cpp ================================================ /* $Id: srilmbridge.cpp,v 1.13 2007/06/16 22:26:28 coffeeblack Exp $ */ #include #include #include #include #include "opennlp_ccg_ngrams_SRILMNgramModel.h" /* * Bridge from Java to the SRILM toolkit library. Loads a language model based * on specified parameters, then calculates the probability of a word within a * given context. * * Author: Scott Martin (http://www.ling.osu.edu/~scott/) * Version: $Revision: 1.13 $ */ const static unsigned STANDARD = 0, COUNT = 1; /* * The language model we will use to calculate word probabilities. */ LM *lm = NULL; /* * The type of language model in effect, as specified in loadLM. */ unsigned nativeLMType = STANDARD; /* * Loads a language model from a specified file with the specified ngram order. * The parameter lmType specifies what type (format) of language model to * expect. * * Throws: * java.io.IOException If fileName is null or empty or if a problem is * encountered reading the language model file. * java.lang.IllegalStateException If an LM has already been loaded. 
* java.lang.IllegalArgumentException If the specified LM type is not * supported. Currently supports STANDARD (type 0) and COUNT (type 1). */ JNIEXPORT void JNICALL Java_opennlp_ccg_ngrams_SRILMNgramModel_loadLM (JNIEnv *env, jobject obj, jint order, jstring fileName, jint lmType) { if(lm != NULL) { // already loaded env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "LM already loaded"); return; } nativeLMType = lmType; if(nativeLMType < STANDARD || nativeLMType > COUNT) { // only STANDARD and COUNT are allowed env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "LM type not supported"); return; } if(fileName == 0 || env->GetStringLength(fileName) == 0) { env->ThrowNew(env->FindClass("java/io/IOException"), "problem reading LM: empty file name"); return; } Vocab *vocab = new Vocab; lm = (nativeLMType == COUNT) ? (LM *)new NgramCountLM(*vocab, order) : (LM *)new Ngram(*vocab, order); const char* nativeFileName = env->GetStringUTFChars(fileName, NULL); File file(nativeFileName, "r", 0); unsigned lmError = 0; if(((File *)&file)->error() > 0) { lmError = 1; env->ThrowNew(env->FindClass("java/io/IOException"), "problem with LM file"); } else if(!lm->read(file)) { lmError = 1; env->ThrowNew(env->FindClass("java/io/IOException"), "problem reading LM"); } //TODO the following just repeats finalize(), should be in reusable function if(lmError > 0) { // destroy lm so this can be called again vocab->~Vocab(); if(lm != NULL) { if(nativeLMType == STANDARD) { ((Ngram *)lm)->~Ngram(); } else if(nativeLMType == COUNT) { ((NgramCountLM *)lm)->~NgramCountLM(); } else { // as a failsafe, call the abstract destructor lm->~LM(); } delete lm; } lm = NULL; //TODO clean up file somehow? } ((File *)&file)->close(); env->ReleaseStringUTFChars(fileName, nativeFileName); } /* * Uses the SRILM toolkit library to calculate the log prob of a word in a * specified context. The context is a history of tokens preceding the * specified word, given in reverse order. For example, to find the * probability of "rain" in the context "in the rain", this method should be * called with "rain" as the parameter `word' and the array {"the", "in"} in * the parameter `context'. * * To calculate the log probability of a single word with no context, call this * method with either (1) NULL, or (2) a zero-length array as the value of * the parameter `context'. * * Throws: * java.lang.IllegalStateException If an error occurred while loading * the LM and word probabilities cannot be computed. * java.lang.IllegalArgumentException If the specified word is null or * zero-length or if the specified context contains a null or zero-length * string. */ JNIEXPORT jfloat JNICALL Java_opennlp_ccg_ngrams_SRILMNgramModel_doLogProb (JNIEnv *env, jobject obj, jstring word, jobjectArray context) { // make sure LM is ok to use if(lm == NULL) { env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "LM not loaded"); return 0; } // sanity checks must throw Java exceptions if(word == NULL || env->GetStringLength(word) == 0) { env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "word is null or zero-length"); return 0; } int contextLength = (context == NULL) ?
0 : env->GetArrayLength(context); VocabString nativeWord = (VocabString)env->GetStringUTFChars(word, NULL); VocabString nativeContext[contextLength + 1]; nativeContext[contextLength] = NULL; // context must be terminated by NULL // build context, converting each Java string to a VocabString jstring jstr = NULL; for(unsigned i = 0; i < contextLength; i++) { jstr = (jstring)env->GetObjectArrayElement(context, i); if(jstr == NULL || env->GetStringLength(jstr) == 0) { env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "context contains null or zero-length string"); env->DeleteLocalRef(jstr); env->ReleaseStringUTFChars(word, nativeWord); // release word // release already translated strings (signed index so the loop terminates when i is 0) for(int j = (int) i - 1; j >= 0; j--) { env->ReleaseStringUTFChars( (jstring)env->GetObjectArrayElement(context, j), nativeContext[j]); } return 0; } nativeContext[i] = (VocabString)env->GetStringUTFChars(jstr, NULL); } if(jstr != NULL) { env->DeleteLocalRef(jstr); } LogP prob = lm->wordProb(nativeWord, nativeContext); // clean up env->ReleaseStringUTFChars(word, nativeWord); // release word // release context strings if any for(unsigned k = 0; k < contextLength; k++) { env->ReleaseStringUTFChars((jstring)env->GetObjectArrayElement(context, k), nativeContext[k]); } return prob; } /* * Should be called by a finalize() method from within Java. Calls the * destructor method on the language model object we are using. */ JNIEXPORT void JNICALL Java_opennlp_ccg_ngrams_SRILMNgramModel_finalize (JNIEnv *env, jobject obj) { if(lm != NULL) { // call local destructors if type was specified if(nativeLMType == STANDARD) { ((Ngram *)lm)->~Ngram(); } else if(nativeLMType == COUNT) { ((NgramCountLM *)lm)->~NgramCountLM(); } else { // as a failsafe, call the abstract destructor lm->~LM(); } delete lm; } } ================================================ FILE: test/grammar.xml ================================================ ================================================ FILE: test/lexicon.xml ================================================ ================================================ FILE: test/morph.xml ================================================ ================================================ FILE: test/opennlp/ccg/alignment/AlignmentTest.java ================================================ package opennlp.ccg.alignment; import static opennlp.ccg.alignment.PhrasePosition.A; import static opennlp.ccg.alignment.PhrasePosition.B; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotSame; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import org.junit.Before; import org.junit.Test; public class AlignmentTest { Integer id; Phrase one, two; Set mappings; Alignment alignment; Map> map, pam; Set twoAVals; @Before public void setUp() throws Exception { id = new Integer(37); one = new Phrase(id, Alignments.tokenize("This is it .")); two = new Phrase(id, Alignments.tokenize("A test this is .")); mappings = new HashSet(); mappings.add(new Mapping(id, 0, 2)); mappings.add(new Mapping(id, 1, 3)); mappings.add(new Mapping(id, 3, 4)); mappings.add(new Mapping(id, 2, 0)); mappings.add(new Mapping(id, 2, 1)); map = new HashMap>(); pam = new HashMap>(); map.put(0, new HashSet(Collections.singleton(2)));
map.put(1, new HashSet(Collections.singleton(3))); map.put(3, new HashSet(Collections.singleton(4))); twoAVals = new HashSet(); twoAVals.add(0); twoAVals.add(1); twoAVals = Collections.unmodifiableSet(twoAVals); map.put(2, twoAVals); pam.put(2, new HashSet(Collections.singleton(0))); pam.put(3, new HashSet(Collections.singleton(1))); pam.put(4, new HashSet(Collections.singleton(3))); pam.put(0, new HashSet(Collections.singleton(2))); pam.put(1, new HashSet(Collections.singleton(2))); alignment = new Alignment(one, two, mappings); assertEquals(mappings, alignment); } @Test public void testCompare() { Phrase o = new Phrase(43, one), t = new Phrase(43, two); Set ms = new HashSet(); for(Mapping m : mappings) { ms.add(m.copyWithPhraseNumber(43)); } Alignment a = new Alignment(o, t, ms); assertEquals(-1, alignment.compareTo(a)); assertEquals(1, a.compareTo(alignment)); assertEquals(0, a.compareTo(a)); assertEquals(0, alignment.compareTo(alignment)); } @Test public void testAlignment() { try { new Alignment(null, two, mappings); fail("able to create alignment with null phrase"); } catch(IllegalArgumentException expected) { // do nothing } try { new Alignment(one, null, mappings); fail("able to create alignment with null phrase"); } catch(IllegalArgumentException expected) { // do nothing } try { new Alignment(one, two, null); fail("able to create alignment with null mappings"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testSize() { assertEquals(mappings.size(), alignment.size()); } @Test public void testGet() { assertEquals(one, alignment.getA()); assertEquals(two, alignment.getB()); assertNotSame(one, alignment.getB()); } @Test public void testAddMapping() { Mapping m = new Mapping(id, 0, 4); assertTrue(alignment.add(m)); assertFalse(alignment.add(new Mapping(id, 3, 4))); alignment.remove(m); try { alignment.add(new Mapping(id + 1, 0, 4)); fail("able to add mapping with non-matching ID"); } catch(IllegalArgumentException expected) { // do nothing } try { alignment.add(new Mapping(id, null, 5)); fail("able to add mapping with null index"); } catch(IllegalArgumentException expected) { // do nothing } try { alignment.add(new Mapping(id, 0, 5)); fail("able to add mapping with out of bounds index"); } catch(IndexOutOfBoundsException expected) { // do nothing } try { alignment.add(new Mapping(id, -2, 3)); fail("able to add mapping with out of bounds index"); } catch(IndexOutOfBoundsException expected) { // do nothing } try { alignment.add(new Mapping(id, 5, 3)); fail("able to add mapping with out of bounds index"); } catch(IndexOutOfBoundsException expected) { // do nothing } try { alignment.add(new Mapping(id, 3, -3)); fail("able to add mapping with out of bounds index"); } catch(IndexOutOfBoundsException expected) { // do nothing } } @Test public void testGetTargets() { Set ts = alignment.getTargets(2, A); Set s = new HashSet(); s.add(0); s.add(1); assertFalse(s.retainAll(ts)); assertEquals(s.size(), ts.size()); ts.add(4); assertTrue(ts.contains(4)); try { ts.remove(4); assertFalse(ts.contains(4)); } catch(UnsupportedOperationException e) { fail("unable to call remove()"); } try { Iterator i = ts.iterator(); i.next(); i.remove(); } catch(UnsupportedOperationException e) { fail("unable to call iterator().remove()"); } } @Test public void testMapEntrySet() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); Set s = new HashSet(twoAVals); for(Map.Entry> e : amap.entrySet()) { if(e.getKey().equals(2)) { assertEquals(s, e.getValue()); } } Iterator>> i = 
bmap.entrySet().iterator(); while(i.hasNext()) { Map.Entry> e = i.next(); if(e.getKey().equals(1)) { assertEquals(Collections.singleton(2), e.getValue()); } else if(e.getKey().equals(4)) { assertEquals(Collections.singleton(3), e.getValue()); } else { try { i.remove(); } catch(UnsupportedOperationException ex) { fail("unable to call Iterator.remove()"); } try { assertTrue(e.getValue().add(3)); assertTrue(e.getValue().contains(3)); assertFalse(e.getValue().add(3)); } catch(UnsupportedOperationException ex) { fail("unable to add to entry value"); } try { assertTrue(e.getValue().remove(3)); assertFalse(e.getValue().contains(3)); assertFalse(e.getValue().remove(3)); } catch(UnsupportedOperationException ex) { fail("unable to remove from entry value"); } try { if(!e.getValue().isEmpty()) { e.getValue().remove(e.getValue().iterator().next()); } } catch(UnsupportedOperationException ex) { fail("unable to call remove() for entry value"); } try { if(!e.getValue().isEmpty()) { Iterator it = e.getValue().iterator(); it.next(); it.remove(); } } catch(UnsupportedOperationException ex) { fail("unable to call remove() for entry value iterator"); } try { e.setValue(new HashSet(Collections.singleton(0))); } catch(UnsupportedOperationException ex) { fail("unable to set entry value"); } } } } @Test public void testMapValues() { Set> as = new HashSet>(); as.add(Collections.singleton(2)); as.add(Collections.singleton(3)); as.add(Collections.singleton(4)); Set s = new HashSet(twoAVals); as.add(s); Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); assertTrue(amap.values().size() == as.size() && amap.values().containsAll(as)); Set> bvals = new HashSet>(); // avoid doubling bvals.addAll(bmap.values()); as.remove(s); as.add(Collections.singleton(0)); as.add(Collections.singleton(1)); as.remove(Collections.singleton(4)); assertEquals(as, bvals); assertTrue(bmap.values().contains(Collections.singleton(2))); try { amap.values().add(Collections.singleton(1)); fail("able to add value"); } catch(UnsupportedOperationException expected) { // do nothing } try { if(!amap.values().isEmpty()) { amap.values().remove(amap.values().iterator().next()); } } catch(UnsupportedOperationException ex) { fail("unable to remove value"); } } @Test public void testMapKeySet() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); assertTrue(amap.keySet().contains(1)); assertTrue(bmap.keySet().contains(3)); assertFalse(amap.keySet().contains(4)); assertFalse(amap.keySet().contains(null)); assertFalse(bmap.keySet().contains(5)); try { amap.keySet().add(4); fail("able to add key to key set"); } catch(UnsupportedOperationException expected) { // do nothing, expected } try { bmap.keySet().add(4); fail("able to add key to key set"); } catch(UnsupportedOperationException expected) { // do nothing, expected } try { amap.keySet().remove(1); assertFalse(amap.keySet().contains(1)); } catch(UnsupportedOperationException ex) { fail("unable to remove key from key set"); } try { bmap.keySet().remove(3); assertFalse(bmap.keySet().contains(3)); } catch(UnsupportedOperationException ex) { fail("unable to remove key from key set"); } try { amap.keySet().clear(); assertTrue(amap.keySet().isEmpty()); } catch(UnsupportedOperationException expected) { fail("unable to clear key set"); } try { bmap.keySet().clear(); assertTrue(bmap.keySet().isEmpty()); } catch(UnsupportedOperationException expected) { fail("able to clear key set"); } try{ amap.keySet().add(5); fail("able to add to key set"); } catch(UnsupportedOperationException ex) { // expected } } 
@Test public void testMapContains() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); for(int i = 0; i < 4; i++) { assertTrue(amap.containsKey(i)); assertTrue(bmap.containsKey(i)); } assertTrue(bmap.containsKey(4)); assertFalse(amap.containsKey(4)); assertTrue(bmap.containsValue(Collections.singleton(2))); Set s = new HashSet(twoAVals); assertTrue(amap.containsValue(s)); } @Test public void testMapGet() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); Set s = new HashSet(twoAVals); assertEquals(s, amap.get(2)); assertEquals(Collections.singleton(3), bmap.get(4)); assertNull(amap.get(4)); try { assertTrue(amap.get(2).contains(1)); amap.get(2).remove(1); assertFalse(amap.get(2).contains(1)); } catch(UnsupportedOperationException expected) { fail("unable remove from value set"); } try { assertFalse(bmap.get(4).contains(2)); bmap.get(4).add(2); assertTrue(bmap.get(4).contains(2)); } catch(UnsupportedOperationException expected) { fail("unable add to value set"); } } @Test public void testMapPut() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); try { assertEquals(Collections.singleton(3), amap.put(1, Collections.singleton(2))); assertEquals(Collections.singleton(2), amap.get(1)); } catch(UnsupportedOperationException expected) { fail("unable to put"); } try { assertEquals(Collections.singleton(0), bmap.put(2, Collections.singleton(3))); assertEquals(Collections.singleton(3), bmap.get(2)); } catch(UnsupportedOperationException expected) { fail("unable to put"); } amap.remove(1); assertNull(amap.put(1, Collections.singleton(0))); } @Test public void testMapRemove() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); try { assertTrue(amap.containsKey(1)); amap.remove(1); assertFalse(amap.containsKey(1)); } catch(UnsupportedOperationException expected) { fail("unable to remove"); } try { assertTrue(bmap.containsKey(2)); bmap.remove(2); assertFalse(bmap.containsKey(2)); } catch(UnsupportedOperationException expected) { fail("unable to remove"); } } @Test public void testMapAdd() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); try { amap.get(1).add(0); assertTrue(amap.get(1).contains(0)); } catch(UnsupportedOperationException expected) { fail("unable to add"); } try { bmap.get(2).add(3); assertTrue(bmap.get(2).contains(3)); } catch(UnsupportedOperationException expected) { fail("unable to add"); } } @Test public void testMapClear() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); try { amap.clear(); assertTrue(amap.isEmpty()); } catch(UnsupportedOperationException expected) { fail("unable to clear"); } try { bmap.clear(); assertTrue(bmap.isEmpty()); } catch(UnsupportedOperationException expected) { fail("unable to clear"); } } @Test public void testAsMap() { Map> amap = alignment.asMap(A), bmap = alignment.asMap(B); assertEquals(map, amap); assertEquals(pam, bmap); assertTrue(map.keySet().containsAll(amap.keySet())); assertTrue(pam.keySet().containsAll(bmap.keySet())); assertTrue(map.values().containsAll(amap.values())); assertTrue(pam.values().containsAll(bmap.values())); assertEquals(4, amap.size()); assertEquals(5, bmap.size()); assertFalse(amap.isEmpty()); assertFalse(bmap.isEmpty()); alignment.add(new Mapping(id, 2, 2)); assertTrue(alignment.asMap(A).get(2).contains(2)); } @Test public void testFromMap() { assertEquals(alignment, Alignment.fromMap(one, two, map)); assertEquals(alignment.reverse(), Alignment.fromMap(two, one, pam)); assertEquals(alignment, Alignment.fromMap(one, two, alignment.asMap(A))); 
assertEquals(alignment.reverse(), Alignment.fromMap(two, one, alignment.asMap(B))); } @Test public void testReverse() { for(PhrasePosition pos : PhrasePosition.values()) { assertEquals(alignment.get(pos), alignment.reverse().get(pos.opposite())); } for(Mapping r : alignment.reverse()) { assertTrue(alignment.contains(r.reverse())); } assertEquals(alignment, alignment.reverse().reverse()); } @Test public void testGetIndices() { Set is = new HashSet(); for(int i = 0; i < 4; i++) { is.add(i); } assertEquals(is, alignment.getIndices(A)); is.add(4); assertEquals(is, alignment.getIndices(B)); } } ================================================ FILE: test/opennlp/ccg/alignment/IdentifiedPhraseReaderWriterTest.java ================================================ package opennlp.ccg.alignment; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.fail; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.junit.Before; import org.junit.Test; public class IdentifiedPhraseReaderWriterTest { String lineSep = System.getProperty("line.separator"); String input = "First phrase.\n Second phrase\r\n Third phrase . ", output = "First phrase." + lineSep + "Second phrase" + lineSep + "Third phrase ." + lineSep, paddedOutput = " First phrase. " + lineSep + " Second phrase " + lineSep + " Third phrase . " + lineSep; List phrases; @Before public void setUp() throws Exception { phrases = new ArrayList(); phrases.add(new Phrase("157", 0, Alignments.tokenize("First phrase."))); phrases.add(new Phrase("387b", 1, Alignments.tokenize("Second phrase"))); phrases.add(new Phrase("55", 2, Alignments.tokenize("Third phrase ."))); } @Test public void testIdentifiedPhraseReader() { try { new IdentifiedPhraseReader(new StringReader(""), null); fail("able to specify null number base"); } catch(IllegalArgumentException expected) { // do nothing } try { new IdentifiedPhraseReader(new StringReader(""), IndexBase.ZERO, null, ""); fail("able to specify null string"); } catch(IllegalArgumentException expected) { // do nothing } try { new IdentifiedPhraseReader(new StringReader(""), IndexBase.ZERO, "", null); fail("able to specify null string"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testBoth() { StringWriter sw = new StringWriter(); IdentifiedPhraseWriter writer = new IdentifiedPhraseWriter(sw); try { for(Phrase p : phrases) { writer.writePhrase(p); } } catch(IOException io) { fail(io.getMessage()); } IdentifiedPhraseReader reader = new IdentifiedPhraseReader(new StringReader(sw.getBuffer().toString())); Iterator i = phrases.iterator(); try { Phrase p; while((p = reader.readPhrase()) != null) { assertEquals(i.next(), p); } } catch(IOException io) { fail(io.getMessage()); } } @Test public void testReadPhrase() { IdentifiedPhraseReader reader = new IdentifiedPhraseReader(new StringReader(input)); try { Iterator i = phrases.iterator(); Phrase p; while((p = reader.readPhrase()) != null) { assertEquals(i.next(), p); } reader = new IdentifiedPhraseReader(new StringReader("")); assertNull(reader.readPhrase()); reader.close(); } catch(IOException io) { fail(io.getMessage()); } } @Test public void testWritePhrase() { StringWriter sw = new StringWriter(); IdentifiedPhraseWriter writer = new IdentifiedPhraseWriter(sw); try { for(Phrase p : phrases) { writer.writePhrase(p); } assertEquals(output, 
sw.getBuffer().toString()); writer.close(); } catch(IOException io) { fail(io.getMessage()); } // test padded version sw = new StringWriter(); writer = new IdentifiedPhraseWriter(sw, writer.getWordSeparator(), writer.getPhraseTag(), writer.getPhraseIdentifierAttribute(), true); try { for(Phrase p : phrases) { writer.writePhrase(p); } assertEquals(paddedOutput, sw.getBuffer().toString()); writer.close(); } catch(IOException io) { fail(io.getMessage()); } } } ================================================ FILE: test/opennlp/ccg/alignment/IndexBaseTest.java ================================================ package opennlp.ccg.alignment; import static opennlp.ccg.alignment.IndexBase.ONE; import static opennlp.ccg.alignment.IndexBase.ZERO; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import org.junit.Test; public class IndexBaseTest { Integer zero = new Integer(0), one = new Integer(1), two = new Integer(2), negOne = new Integer(-1), negTwo = new Integer(-2); @Test public void testGetStart() { assertEquals(zero, ZERO.start); assertEquals(one, ONE.start); } @Test public void testGetNullValue() { assertEquals(negOne, ZERO.nullValue); assertEquals(zero, ONE.nullValue); } @Test public void testIsValidIndex() { assertTrue(ZERO.isValidIndex(zero)); assertTrue(ZERO.isValidIndex(one)); assertTrue(ZERO.isValidIndex(negOne)); assertFalse(ZERO.isValidIndex(negTwo)); assertTrue(ONE.isValidIndex(zero)); assertTrue(ONE.isValidIndex(one)); assertFalse(ONE.isValidIndex(negOne)); assertFalse(ONE.isValidIndex(negTwo)); assertTrue(ZERO.isValidIndex(two)); assertTrue(ONE.isValidIndex(two)); } @Test public void testTranslate() { try { ZERO.translate(negTwo, ONE); fail("ZERO able to translate " + negTwo); } catch(IllegalArgumentException expected) { // do nothing } try { ONE.translate(negOne, ZERO); fail("ONE able to translate " + negOne); } catch(IllegalArgumentException expected) { // do nothing } // identity tests assertEquals(one, ZERO.translate(one, ZERO)); assertEquals(one, ONE.translate(one, ONE)); assertEquals(zero, ZERO.translate(zero, ZERO)); assertEquals(zero, ONE.translate(zero, ONE)); // actual translations assertEquals(zero, ZERO.translate(negOne, ONE)); assertEquals(one, ZERO.translate(zero, ONE)); assertEquals(two, ZERO.translate(one, ONE)); assertEquals(negOne, ONE.translate(zero, ZERO)); assertEquals(zero, ONE.translate(one, ZERO)); assertEquals(one, ONE.translate(two, ZERO)); } } ================================================ FILE: test/opennlp/ccg/alignment/MappingFormatTest.java ================================================ package opennlp.ccg.alignment; import static org.junit.Assert.*; import java.text.ParseException; import java.util.HashSet; import java.util.Set; import opennlp.ccg.alignment.MappingFormat.Field; import org.junit.Before; import org.junit.Test; import static opennlp.ccg.alignment.Alignments.*; public class MappingFormatTest { Set formats; Mapping vanilla, chocolate, nullId, nullValue; @Before public void setUp() throws Exception { MappingFormat moses = MappingFormat.getInstance(MOSES_ENCODING_SCHEME), mosesShort = MappingFormat.getInstance(MOSES_ENCODING_SCHEME, Alignments.MOSES_SHORT_FIELDS), mosesShortStrict = MappingFormat.getInstance(MOSES_ENCODING_SCHEME, Alignments.MOSES_SHORT_FIELDS, true), naacl = MappingFormat.getInstance(NAACL_ENCODING_SCHEME), naaclShort = MappingFormat.getInstance(NAACL_ENCODING_SCHEME, Alignments.NAACL_SHORT_FIELDS), 
naaclShortStrict = MappingFormat.getInstance(NAACL_ENCODING_SCHEME, Alignments.NAACL_SHORT_FIELDS, true), naaclVeryShort = MappingFormat.getInstance(NAACL_ENCODING_SCHEME, Alignments.NAACL_VERY_SHORT_FIELDS); formats = new HashSet(); formats.add(moses); formats.add(mosesShort); formats.add(mosesShortStrict); formats.add(naacl); formats.add(naaclShort); formats.add(naaclShortStrict); formats.add(naaclVeryShort); vanilla = new Mapping(31, 4, 9); chocolate = new Mapping(31, 13, 5, Status.POSSIBLE, 0.75); nullId = new Mapping(null, 2, 2); nullValue = new Mapping(17, -1, 5); } @Test public void testMappingFormat() { Set fields = new HashSet(); fields.add(Field.PHRASE_NUMBER_FIELD); try { MappingFormat.getInstance(MOSES_ENCODING_SCHEME, fields, true); fail("able to create Moses formatter with ID field"); } catch(IllegalArgumentException expected) { // should happen } try { MappingFormat.getInstance(NAACL_ENCODING_SCHEME, fields, false); fail("able to create NAACL formatter with only ID field"); } catch(IllegalArgumentException expected) { // should happen } try { MappingFormat.getInstance(MOSES_ENCODING_SCHEME, null, true); fail("able to create Moses formatter with null fields"); } catch(IllegalArgumentException expected) { // should happen } try { MappingFormat.getInstance(NAACL_ENCODING_SCHEME, null, false); fail("able to create NAACL formatter with null fields"); } catch(IllegalArgumentException expected) { // should happen } try { MappingFormat.getInstance(null, fields, true); fail("able to create formatter null scheme"); } catch(IllegalArgumentException expected) { // should happen } } @Test public void testFormatMapping() { for(MappingFormat mf : formats) { String v = mf.format(vanilla), c = mf.format(chocolate), ni = null; EncodingScheme es = mf.encodingScheme; try { ni = mf.formatMapping(nullId); } catch(IllegalArgumentException e) { if(!es.getRequired().contains(MappingFormat.Field.PHRASE_NUMBER_FIELD)) { fail("unexpected exception: " + e.getMessage()); } } try { mf.format(nullValue); fail("able to format mapping with null index"); } catch(IllegalArgumentException expected) { // should happen } if(es.equals(MOSES_ENCODING_SCHEME)) { if(mf.fields.contains(MappingFormat.Field.STATUS_FIELD)) { assertEquals("13-5-P", c); if(mf.isStrict()) { assertEquals("4-9-S", v); assertEquals("2-2-S", ni); } else { assertEquals("4-9", v); assertEquals("2-2", ni); } } else { assertEquals("4-9", v); assertEquals("13-5", c); assertEquals("2-2", ni); } } else if(es.equals(NAACL_ENCODING_SCHEME)) { try { ni = mf.formatMapping(nullId); fail("able to format mapping with null id"); } catch(IllegalArgumentException expected) { // should happen } if(mf.fields.contains(MappingFormat.Field.STATUS_FIELD)) { if(mf.fields.contains(MappingFormat.Field.CONFIDENCE_FIELD)) { if(mf.isStrict()) { assertEquals("31 5 10 S 1.0", v); assertEquals("31 14 6 P 0.75", c); } else { assertEquals("31 5 10", v); assertEquals("31 14 6 P 0.75", c); } } else { assertEquals("31 14 6 P", c); if(mf.isStrict()) { assertEquals("31 5 10 S", v); } else { assertEquals("31 5 10", v); } } } else { assertEquals("31 5 10", v); assertEquals("31 14 6", c); } } } } @Test public void testParseMapping() { for(MappingFormat mf : formats) { EncodingScheme es = mf.encodingScheme; Mapping v, c, ni; if(es.equals(MOSES_ENCODING_SCHEME)) { try { mf.parseMapping("-1-5"); fail("able to parse mapping with negative index"); } catch(ParseException expected) { // should } if(mf.fields.contains(MappingFormat.Field.STATUS_FIELD)) { try { v = 
mf.parseMapping("4-9-S").copyWithPhraseNumber(chocolate.phraseNumber); assertEquals(vanilla, v); c = mf.parseMapping("13-5-P").copyWithPhraseNumber(chocolate.phraseNumber); c.setConfidence(chocolate.confidence); assertEquals(chocolate, c); ni = mf.parseMapping("2-2-S"); assertEquals(nullId, ni); if(mf.isStrict()) { try { mf.parseMapping("4-9"); fail("strict format able to parse loose input"); } catch(ParseException expected) { assertEquals(3, expected.getErrorOffset()); } } else { v = mf.parseMapping("4-9").copyWithPhraseNumber(chocolate.phraseNumber); assertEquals(vanilla, v); ni = mf.parseMapping("2-2"); assertEquals(nullId, ni); } } catch(ParseException p) { fail("parse exception: " + p.getMessage()); } } else { try { v = mf.parseMapping("4-9").copyWithPhraseNumber(chocolate.phraseNumber); assertEquals(vanilla, v); c = mf.parseMapping("13-5").copyWithPhraseNumber(chocolate.phraseNumber); c.setStatus(Status.POSSIBLE); c.setConfidence(chocolate.confidence); assertEquals(chocolate, c); ni = mf.parseMapping("2-2"); assertEquals(nullId, ni); } catch(ParseException p) { fail("parse exception: " + p.getMessage()); } } } else if(es.equals(NAACL_ENCODING_SCHEME)) { try { mf.parseMapping("31 0 6 S 1.0"); fail("able to parse mapping with 0 index, but index base is 1"); } catch(ParseException expected) { // should } if(mf.fields.contains(MappingFormat.Field.STATUS_FIELD)) { if(mf.fields.contains(MappingFormat.Field.CONFIDENCE_FIELD)) { try { if(mf.isStrict()) { v = mf.parseMapping("31 5 10 S 1.0"); assertEquals(vanilla, v); try { mf.parseMapping("31 5 10"); fail("able to parse loose input with strict format"); } catch(ParseException expected) { assertEquals(7, expected.getErrorOffset()); } } else { v = mf.parseMapping("31 5 10"); assertEquals(vanilla, v); } c = mf.parseMapping("31 14 6 P 0.75"); assertEquals(chocolate, c); try { ni = mf.parseMapping("3 3 S"); } catch(ParseException should) { // expected assertEquals(4, should.getErrorOffset()); } } catch(ParseException p) { fail("parse exception: " + p.getMessage()); } } else { try { if(mf.isStrict()) { v = mf.parseMapping("31 5 10 S"); assertEquals(vanilla, v); try { mf.parseMapping("31 5 10"); fail("able to parse loose input with strict format"); } catch(ParseException expected) { assertEquals(7, expected.getErrorOffset()); } } else { v = mf.parseMapping("31 5 10"); assertEquals(vanilla, v); } c = mf.parseMapping("31 14 6 P"); c.setConfidence(chocolate.confidence); assertEquals(chocolate, c); try { ni = mf.parseMapping("3 3 S"); } catch(ParseException should) { // expected assertEquals(4, should.getErrorOffset()); } } catch(ParseException p) { fail("parse exception: " + p.getMessage()); } } } else { try { v = mf.parseMapping("31 5 10"); assertEquals(vanilla, v); c = mf.parseMapping("31 14 6"); c.setStatus(Status.POSSIBLE); c.setConfidence(chocolate.confidence); assertEquals(chocolate, c); try { ni = mf.parseMapping("3 3 S"); fail("able to parse mapping without ID"); } catch(ParseException should) { // expected assertEquals(4, should.getErrorOffset()); } } catch(ParseException p) { fail("parse exception: " + p.getMessage()); } } } } } } ================================================ FILE: test/opennlp/ccg/alignment/MappingGroupTest.java ================================================ package opennlp.ccg.alignment; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotSame; import static org.junit.Assert.fail; import org.junit.Before; import org.junit.Test; public class MappingGroupTest { MappingGroup one, two; 
@Before public void setUp() throws Exception { one = new MappingGroup(37, 12); two = new MappingGroup(1, 8); } @Test public void testMappingGroup() { try { new MappingGroup(null, 1); fail("able to specify null number"); } catch(IllegalArgumentException expected) { // do nothing } try { new MappingGroup(37, -1); fail("able to specify negative length"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testEqualsObject() { assertNotSame(one, two); assertNotSame(two, null); assertEquals(one, new MappingGroup(one.phraseNumber, one.length)); } @Test public void testCompareTo() { assertEquals(0, one.compareTo(one)); assertEquals(1, one.compareTo(two)); } } ================================================ FILE: test/opennlp/ccg/alignment/MappingReaderWriterTest.java ================================================ package opennlp.ccg.alignment; import static opennlp.ccg.alignment.Status.POSSIBLE; import static opennlp.ccg.alignment.Status.SURE; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.fail; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Set; import org.junit.Before; import org.junit.Test; import static opennlp.ccg.alignment.Alignments.*; public class MappingReaderWriterTest { MappingReader mosesReader, naaclReader; MappingWriter mosesWriter, naaclWriter; StringWriter mosesStringWriter, naaclStringWriter; MappingFormat mosesFormat = MappingFormat.getInstance(MOSES_ENCODING_SCHEME), naaclFormat = MappingFormat.getInstance(NAACL_ENCODING_SCHEME, Alignments.NAACL_SHORT_FIELDS, false); Set mosesMappings = new LinkedHashSet(), naaclMappings = new LinkedHashSet(); Set mosesGroups = new LinkedHashSet(), naaclGroups = new LinkedHashSet(); String lineSep = System.getProperty("line.separator"); String mosesInput = "0-1-S 0-0 2-1-P 3-3 4-8 21-23\r\n3-4 34-55-P 1-4 23-1-S\n", mosesOutput = "0-1 0-0 2-1-P 3-3 4-8 21-23" + lineSep + "3-4 34-55-P 1-4 23-1", naaclInput = "17 1 1 S\r17 2 3 P\n17 5 5\r\n17 4 4 S 0.75\n37 3 2 P", naaclOutput = "17 1 1" + lineSep + "17 2 3 P" + lineSep + "17 5 5" + lineSep + "17 4 4" + lineSep + "37 3 2 P", emptyInput = "", lineEndingOnly = "\n"; @Before public void setUp() throws Exception { mosesReader = new MappingReader(new StringReader(mosesInput), mosesFormat); naaclReader = new MappingReader(new StringReader(naaclInput), naaclFormat); mosesStringWriter = new StringWriter(); mosesWriter = new MappingWriter(mosesStringWriter, mosesFormat); naaclStringWriter = new StringWriter(); naaclWriter = new MappingWriter(naaclStringWriter, naaclFormat); mosesMappings.add(new Mapping(0, 0, 1, SURE)); mosesMappings.add(new Mapping(0, 0, 0)); mosesMappings.add(new Mapping(0, 2, 1, POSSIBLE)); mosesMappings.add(new Mapping(0, 3, 3)); mosesMappings.add(new Mapping(0, 4, 8)); mosesMappings.add(new Mapping(0, 21, 23)); mosesMappings.add(new Mapping(1, 3, 4)); mosesMappings.add(new Mapping(1, 34, 55, POSSIBLE)); mosesMappings.add(new Mapping(1, 1, 4)); mosesMappings.add(new Mapping(1, 23, 1, SURE)); naaclMappings.add(new Mapping(17, 0, 0, SURE)); naaclMappings.add(new Mapping(17, 1, 2, POSSIBLE)); naaclMappings.add(new Mapping(17, 4, 4)); naaclMappings.add(new Mapping(17, 3, 3, SURE, new Double(0.75d))); naaclMappings.add(new Mapping(37, 2, 1, POSSIBLE)); mosesGroups.add(new MappingGroup(0, 6)); mosesGroups.add(new MappingGroup(1, 
4)); naaclGroups.add(new MappingGroup(17, 4)); naaclGroups.add(new MappingGroup(37, 1)); } @Test public void testConstructors() { try { new MappingReader(new StringReader(""), null); fail("able to specify null format"); } catch(IllegalArgumentException expected) { // do nothing } try { new MappingWriter(new StringWriter(), null); fail("able to specify null format"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testBoth() { Iterator mi = mosesMappings.iterator(); Iterator gi = mosesGroups.iterator(); StringWriter sw = new StringWriter(); MappingWriter mw = new MappingWriter(sw, mosesFormat); try { while(gi.hasNext()) { mw.startGroup(gi.next()); while(mw.canWrite() && mi.hasNext()) { mw.writeMapping(mi.next()); } } gi = mosesGroups.iterator(); mi = mosesMappings.iterator(); MappingReader mr = new MappingReader(new StringReader(sw.getBuffer().toString()), mosesFormat); while(gi.hasNext()) { MappingGroup g = gi.next(); assertEquals(g, mr.nextGroup()); while(mr.canRead()) { assertEquals(mi.next(), mr.readMapping()); } } mi = naaclMappings.iterator(); gi = naaclGroups.iterator(); sw = new StringWriter(); mw.close(); mw = new MappingWriter(sw, naaclFormat); while(gi.hasNext()) { mw.startGroup(gi.next()); while(mw.canWrite() && mi.hasNext()) { mw.writeMapping(mi.next()); } } mi = naaclMappings.iterator(); gi = naaclGroups.iterator(); mr.close(); mr = new MappingReader(new StringReader(sw.getBuffer().toString()), naaclFormat); while(gi.hasNext()) { MappingGroup g = gi.next(); assertEquals(g, mr.nextGroup()); while(mr.canRead()) { Mapping m = mi.next(); m.setConfidence(Alignments.DEFAULT_CONFIDENCE); assertEquals(m, mr.readMapping()); } } mosesWriter.close(); naaclWriter.close(); mr.close(); mw.close(); } catch(IOException io) { fail(io.getMessage()); } } @Test public void testMappingWriter() { Iterator mi = mosesMappings.iterator(); Iterator gi = mosesGroups.iterator(); try { while(gi.hasNext()) { mosesWriter.startGroup(gi.next()); while(mosesWriter.canWrite()) { mosesWriter.writeMapping(mi.next()); } } mosesWriter.close(); assertEquals(mosesOutput, mosesStringWriter.getBuffer().toString()); mi = naaclMappings.iterator(); gi = naaclGroups.iterator(); while(gi.hasNext()) { naaclWriter.startGroup(gi.next()); while(naaclWriter.canWrite()) { naaclWriter.writeMapping(mi.next()); } } naaclWriter.close(); assertEquals(naaclOutput, naaclStringWriter.getBuffer().toString()); } catch(IOException io) { fail(io.getMessage()); } MappingWriter mw = new MappingWriter(new StringWriter(), naaclFormat); try { mw.writeMapping(new Mapping(1, 0)); fail("able to write mapping without starting group"); } catch(IOException expected) { //should happen } mw = new MappingWriter(new StringWriter(), mosesFormat); try { mw.startGroup(new MappingGroup(0, 1)); try { mw.writeMapping(new Mapping(1, 1, 1)); fail("able to write mapping from different group"); } catch(IOException expected) { //should happen } } catch(IOException io) { fail("problem testing: " + io.getMessage()); } mw = new MappingWriter(new StringWriter(), naaclFormat); try { mw.startGroup(new MappingGroup(0, 1)); mw.writeMapping(new Mapping(0, 1, 1)); try { mw.writeMapping(new Mapping(0, 1, 2)); fail("able to write too many mappings"); } catch(IOException expected) { //should happen } } catch(IOException io) { fail("problem testing: " + io.getMessage()); } mw = new MappingWriter(new StringWriter(), mosesFormat); try { mw.startGroup(new MappingGroup(0, 2)); mw.writeMapping(new Mapping(0, 1, 1)); try { mw.close(); fail("able to write 
too few mappings"); } catch(IOException expected) { //should happen } } catch(IOException io) { fail("problem testing: " + io.getMessage()); } StringWriter sw = new StringWriter(); mw = new MappingWriter(sw, naaclFormat); try { mw.startGroup(new MappingGroup(0, 0)); mw.endGroup(); mw.close(); assertEquals("", sw.getBuffer().toString()); } catch(IOException io) { fail("problem testing: " + io.getMessage()); } } @Test public void testMappingReader() { Iterator mi = mosesMappings.iterator(); Iterator gi = mosesGroups.iterator(); try { MappingGroup g; while((g = mosesReader.nextGroup()) != null) { assertEquals(gi.next(), g); while(mosesReader.canRead()) { assertEquals(mi.next(), mosesReader.readMapping()); } } mosesReader.close(); mi = naaclMappings.iterator(); gi = naaclGroups.iterator(); while((g = naaclReader.nextGroup()) != null) { assertEquals(gi.next(), g); while(naaclReader.canRead()) { assertEquals(mi.next(), naaclReader.readMapping()); } } naaclReader.close(); mosesReader = new MappingReader(new StringReader(emptyInput), mosesFormat); naaclReader = new MappingReader(new StringReader(emptyInput), naaclFormat); try { assertFalse(mosesReader.ready()); assertFalse(mosesReader.canRead()); assertNull(mosesReader.nextGroup()); } catch(IOException io) { fail("problem testing: " + io.getMessage()); } try { assertFalse(naaclReader.ready()); assertFalse(naaclReader.canRead()); assertNull(naaclReader.nextGroup()); } catch(IOException expected) { // should happen } mosesReader = new MappingReader(new StringReader(lineEndingOnly), mosesFormat); naaclReader = new MappingReader(new StringReader(lineEndingOnly), naaclFormat); try { assertFalse(mosesReader.ready()); assertFalse(mosesReader.canRead()); assertNull(mosesReader.nextGroup()); } catch(IOException io) { fail("problem testing: " + io.getMessage()); } try { assertFalse(naaclReader.ready()); assertFalse(naaclReader.canRead()); assertNull(naaclReader.nextGroup()); } catch(IOException io) { fail("problem testing: " + io.getMessage()); } mosesReader = new MappingReader(new StringReader("5-4-"), mosesFormat); naaclReader = new MappingReader(new StringReader("0 S\n"), naaclFormat); try { mosesReader.nextGroup(); fail("able to get next group from garbage input"); } catch(IOException expected) { // should happen } try { naaclReader.nextGroup(); fail("able to get next group from garbage input"); } catch(IOException expected) { // should happen } mosesReader = new MappingReader(new StringReader(mosesInput), mosesFormat); naaclReader = new MappingReader(new StringReader(naaclInput), naaclFormat); try { mosesReader.readMapping(); fail("able to read mapping without group"); } catch(IOException expected) { // should happen } try { naaclReader.readMapping(); fail("able to read mapping without group"); } catch(IOException expected) { // should happen } mosesReader = new MappingReader(new StringReader(mosesInput), mosesFormat); naaclReader = new MappingReader(new StringReader(naaclInput), naaclFormat); MappingGroup mg = mosesReader.nextGroup(); for(int i = 0; i < mg.length - 1; i++) { mosesReader.readMapping(); } try { mosesReader.close(); fail("able to read too few mappings"); } catch(IOException expected) { // should happen } mg = naaclReader.nextGroup(); for(int i = 0; i < mg.length - 1; i++) { naaclReader.readMapping(); } try { naaclReader.close(); fail("able to read too few mappings"); } catch(IOException expected) { // should happen } } catch(IOException io) { fail(io.getMessage()); } } } ================================================ FILE: 
test/opennlp/ccg/alignment/MappingTest.java ================================================ package opennlp.ccg.alignment; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotSame; import static org.junit.Assert.fail; import org.junit.Before; import org.junit.Test; public class MappingTest { Integer id, first, second; Mapping mapping; @Before public void setUp() throws Exception { id = new Integer(37); first = new Integer(4); second = new Integer(7); mapping = new Mapping(id, first, second); } @Test public void testMapping() { try { new Mapping(null, 3); fail("able to specify null index"); } catch(IllegalArgumentException expected) { // do nothing } try { new Mapping(3, null); fail("able to specify null index"); } catch(IllegalArgumentException expected) { // do nothing } try { new Mapping(1, 2, 3, null); fail("able to specify null status"); } catch(IllegalArgumentException expected) { // do nothing } try { new Mapping(1, 2, 3, Status.SURE, null); fail("able to specify null confidence"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testGet() { assertEquals(id, mapping.getPhraseNumber()); assertEquals(first, mapping.getA()); assertEquals(first, mapping.get(PhrasePosition.A)); assertEquals(second, mapping.getB()); assertEquals(second, mapping.get(PhrasePosition.B)); } @Test public void testCompareTo() { Mapping m = new Mapping(id, first, second), n = new Mapping(id, first - 1, second + 1), o = new Mapping(id, first, second - 1), p = new Mapping(id, first + 1, second); assertEquals(0, mapping.compareTo(m)); assertEquals(0, m.compareTo(mapping)); assertEquals(1, mapping.compareTo(n)); assertEquals(-1, n.compareTo(mapping)); assertEquals(1, mapping.compareTo(o)); assertEquals(-1, o.compareTo(mapping)); assertEquals(-1, mapping.compareTo(p)); assertEquals(1, p.compareTo(mapping)); } @Test public void testEqualsObject() { Mapping m = new Mapping(id, first, second), n = new Mapping(id, first - 1, second + 1), o = new Mapping(id, first, second - 1), p = new Mapping(id, first + 1, second); assertEquals(mapping, m); assertNotSame(mapping, n); assertNotSame(mapping, o); assertNotSame(mapping, p); } @Test public void testReverse() { assertEquals(new Mapping(id, second, first), mapping.reverse()); } @Test public void testSet() { assertEquals(Alignments.DEFAULT_STATUS, mapping.getStatus()); mapping.setStatus(Status.POSSIBLE); assertEquals(Status.POSSIBLE, mapping.getStatus()); assertEquals(Alignments.DEFAULT_CONFIDENCE, mapping.getConfidence()); mapping.setConfidence(0.5); assertEquals(Double.valueOf(0.5d), mapping.getConfidence()); } @Test public void testHashCode() { int hash = mapping.hashCode(); mapping.setStatus(Status.POSSIBLE); assertEquals(hash, mapping.hashCode()); mapping.setConfidence(0.5); assertEquals(hash, mapping.hashCode()); } } ================================================ FILE: test/opennlp/ccg/alignment/PhraseReaderWriterTest.java ================================================ package opennlp.ccg.alignment; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.fail; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.junit.Before; import org.junit.Test; public class PhraseReaderWriterTest { String lineSep = System.getProperty("line.separator"); String input = "Phrase one.\nPhrase two\r\nPhrase three .", output = "Phrase one." 
+ lineSep + "Phrase two" + lineSep + "Phrase three ." + lineSep; PhraseReader reader; PhraseWriter writer; List phrases; @Before public void setUp() throws Exception { phrases = new ArrayList(); phrases.add(new Phrase(0, Alignments.tokenize("Phrase one."))); phrases.add(new Phrase(1, Alignments.tokenize("Phrase two"))); phrases.add(new Phrase(2, Alignments.tokenize("Phrase three ."))); } @Test public void testConstructors() { try { new PhraseReader(new StringReader(""), null); fail("able to specify null number base"); } catch(IllegalArgumentException expected) { // do nothing } try { new PhraseWriter(new StringWriter(), null); fail("able to specify null number base"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testBoth() { StringWriter sw = new StringWriter(); writer = new PhraseWriter(sw); try { for(Phrase p : phrases) { writer.writePhrase(p); } } catch(IOException io) { fail(io.getMessage()); } reader = new PhraseReader(new StringReader(sw.getBuffer().toString())); Iterator i = phrases.iterator(); try { Phrase p; while((p = reader.readPhrase()) != null) { assertEquals(i.next(), p); } } catch(IOException io) { fail(io.getMessage()); } } @Test public void testReadPhrase() { reader = new PhraseReader(new StringReader(input)); try { Iterator i = phrases.iterator(); Phrase p; while((p = reader.readPhrase()) != null) { assertEquals(i.next(), p); } reader = new PhraseReader(new StringReader(""), reader.getNumberBase()); assertNull(reader.readPhrase()); } catch(IOException io) { fail(io.getMessage()); } } @Test public void testWritePhrase() { StringWriter sw = new StringWriter(); writer = new PhraseWriter(sw); try { for(Phrase p : phrases) { writer.writePhrase(p); } assertEquals(output, sw.getBuffer().toString()); } catch(IOException io) { fail(io.getMessage()); } } } ================================================ FILE: test/opennlp/ccg/alignment/PhraseTest.java ================================================ package opennlp.ccg.alignment; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotSame; import static org.junit.Assert.fail; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.junit.Before; import org.junit.Test; public class PhraseTest { Phrase phrase; String[] wordList = Alignments.tokenize("This is a test ."); @Before public void setUp() throws Exception { phrase = new Phrase(37, wordList); } @Test public void testCompare() { Phrase o = new Phrase(43, phrase), t = new Phrase(43, phrase); assertEquals(-1, phrase.compareTo(o)); assertEquals(1, t.compareTo(phrase)); assertEquals(0, o.compareTo(t)); assertEquals(0, phrase.compareTo(phrase)); } @Test public void testSize() { assertEquals(5, phrase.size()); } @Test public void testPhraseComparableOfIListOfString() { assertEquals(phrase, new Phrase(phrase.getNumber(), wordList)); String[] str = null; try { new Phrase(phrase.getNumber(), str); fail("able to create phrase with null word list"); } catch(IllegalArgumentException ex) { // expected } str = new String[]{"blah", null, "blah"}; try { new Phrase(phrase.getNumber(), str); fail("able to create phrase with null word in list"); } catch(IllegalArgumentException ex) { // expected } } @Test public void testGetNumber() { assertEquals(new Integer(37), phrase.getNumber()); } @Test public void testGetInt() { assertEquals("is", phrase.get(1)); assertEquals("a", phrase.get(2)); assertEquals(".", phrase.get(4)); try { phrase.get(phrase.size()); fail("able to access word in phrase after 
end"); } catch(IndexOutOfBoundsException expected) { // do nothing } } @Test public void testSetIntString() { try { phrase.set(2, "sdfskjdlkjflksjdlkj"); fail("able to set"); } catch(UnsupportedOperationException expected) { // noop } } @Test public void testAdd() { try { phrase.add("blah"); fail("able to add"); } catch(UnsupportedOperationException expected) { // noop } } public void testRemove() { try { phrase.remove("is"); fail("able to remove"); } catch(UnsupportedOperationException expected) { // noop } } public void testIteratorRemove() { try { Iterator i = phrase.iterator(); i.next(); i.remove(); fail("able to remove via iterator"); } catch(UnsupportedOperationException expected) { // noop } } @Test public void testEqualsObject() { List l = new ArrayList(); for(int i = 0; i < 3; i++) { l.add("blah"); } Phrase same = new Phrase(phrase.getNumber(), phrase), diff = new Phrase(17, l); assertEquals(phrase, same); assertNotSame(phrase, diff); if(!phrase.equals(same)) { fail("not equal"); } if(phrase.equals(diff)) { fail("equal"); } } } ================================================ FILE: test/opennlp/ccg/disjunctivizer/AlignedEdgeFilterTest.java ================================================ package opennlp.ccg.disjunctivizer; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.util.Collections; import java.util.HashSet; import java.util.Set; import opennlp.ccg.disjunctivizer.AlignedEdgeFilter; import opennlp.ccg.disjunctivizer.MatchType; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFBaseTest; import opennlp.ccg.hylo.graph.LFVertex; import org.junit.Before; import org.junit.Test; public class AlignedEdgeFilterTest extends LFBaseTest { AlignedEdgeFilter filter; Set indices; @Before public void setUp() throws Exception { super.setUp(); indices = new HashSet(); indices.add(0); indices.add(2); indices.add(3); filter = new AlignedEdgeFilter(indices, MatchType.SOURCE_ALIGNED, MatchType.TARGET_UNALIGNED); } @Test public void testAlignedEdgeFilter() { try { new AlignedEdgeFilter(null, MatchType.SOURCE_ALIGNED); fail("able to specify null alignment indices"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testGetAlignmentIndices() { assertEquals(indices, filter.getAlignmentIndices()); } @SuppressWarnings("unchecked") @Test public void testSetAlignmentIndices() { filter.setAlignmentIndices(Collections.EMPTY_SET); assertTrue(filter.getAlignmentIndices().isEmpty()); try { filter.setAlignmentIndices(null); fail("able to specify null alignment indices"); } catch(IllegalArgumentException expected) { // do nothing } } @SuppressWarnings("unchecked") @Test public void testAllows() { LFEdge one = new LFEdge(new LFVertex(new NominalAtom("w0"), new Proposition("blah")), new LFVertex(new NominalAtom("w1"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("blase"))), two = new LFEdge(new LFVertex(new NominalAtom("w2"), new Proposition("blah")), new LFVertex(new NominalAtom("w3"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("blase"))); assertTrue(filter.allows(one)); assertFalse(filter.allows(two)); filter.setAlignmentIndices(Collections.EMPTY_SET); assertFalse(filter.allows(one)); assertFalse(filter.allows(two)); // make self-contradictory filter 
filter = new AlignedEdgeFilter(indices, MatchType.SOURCE_ALIGNED, MatchType.SOURCE_UNALIGNED); assertFalse(filter.allows(one)); assertFalse(filter.allows(two)); } } ================================================ FILE: test/opennlp/ccg/disjunctivizer/DisjunctivizerTest.java ================================================ package opennlp.ccg.disjunctivizer; import static org.junit.Assert.*; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.util.LinkedHashSet; import java.util.Properties; import java.util.Set; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import opennlp.ccg.alignment.Alignment; import opennlp.ccg.alignment.Alignments; import opennlp.ccg.alignment.Mapping; import opennlp.ccg.alignment.Phrase; import opennlp.ccg.alignment.PhrasePosition; import opennlp.ccg.alignment.Status; import opennlp.ccg.hylo.graph.LFGraphFactory; import opennlp.ccg.hylo.graph.LFBaseTest; import org.apache.xml.serializer.OutputPropertiesFactory; import org.apache.xml.serializer.Serializer; import org.apache.xml.serializer.SerializerFactory; import org.jdom.input.DOMBuilder; import org.junit.Before; import org.junit.Test; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.NodeList; public class DisjunctivizerTest extends LFBaseTest { static Properties OUTPUT_PROPERTIES = OutputPropertiesFactory.getDefaultMethodProperties("xml"); static { OUTPUT_PROPERTIES.setProperty("indent", "yes"); OUTPUT_PROPERTIES.setProperty("media-type", "text/xml"); OUTPUT_PROPERTIES.setProperty(OutputPropertiesFactory.S_KEY_INDENT_AMOUNT, "2"); OUTPUT_PROPERTIES.setProperty("{http\u003a//xml.apache.org/xalan}indent-amount", "2"); } DocumentBuilder documentBuilder; DOMBuilder domBuilder; File alignmentsFile, paraphrasesFile, outputFile; @Before public void setUp() throws Exception { super.setUp(); try { documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); } catch(ParserConfigurationException e) { throw new Exception("problem with parser configuration: " + e.getLocalizedMessage(), e); } domBuilder = new DOMBuilder(); File testDir = new File(System.getProperty("user.dir"), "test"); paraphrasesFile = new File(testDir, "paraphrases.xml"); outputFile = new File(testDir, "output.xml"); } @Test public void testDisjunctivizer() { try { new Disjunctivizer(null); fail("able to create disjunctivizer with null document"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testBuildDisjunctiveLF() throws Exception { Document paraphrases = documentBuilder.parse(paraphrasesFile); Serializer s = SerializerFactory.getSerializer(OUTPUT_PROPERTIES); s.setOutputFormat(OUTPUT_PROPERTIES); s.setWriter(new BufferedWriter(new FileWriter(outputFile))); Disjunctivizer disj = null; NodeList paras = paraphrases.getElementsByTagName("paraphrase"); Document out = documentBuilder.newDocument(); Element dlfsElement = out.createElement("dlfs"); out.appendChild(dlfsElement); for(int i = 0; i < paras.getLength(); i++) { Element para = (Element)paras.item(i); Integer id = Integer.parseInt(para.getAttribute("id")); Element first = (Element)para.getElementsByTagName("first").item(0), second = (Element)para.getElementsByTagName("second").item(0); Set ms = new LinkedHashSet(); NodeList als = para.getElementsByTagName("alignments"); for(int j = 0; j < als.getLength(); j++) { Element al = (Element)als.item(j); 
if(al.getAttribute("source").equals("ANNOTATOR")) { NodeList as = al.getElementsByTagName("alignment"); for(int k = 0; k < as.getLength(); k++) { Element a = (Element)as.item(k); ms.add(new Mapping(id, Integer.parseInt(a.getAttribute("first")), Integer.parseInt(a.getAttribute("second")), Status.forAbbreviation(a.getAttribute("status")))); } } } Alignment a = new Alignment(new Phrase(id, Alignments.tokenize(first.getElementsByTagName("string").item(0).getTextContent())), new Phrase(id, Alignments.tokenize(second.getElementsByTagName("string").item(0).getTextContent())), ms); Element firstLF = (Element)first.getElementsByTagName("lf").item(0), secondLF = (Element)second.getElementsByTagName("lf").item(0); LFGraphDifference diff = (firstLF != null && secondLF != null) ? new LFGraphDifference(LFGraphFactory.newGraphFrom(firstLF), LFGraphFactory.newGraphFrom(secondLF), a) : null; for(PhrasePosition pos : PhrasePosition.values()) { Element str = out.createElement("string"); str.setAttribute("number", Integer.toString(id)); str.setAttribute("position", pos.name()); str.setTextContent(Alignments.untokenize(a.get(pos))); dlfsElement.appendChild(str); } Element msEl = out.createElement("mappings"); msEl.appendChild(out.createCDATASection(ms.toString())); dlfsElement.appendChild(msEl); if(diff == null) { dlfsElement.appendChild(out.createComment("missing LF!")); } else { if(disj == null) { disj = new Disjunctivizer(out); } Element dlf = disj.buildDisjunctiveLFFor(diff); dlfsElement.appendChild(dlf); assertEquals(dlf, disj.buildDisjunctiveLFFor(diff)); dlfsElement.appendChild(disj.buildDisjunctiveLFFor(diff.reverse())); assertNotSame(dlf, disj.buildDisjunctiveLFFor(diff.reverse())); } } s.asDOMSerializer().serialize(out); } } ================================================ FILE: test/opennlp/ccg/disjunctivizer/EdgeMatchFilterTest.java ================================================ package opennlp.ccg.disjunctivizer; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import opennlp.ccg.disjunctivizer.EdgeMatchFilter; import opennlp.ccg.disjunctivizer.MatchType; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFBaseTest; import opennlp.ccg.hylo.graph.LFVertex; import org.junit.Before; import org.junit.Test; public class EdgeMatchFilterTest extends LFBaseTest { EdgeMatchFilter filter; LFEdge edge; @Before public void setUp() throws Exception { super.setUp(); edge = new LFEdge(new LFVertex(new NominalAtom("w0"), new Proposition("blah")), new LFVertex(new NominalAtom("w1"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("blase"))); filter = new EdgeMatchFilter(edge, MatchType.LABEL_MISMATCH, MatchType.SOURCE_PREDICATE_MISMATCH, MatchType.TARGET_PREDICATE_MATCH); } @Test public void testEdgeMatchFilter() { try { new EdgeMatchFilter(null, MatchType.LABEL_MATCH); fail("able to specify null edge"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testAllows() { LFEdge test = new LFEdge(new LFVertex(new NominalAtom("w0"), new Proposition("blah blah")), new LFVertex(new NominalAtom("w1"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("boring"))); assertTrue(filter.allows(test)); test = new LFEdge(test.getSource(), test.getTarget(), edge.getLabel()); assertFalse(filter.allows(test)); } } 
================================================ FILE: test/opennlp/ccg/disjunctivizer/FilteredLFEdgeSetTest.java ================================================ package opennlp.ccg.disjunctivizer; import static org.junit.Assert.*; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.Set; import opennlp.ccg.disjunctivizer.AlignedEdgeFilter; import opennlp.ccg.disjunctivizer.EdgeMatchFilter; import opennlp.ccg.disjunctivizer.FilteredLFEdgeSet; import opennlp.ccg.disjunctivizer.MatchType; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFBaseTest; import opennlp.ccg.hylo.graph.LFVertex; import opennlp.ccg.util.CompositeFilter; import opennlp.ccg.util.Filter; import org.junit.Before; import org.junit.Test; public class FilteredLFEdgeSetTest extends LFBaseTest { FilteredLFEdgeSet set; Set edges; Filter edgeFilter; LFEdge one, two, three; @SuppressWarnings("unchecked") @Before public void setUp() throws Exception { super.setUp(); one = new LFEdge(new LFVertex(new NominalAtom("w0"), new Proposition("blah")), new LFVertex(new NominalAtom("w1"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("blase"))); two = new LFEdge(new LFVertex(new NominalAtom("w2"), new Proposition("blah")), new LFVertex(new NominalAtom("w3"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("bored"))); three = new LFEdge(new LFVertex(new NominalAtom("w0"), new Proposition("zzz")), new LFVertex(new NominalAtom("w2"), new Proposition("snooze")), new LFEdgeLabel(new ModeLabel("blase"))); edges = new LinkedHashSet(); edges.add(one); edges.add(two); edges.add(three); Set indices = new HashSet(); indices.add(0); indices.add(2); indices.add(3); edgeFilter = new CompositeFilter(new EdgeMatchFilter(one, MatchType.SOURCE_PREDICATE_MATCH, MatchType.LABEL_MISMATCH), new AlignedEdgeFilter(indices, MatchType.TARGET_ALIGNED, MatchType.SOURCE_ALIGNED)); set = new FilteredLFEdgeSet(edges, edgeFilter); } @Test public void testSourceView() { assertTrue(set.sourceView().contains(two.getSource())); assertEquals(1, set.sourceView().size()); } @Test public void testTargetView() { assertTrue(set.targetView().contains(two.getTarget())); assertEquals(1, set.targetView().size()); } @Test public void testLabelView() { assertTrue(set.labelView().contains(two.getLabel())); assertEquals(1, set.labelView().size()); } } ================================================ FILE: test/opennlp/ccg/disjunctivizer/LFGraphDifferenceTest.java ================================================ package opennlp.ccg.disjunctivizer; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.util.Collections; import java.util.Iterator; import java.util.Map; import java.util.Set; import opennlp.ccg.alignment.Alignment; import opennlp.ccg.alignment.Mapping; import opennlp.ccg.alignment.Phrase; import opennlp.ccg.alignment.PhrasePosition; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFGraph; import opennlp.ccg.hylo.graph.LFBaseTest; import opennlp.ccg.hylo.graph.LFVertex; import org.junit.Before; import org.junit.Test; public class LFGraphDifferenceTest extends 
LFBaseTest { LFGraph aGraph, bGraph; Alignment alignment; LFGraphDifference diff; LFEdge aDet, aArg0, aMod, bArg0; @SuppressWarnings("unchecked") @Before public void setUp() throws Exception { super.setUp(); alignment = new Alignment(new Phrase(337, "A", "boy", "walks", "quickly"), new Phrase(337, "He", "moves"), Collections.EMPTY_SET); alignment.add(new Mapping(0, 0)); alignment.add(new Mapping(1, 0)); alignment.add(new Mapping(2, 1)); aGraph = new LFGraph(); LFVertex aw0 = new LFVertex(new NominalAtom("w0"), new Proposition("a")), aw1 = new LFVertex(new NominalAtom("w1"), new Proposition("boy")), aw2 = new LFVertex(new NominalAtom("w2"), new Proposition("walk")), aw3 = new LFVertex(new NominalAtom("w3"), new Proposition("quickly")); aGraph.addVertex(aw0); aGraph.addVertex(aw1); aGraph.addVertex(aw2); aGraph.addVertex(aw3); aDet = aGraph.addLabeledEdge(aw1, aw0, LFEdgeLabel.forMode(new ModeLabel("Det"))); aArg0 = aGraph.addLabeledEdge(aw2, aw1, LFEdgeLabel.forMode(new ModeLabel("Arg0"))); aMod = aGraph.addLabeledEdge(aw2, aw3, LFEdgeLabel.forMode(new ModeLabel("Mod"))); bGraph = new LFGraph(); LFVertex bw0 = new LFVertex(new NominalAtom("w0"), new Proposition("he")), bw1 = new LFVertex(new NominalAtom("w1"), new Proposition("move")); bGraph.addVertex(bw0); bGraph.addVertex(bw1); bArg0 = bGraph.addLabeledEdge(bw1, bw0, LFEdgeLabel.forMode(new ModeLabel("Arg0"))); diff = new LFGraphDifference(aGraph, bGraph, alignment); } @Test public void testLFGraphDifference() { try { new LFGraphDifference(null, bGraph, alignment); fail("able to create LF graph difference with null graph"); } catch(IllegalArgumentException expected) { // do nothing } try { new LFGraphDifference(aGraph, null, alignment); fail("able to create LF graph difference with null graph"); } catch(IllegalArgumentException expected) { // do nothing } try { new LFGraphDifference(aGraph, bGraph, null); fail("able to create LF graph difference with null graph"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testReverse() { LFGraphDifference ffid = diff.reverse(); assertEquals(diff.a.vertexSet(), ffid.b.vertexSet()); assertEquals(diff.a.edgeSet(), ffid.b.edgeSet()); for(PhrasePosition pos : PhrasePosition.values()) { assertEquals(diff.alignment.get(pos), ffid.alignment.get(pos.opposite())); } Map> m = ffid.alignment.asMap(); assertTrue(m.get(0).contains(0)); assertTrue(m.get(0).contains(1)); assertTrue(m.get(1).contains(2)); } @Test public void testDeletes() { Set dels = diff.deletes(); try { dels.add(bArg0); fail("able to add edge"); } catch(UnsupportedOperationException expected) { // noop } try { dels.remove(bArg0); fail("able to remove edge"); } catch(UnsupportedOperationException expected) { // noop } try { Iterator i = dels.iterator(); i.next(); i.remove(); fail("able to remove edge"); } catch(UnsupportedOperationException expected) { // noop } assertEquals(Collections.singleton(aMod), diff.deletes()); assertEquals(Collections.emptySet(), diff.reverse().deletes()); } @Test public void testInserts() { Set ins = diff.inserts(); try { ins.add(bArg0); fail("able to add edge"); } catch(UnsupportedOperationException expected) { // noop } try { ins.remove(bArg0); fail("able to remove edge"); } catch(UnsupportedOperationException expected) { // noop } assertEquals(Collections.emptySet(), diff.inserts()); assertEquals(Collections.singleton(aMod), diff.reverse().inserts()); } @Test public void testSubstitutions() { Set subs = diff.substitutions(); try { subs.add(bArg0); fail("able to add edge"); } 
catch(UnsupportedOperationException expected) { // noop } try { subs.remove(bArg0); fail("able to remove edge"); } catch(UnsupportedOperationException expected) { // noop } try { Iterator i = subs.iterator(); i.next(); i.remove(); fail("able to remove edge"); } catch(UnsupportedOperationException expected) { // noop } assertTrue(subs.contains(bArg0)); assertFalse(subs.contains(aArg0)); assertFalse(subs.contains(aDet)); assertFalse(subs.contains(aMod)); assertEquals(Collections.singleton(bArg0), diff.substitutionsFor(aArg0)); assertEquals(Collections.singleton(aArg0), diff.reverse().substitutionsFor(bArg0)); } @Test public void testSubstitutionsBySource() { Map> map = diff.substitutionsBySource(); assertTrue(map.keySet().contains(bArg0.getSource())); assertTrue(map.get(bArg0.getSource()).contains(bArg0)); assertEquals(1, map.size()); try { map.remove(bArg0.getSource()); fail("able to remove edge"); } catch(UnsupportedOperationException expected) { // noop } try { map.put(aArg0.getSource(), Collections.singleton(aArg0)); fail("able to put edge"); } catch(UnsupportedOperationException expected) { // noop } try { Iterator>> i = map.entrySet().iterator(); i.next(); i.remove(); fail("able to remove entry"); } catch(UnsupportedOperationException expected) { // noop } } @Test public void testSubstitutionsBySourceFor() { Map> map = diff.substitutionsBySourceFor(aArg0); assertTrue(map.keySet().contains(bArg0.getSource())); assertTrue(map.get(bArg0.getSource()).contains(bArg0)); assertEquals(1, map.size()); try { map.remove(bArg0.getSource()); fail("able to remove edge"); } catch(UnsupportedOperationException expected) { // noop } try { map.put(aArg0.getSource(), Collections.singleton(aArg0)); fail("able to put edge"); } catch(UnsupportedOperationException expected) { // noop } try { Iterator>> i = map.entrySet().iterator(); i.next(); i.remove(); fail("able to remove entry"); } catch(UnsupportedOperationException expected) { // noop } map = diff.substitutionsBySourceFor(aDet); assertTrue(map.isEmpty()); } } ================================================ FILE: test/opennlp/ccg/disjunctivizer/LabelMatchFilterTest.java ================================================ package opennlp.ccg.disjunctivizer; import static org.junit.Assert.*; import opennlp.ccg.disjunctivizer.LabelMatchFilter; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFBaseTest; import opennlp.ccg.hylo.graph.LFVertex; import org.junit.Before; import org.junit.Test; public class LabelMatchFilterTest extends LFBaseTest { LabelMatchFilter filter; LFEdgeLabel label; LFEdge one, two; @Before public void setUp() throws Exception { super.setUp(); one = new LFEdge(new LFVertex(new NominalAtom("w0"), new Proposition("blah")), new LFVertex(new NominalAtom("w1"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("blase"))); two = new LFEdge(new LFVertex(new NominalAtom("w2"), new Proposition("blah")), new LFVertex(new NominalAtom("w3"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("blurg"))); label = new LFEdgeLabel(new ModeLabel("blase")); filter = new LabelMatchFilter(label); } @Test public void testLabelMatchFilter() { try { new LabelMatchFilter(null); fail("able to specify null label"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testSetBasis() { filter.setBasis(new LFEdgeLabel(new ModeLabel("boo"))); 
assertFalse(filter.allows(one)); assertFalse(filter.allows(two)); try { filter.setBasis(null); fail("able to specify null label"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testAllows() { assertTrue(filter.allows(one)); assertFalse(filter.allows(two)); } } ================================================ FILE: test/opennlp/ccg/disjunctivizer/VertexMatchFilterTest.java ================================================ package opennlp.ccg.disjunctivizer; import static org.junit.Assert.*; import opennlp.ccg.disjunctivizer.MatchType; import opennlp.ccg.disjunctivizer.VertexMatchFilter; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.hylo.graph.LFEdge; import opennlp.ccg.hylo.graph.LFEdgeLabel; import opennlp.ccg.hylo.graph.LFBaseTest; import opennlp.ccg.hylo.graph.LFVertex; import org.junit.Before; import org.junit.Test; public class VertexMatchFilterTest extends LFBaseTest { VertexMatchFilter filter; LFVertex one, two; LFEdge edge; @Before public void setUp() throws Exception { super.setUp(); one = new LFVertex(new NominalAtom("w0"), new Proposition("blah")); two = new LFVertex(new NominalAtom("w1"), new Proposition("blah")); edge = new LFEdge(two, new LFVertex(new NominalAtom("w2"), new Proposition("blah blah")), new LFEdgeLabel(new ModeLabel("blase"))); filter = new VertexMatchFilter(two, MatchType.SOURCE_MATCH); } @Test public void testVertexMatchFilter() { try { new VertexMatchFilter(null, MatchType.SOURCE_PREDICATE_MATCH); fail("able to specify null basis"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testSetBasis() { filter.setBasis(one); assertFalse(filter.allows(edge)); edge = new LFEdge(one, edge.getTarget(), edge.getLabel()); assertTrue(filter.allows(edge)); try { filter.setBasis(null); fail("able to specify null basis"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testAllows() { assertTrue(filter.allows(edge)); edge = new LFEdge(one, edge.getTarget(), edge.getLabel()); assertFalse(filter.allows(edge)); } } ================================================ FILE: test/opennlp/ccg/hylo/graph/LFBaseTest.java ================================================ package opennlp.ccg.hylo.graph; import java.io.File; import opennlp.ccg.grammar.Grammar; import org.junit.Before; import org.junit.Test; public class LFBaseTest { static Grammar grammar = null; @Before @SuppressWarnings("deprecation") public void setUp() throws Exception { if(grammar == null) { grammar = new Grammar(new File(new File( new File(System.getProperty("user.dir")), "test"), "grammar.xml").toURL()); } } @Test public void dummy() {} } ================================================ FILE: test/opennlp/ccg/hylo/graph/LFEdgeFactoryTest.java ================================================ package opennlp.ccg.hylo.graph; import static org.junit.Assert.*; import org.junit.Before; import org.junit.Test; public class LFEdgeFactoryTest extends LFEdgeTest { LFEdgeFactory factory; @Before public void setUp() throws Exception { super.setUp(); factory = new DefaultLFEdgeFactory(); } @Test public void testCreateEdge() { assertNotSame(edge, factory.createEdge(edge.source, edge.target)); edge = new LFEdge(edge.source, edge.target, null); assertEquals(edge, factory.createLabeledEdge(edge.source, edge.target, null)); } @Test public void testCreateLabeledEdge() { assertEquals(edge, factory.createLabeledEdge(edge.source, edge.target, edge.label)); } } 
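A second minimal, hypothetical sketch (again not a file in the repository; the class name LFGraphSketch is invented) shows how an LF graph is assembled by hand, mirroring the graphs built in LFGraphDifferenceTest above and LFGraphTest below, and using only the calls those tests make; as in LFBaseTest, a grammar is assumed to be loaded first.

import opennlp.ccg.hylo.ModeLabel;
import opennlp.ccg.hylo.NominalAtom;
import opennlp.ccg.hylo.Proposition;
import opennlp.ccg.hylo.graph.LFEdgeLabel;
import opennlp.ccg.hylo.graph.LFGraph;
import opennlp.ccg.hylo.graph.LFVertex;

public class LFGraphSketch { // hypothetical class, for illustration only

    public static void main(String[] args) {
        // Assumes a grammar has already been loaded, as LFBaseTest does in setUp().
        // Vertices carry a nominal, a proposition, and optional mode attributes.
        LFGraph graph = new LFGraph();
        LFVertex walk = new LFVertex(new NominalAtom("w2"), new Proposition("walk"));
        walk.setAttribute(new ModeLabel("tense"), new Proposition("past"));
        LFVertex boy = new LFVertex(new NominalAtom("w1"), new Proposition("boy"));
        LFVertex det = new LFVertex(new NominalAtom("w0"), new Proposition("a"));
        graph.addVertex(walk);
        graph.addVertex(boy);
        graph.addVertex(det);

        // Dependency edges are added with mode-derived labels,
        // as in LFGraphDifferenceTest.
        graph.addLabeledEdge(walk, boy, LFEdgeLabel.forMode(new ModeLabel("Arg0")));
        graph.addLabeledEdge(boy, det, LFEdgeLabel.forMode(new ModeLabel("Det")));

        // Vertices can be recovered from their nominals, as LFGraphTest checks.
        System.out.println(graph.findVertexByNominal(boy.nominal));
        System.out.println(graph.vertexSet().size() + " vertices, "
            + graph.edgeSet().size() + " edges");
    }
}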
================================================ FILE: test/opennlp/ccg/hylo/graph/LFEdgeTest.java ================================================ package opennlp.ccg.hylo.graph; import static org.junit.Assert.*; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import org.junit.Before; import org.junit.Test; public class LFEdgeTest extends LFBaseTest { LFVertex source, target; LFEdgeLabel label; LFEdge edge; @Before public void setUp() throws Exception { super.setUp(); source = new LFVertex(new NominalAtom("w3"), new Proposition("prop1")); target = new LFVertex(new NominalAtom("w9"), new Proposition("prop2")); label = new LFEdgeLabel(new ModeLabel("Arg0")); edge = new LFEdge(source, target, label); } @Test public void testLFEdge() { try { new LFEdge(null, target); fail("able to specify null target"); } catch(IllegalArgumentException expected) { // do nothing } try { new LFEdge(source, null); fail("able to specify null target"); } catch(IllegalArgumentException expected) { // do nothing } try { new LFEdge(source, target, null); } catch(IllegalArgumentException expected) { fail("unable to specify null label"); } } @Test public void testEqualsObject() { assertEquals(edge, new LFEdge(edge.source, edge.target, edge.label)); } } ================================================ FILE: test/opennlp/ccg/hylo/graph/LFGraphTest.java ================================================ package opennlp.ccg.hylo.graph; import static org.junit.Assert.*; import java.io.File; import java.util.HashSet; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import opennlp.ccg.realize.Realizer; import opennlp.ccg.synsem.LF; import org.jdom.input.DOMBuilder; import org.junit.Before; import org.junit.Test; public class LFGraphTest extends LFBaseTest { LF testLF; LFGraph graph, expected; @Before public void setUp() throws Exception { super.setUp(); DocumentBuilder db; try { db = DocumentBuilderFactory.newInstance().newDocumentBuilder(); } catch(ParserConfigurationException e) { throw new Exception("problem with parser configuration: " + e.getLocalizedMessage(), e); } File testFile = new File(new File(new File(System.getProperty("user.dir")), "test"), "testlf.xml"); testLF = Realizer.getLfFromElt(new DOMBuilder().build(db.parse(testFile).getDocumentElement())); graph = LFGraphFactory.newGraphFrom(testLF); expected = new LFGraph(LFGraphFactory.DEFAULT_EDGE_FACTORY); LFVertex w7 = new LFVertex(new NominalAtom("w7"), new Proposition("be")); w7.setAttribute(new ModeLabel("mood"), new Proposition("dcl")); w7.setAttribute(new ModeLabel("tense"), new Proposition("past")); expected.addVertex(w7); LFVertex w0 = new LFVertex(new NominalAtom("w0"), new Proposition("bank")); w0.setAttribute(new ModeLabel("det"), new Proposition("nil")); expected.addVertex(w0); LFVertex w1 = new LFVertex(new NominalAtom("w1"), new Proposition("of")); expected.addVertex(w1); LFVertex w2 = new LFVertex(new NominalAtom("w2"), new Proposition("holland")); w2.setAttribute(new ModeLabel("det"), new Proposition("nil")); w2.setAttribute(new ModeLabel("num"), new Proposition("sg")); expected.addVertex(w2); LFVertex w5 = new LFVertex(new NominalAtom("w5"), new Proposition("office")); w5.setAttribute(new ModeLabel("det"), new Proposition("nil")); w5.setAttribute(new ModeLabel("num"), new 
Proposition("sg")); expected.addVertex(w5); LFVertex w4 = new LFVertex(new NominalAtom("w4"), new Proposition("wuhan")); w4.setAttribute(new ModeLabel("num"), new Proposition("sg")); expected.addVertex(w4); LFVertex w9 = new LFVertex(new NominalAtom("w9"), new Proposition("officially")); expected.addVertex(w9); LFVertex w8 = new LFVertex(new NominalAtom("w8"), new Proposition("also")); expected.addVertex(w8); LFVertex w10 = new LFVertex(new NominalAtom("w10"), new Proposition("establish")); w10.setAttribute(new ModeLabel("tense"), new Proposition("past")); expected.addVertex(w10); LFVertex w11 = new LFVertex(new NominalAtom("w11"), new Proposition("just")); expected.addVertex(w11); LFVertex w12 = new LFVertex(new NominalAtom("w12"), new Proposition("recently")); expected.addVertex(w12); expected.addLabeledEdge(w7, w0, LFEdgeLabel.forMode(new ModeLabel("Arg0"))); expected.addLabeledEdge(w0, w1, LFEdgeLabel.forMode(new ModeLabel("Mod"))); expected.addLabeledEdge(w1, w2, LFEdgeLabel.forMode(new ModeLabel("Arg1"))); expected.addLabeledEdge(w2, w5, LFEdgeLabel.forMode(new ModeLabel("ApposRel"))); expected.addLabeledEdge(w5, w4, LFEdgeLabel.forMode(new ModeLabel("Mod"))); expected.addLabeledEdge(w7, w9, LFEdgeLabel.forMode(new ModeLabel("Arg1"))); expected.addLabeledEdge(w9, w0, LFEdgeLabel.forMode(new ModeLabel("Arg0"))); expected.addLabeledEdge(w7, w8, LFEdgeLabel.forMode(new ModeLabel("Mod"))); expected.addLabeledEdge(w7, w10, LFEdgeLabel.forMode(new ModeLabel("GenRel"))); expected.addLabeledEdge(w10, w0, LFEdgeLabel.forMode(new ModeLabel("Arg1"))); expected.addLabeledEdge(w10, w11, LFEdgeLabel.forMode(new ModeLabel("Mod"))); expected.addLabeledEdge(w10, w12, LFEdgeLabel.forMode(new ModeLabel("Mod"))); } @Test public void testLFGraph() { assertEquals(expected.vertexSet(), graph.vertexSet()); assertEquals(expected.edgeSet(), graph.edgeSet()); } @Test public void testRemoveVertex() { for(LFVertex v : new HashSet(graph.vertexSet())) { graph.removeVertex(v); assertNull(graph.findVertexByNominal(v.nominal)); } } @Test public void testFindVertexByNominal() { for(LFVertex vertex : expected.vertexSet()) { assertEquals(vertex, graph.findVertexByNominal(vertex.nominal)); } } } ================================================ FILE: test/opennlp/ccg/hylo/graph/LFVertexTest.java ================================================ package opennlp.ccg.hylo.graph; import static org.junit.Assert.*; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import opennlp.ccg.hylo.Mode; import opennlp.ccg.hylo.ModeLabel; import opennlp.ccg.hylo.Nominal; import opennlp.ccg.hylo.NominalAtom; import opennlp.ccg.hylo.Proposition; import org.junit.Before; import org.junit.Test; public class LFVertexTest extends LFBaseTest { Nominal wordNominal, nonwordNominal; Proposition proposition; Integer wordIndex, nonwordIndex; LFVertex word, nonword; Map attrs; @Before public void setUp() throws Exception { super.setUp(); wordNominal = new NominalAtom("w7"); nonwordNominal = new NominalAtom("x1"); proposition = new Proposition("prop"); wordIndex = Integer.valueOf(7); nonwordIndex = Integer.valueOf(1); attrs = new HashMap(); attrs.put(new ModeLabel("num"), new Proposition("sg")); attrs.put(new ModeLabel("det"), new Proposition("nil")); attrs.put(new ModeLabel("tense"), new Proposition("past")); word = new LFVertex(wordNominal, proposition, attrs); nonword = new LFVertex(nonwordNominal, proposition); } @Test public void testLFVertex() { try { new LFVertex((Nominal)null); fail("able to 
specify null nominal"); } catch(IllegalArgumentException expected) { // do nothing } try { new LFVertex(wordNominal, null); } catch(IllegalArgumentException expected) { fail("unable to specify null proposition"); } } @Test public void testGetType() { assertEquals(LFVertexType.WORD, word.getType()); assertEquals(LFVertexType.NONWORD, nonword.getType()); assertNotSame(LFVertexType.WORD, nonword.getType()); assertNotSame(LFVertexType.NONWORD, word.getType()); } @Test public void testGetIndex() { assertEquals(wordIndex, word.getIndex()); assertEquals(nonwordIndex, nonword.getIndex()); } @Test public void testAttributeNames() { assertEquals(attrs.keySet(), word.attributeNames()); assertEquals(Collections.emptySet(), nonword.attributeNames()); try { Iterator i = word.attributeNames().iterator(); i.next(); i.remove(); fail("able to remove attribute name"); } catch(UnsupportedOperationException expected) { // do nothing } } @Test public void testContainsAttribute() { Mode num = new ModeLabel("num"); assertTrue(word.containsAttribute(num)); word.removeAttribute(num); assertFalse(word.containsAttribute(num)); } @Test public void testGetAttribute() { for(Mode m : attrs.keySet()) { assertEquals(attrs.get(m), word.getAttributeValue(m)); assertNull(nonword.getAttributeValue(m)); } } @Test public void testAddAttribute() { Mode num = new ModeLabel("num"); Proposition prop = new Proposition("pl"); assertFalse(word.addAttribute(num, new Proposition("sg"))); assertTrue(word.addAttribute(num, prop)); assertFalse(word.addAttribute(num, prop)); assertTrue(word.containsAttribute(num)); assertTrue(nonword.addAttribute(num, new Proposition("sg"))); assertFalse(nonword.addAttribute(num, new Proposition("sg"))); assertTrue(nonword.containsAttribute(num)); assertTrue(nonword.addAttribute(num, prop)); assertFalse(nonword.addAttribute(num, prop)); assertTrue(nonword.containsAttribute(num)); } @Test public void testSetAttribute() { Mode num = new ModeLabel("num"); Proposition prop = new Proposition("pl"); assertEquals(attrs.get(num), word.setAttribute(num, prop)); assertEquals(prop, word.getAttributeValue(num)); assertNull(nonword.setAttribute(num, prop)); assertEquals(prop, nonword.getAttributeValue(num)); } @Test public void testRemoveAttribute() { Mode num = new ModeLabel("num"); assertEquals(attrs.get(num), word.removeAttribute(num)); assertNull(nonword.removeAttribute(num)); } @Test public void testEqualsObject() { LFVertex v = new LFVertex(wordNominal, proposition); for(Mode m : attrs.keySet()) { v.setAttribute(m, attrs.get(m)); } assertEquals(v, word); assertNotSame(v, nonword); } @Test public void testGetAttributeMap() { Map m = word.getAttributeMap(); assertEquals(attrs, m); try { Iterator> i = m.entrySet().iterator(); i.next(); i.remove(); fail("able to remove from attribute map"); } catch(UnsupportedOperationException expected) { // do nothing } try { m.put(new ModeLabel("foo"), new Proposition("bar")); fail("able to put into attribute map"); } catch(UnsupportedOperationException expected) { // do nothing } } } ================================================ FILE: test/opennlp/ccg/util/CompositeFilterTest.java ================================================ package opennlp.ccg.util; import static org.junit.Assert.*; import java.util.HashSet; import java.util.Set; import org.junit.Before; import org.junit.Test; public class CompositeFilterTest { VisitedFilter visited; Integer target = 37; Filter lessThanFilter, greaterThanFilter; CompositeFilter bothFilter, equalToFilter; @SuppressWarnings("unchecked") 
@Before public void setUp() throws Exception { visited = new VisitedFilter(); lessThanFilter = new Filter() { @Override public boolean allows(Integer e) { return e < target; } }; greaterThanFilter = new Filter() { @Override public boolean allows(Integer e) { return e > target; } }; Set> s = new HashSet>(); s.add(lessThanFilter); s.add(greaterThanFilter); bothFilter = new CompositeFilter(s); equalToFilter = new CompositeFilter(new InverseFilter(bothFilter)); } @Test public void testContainsFilter() { assertTrue(bothFilter.containsFilter(lessThanFilter)); assertTrue(bothFilter.containsFilter(greaterThanFilter)); assertFalse(bothFilter.containsFilter(bothFilter)); } @Test public void testAddFilter() { Filter f = new VisitedFilter(); equalToFilter.addFilter(f); assertTrue(equalToFilter.allows(37)); assertFalse(equalToFilter.allows(37)); try { equalToFilter.addFilter(null); fail("able to add null filter"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testRemoveFilter() { Filter f = new VisitedFilter(); equalToFilter.addFilter(f); assertTrue(equalToFilter.allows(37)); equalToFilter.removeFilter(f); assertTrue(equalToFilter.allows(37)); assertTrue(equalToFilter.allows(37)); } @Test public void testAllows() { assertTrue(lessThanFilter.allows(17)); assertFalse(lessThanFilter.allows(38)); assertFalse(greaterThanFilter.allows(17)); assertTrue(greaterThanFilter.allows(38)); assertFalse(bothFilter.allows(37)); assertTrue(equalToFilter.allows(37)); } } ================================================ FILE: test/opennlp/ccg/util/DelegatedFilterTest.java ================================================ package opennlp.ccg.util; import static org.junit.Assert.*; import org.junit.Before; import org.junit.Test; public class DelegatedFilterTest { Filter lengthFilter; DelegatedFilter stringFilter; @Before public void setUp() throws Exception { lengthFilter = new Filter() { @Override public boolean allows(Integer i) { return i <= 5; } }; stringFilter = new DelegatedFilter(lengthFilter) { @Override public Integer delegateValueFor(String e) { return e.length(); } }; } @Test public void testAllows() { assertTrue(stringFilter.allows("Scott")); assertTrue(stringFilter.allows("Mike")); assertTrue(stringFilter.allows("Jason")); assertFalse(stringFilter.allows("Dominic")); assertFalse(stringFilter.allows("Dennis")); } @Test public void testDelegateValueFor() { assertEquals(Integer.valueOf(5), stringFilter.delegateValueFor("Scott")); } } ================================================ FILE: test/opennlp/ccg/util/FilteredMapTest.java ================================================ package opennlp.ccg.util; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; import org.junit.Before; import org.junit.Test; public class FilteredMapTest { Map map; FilteredMap filteredMap; Filter keyFilter; Integer target = 37; @Before public void setUp() throws Exception { map = new HashMap(); map.put(17, "seventeen"); map.put(31, "thirty-one"); map.put(37, "thirty-seven"); map.put(43, "forty-three"); keyFilter = new Filter() { @Override public boolean allows(Integer e) { return target >= e; } }; filteredMap = new FilteredMap(map, keyFilter); } @Test public void testFilteredMap() { try { new FilteredMap(map, null); fail("able to specify null key filter"); } 
catch(IllegalArgumentException expected) { // do nothing } } @Test public void testGetOriginalMap() { assertEquals(map, filteredMap.getOriginalMap()); } @Test public void testGetKeyFilter() { assertEquals(keyFilter, filteredMap.getKeyFilter()); } @Test public void testEntrySet() { for(Entry e : filteredMap.entrySet()) { if(e.getKey() > target) { fail("filtered map contains bad key"); } } } @Test public void testPut() { assertNull(filteredMap.put(47, "blah")); assertNull(filteredMap.put(29, "twenty-nine")); assertFalse(filteredMap.containsKey(47)); assertTrue(filteredMap.containsKey(29)); assertEquals("twenty-nine", filteredMap.put(29, "blah")); assertEquals("blah", filteredMap.get(29)); } @Test public void testContainsValue() { assertTrue(filteredMap.containsValue("seventeen")); assertFalse(filteredMap.containsValue("forty-three")); } @Test public void testContainsKey() { assertTrue(filteredMap.containsKey(31)); assertFalse(filteredMap.containsKey(43)); } @Test public void testKeySet() { for(Integer k : filteredMap.keySet()) { if(k.equals(43)) { fail("filtered map contains bad key"); } } } @Test public void testValues() { assertTrue(filteredMap.values().contains("seventeen")); assertFalse(filteredMap.values().contains("forty-three")); filteredMap.remove(17); assertFalse(filteredMap.values().contains("seventeen")); } } ================================================ FILE: test/opennlp/ccg/util/FilteredSetTest.java ================================================ package opennlp.ccg.util; import static org.junit.Assert.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; import org.junit.Before; import org.junit.Test; public class FilteredSetTest { FilteredSet testSet, sameSet; List values; Filter testFilter, sameFilter; @Before public void setUp() throws Exception { values = new ArrayList(Arrays.asList("test", "test", "other", "different")); testFilter = new Filter() { @Override public boolean allows(String e) { return e.equals("test"); } }; sameFilter = new VisitedFilter(); testSet = new FilteredSet(values, testFilter); sameSet = new FilteredSet(values, sameFilter); } @Test public void testFilteredSet() { try { new FilteredSet(testSet, null); fail("able to specify null filter"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testSize() { assertEquals(1, testSet.size()); assertEquals(values.size() - 1, sameSet.size()); } @Test public void testAdd() { int sz = testSet.size(); assertFalse(testSet.add("test")); assertTrue(testSet.remove("test")); assertEquals(sz - 1, testSet.size()); assertFalse(sameSet.add("test")); assertTrue(sameSet.add("blah")); assertTrue(sameSet.add("xyxyx")); assertFalse(sameSet.add("xyxyx")); assertEquals(5, sameSet.size()); } @Test public void testIterator() { Iterator i = testSet.iterator(); assertTrue(i.hasNext()); assertEquals("test", i.next()); assertFalse(i.hasNext()); i = sameSet.iterator(); assertTrue(i.hasNext()); assertEquals("test", i.next()); assertEquals("other", i.next()); assertEquals("different", i.next()); i = sameSet.iterator(); assertEquals("test", i.next()); i.remove(); assertFalse(sameSet.contains("test")); } @Test public void testRemove() { testSet.remove("test"); assertFalse(testSet.contains("test")); sameSet.remove("test"); assertFalse(sameSet.contains("test")); } @Test public void testClear() { testSet.clear(); assertEquals(0, testSet.size()); sameSet.clear(); assertEquals(0, sameSet.size()); } @Test public void testGetOriginalCollection() { 
assertEquals(values, testSet.getOriginalCollection()); assertEquals(values, sameSet.getOriginalCollection()); } } ================================================ FILE: test/opennlp/ccg/util/InverseFilterTest.java ================================================ package opennlp.ccg.util; import static org.junit.Assert.*; import org.junit.Before; import org.junit.Test; public class InverseFilterTest { VisitedFilter visited; InverseFilter inverse; @Before public void setUp() throws Exception { visited = new VisitedFilter(); inverse = new InverseFilter(visited); } @Test public void testInverseFilter() { try { new InverseFilter(null); fail("able to specify null filter"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testGetOriginalFilter() { assertEquals(visited, inverse.getOriginalFilter()); } @Test public void testAllows() { assertTrue(visited.allows(37)); assertFalse(visited.allows(37)); assertTrue(inverse.allows(37)); assertFalse(inverse.allows(17)); assertFalse(visited.allows(17)); assertTrue(inverse.allows(17)); } } ================================================ FILE: test/opennlp/ccg/util/MembershipFilterTest.java ================================================ package opennlp.ccg.util; import static org.junit.Assert.*; import java.util.HashSet; import java.util.Set; import org.junit.Before; import org.junit.Test; public class MembershipFilterTest { Filter filter; Set strings; @Before public void setUp() throws Exception { strings = new HashSet(); strings.add("test"); strings.add("one"); strings.add("two"); strings.add("three"); filter = new MembershipFilter(strings); } @Test public void testMembershipFilter() { try { new MembershipFilter(null); fail("able to specify null members"); } catch(IllegalArgumentException expected) { // do nothing } } @Test public void testAllows() { for(String s : strings) { assertTrue(filter.allows(s)); } assertFalse(filter.allows("blah")); assertFalse(filter.allows("")); assertFalse(filter.allows(null)); } } ================================================ FILE: test/opennlp/ccg/util/VisitedFilterTest.java ================================================ package opennlp.ccg.util; import static org.junit.Assert.*; import org.junit.Before; import org.junit.Test; public class VisitedFilterTest { VisitedFilter filter; @Before public void setUp() throws Exception { filter = new VisitedFilter(); } @Test public void testAllows() { assertTrue(filter.allows(1)); assertFalse(filter.allows(1)); assertTrue(filter.allows(0)); } @Test public void testHasVisited() { assertTrue(filter.allows(1)); assertTrue(filter.hasVisited(1)); assertFalse(filter.hasVisited(13)); } } ================================================ FILE: test/output.xml ================================================ chinese officials have repeatedly indicated that taiwan is a province of china and that china is a domestic chinese issue . the chinese government has always been signifying that taiwan is a province of china and taiwan issue is the internal issue of china . 
1, 1 <-> 2, 2 <-> 3, 2 <-> 5, 2 <-> 6, 3 <-> 4, 4 <-> 3, 4 <-> 5, 4 <-> 6, 5 <-> 7, 6 <-> 8, 6 <-> 15, 7 <-> 9, 8 <-> 10, 9 <-> 11, 10 <-> 12, 11 <-> 13, 12 <-> 14, 15 <-> 17, 16 <-> 18, 17 <-> 19, 18 <-> 21, 18 <-> 22, 19 <-> 20, 20 <-> 23]]]> the government of qinghai province made the best use of the situation , coming up with an idea to commercialize the urban infrastructure construction , and ratified , at the beginning of this year , provisions for xining city to encourage and guide foreign investments . the government of qinghai province put forward the idea of commercialization of city infrastructure , and approved some regulations on encouraging and attracting foreign investment in xining city early this year . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 13 <-> 5, 13 <-> 6, 14 <-> 5, 14 <-> 6, 15 <-> 5, 15 <-> 6, 16 <-> 7, 17 <-> 8, 18 <-> 9, 18 <-> 10, 19 <-> 9, 19 <-> 10, 21 <-> 12, 22 <-> 13, 24 <-> 14, 25 <-> 15, 26 <-> 16, 28 <-> 28, 29 <-> 28, 30 <-> 28, 32 <-> 29, 33 <-> 30, 35 <-> 18, 36 <-> 19, 36 <-> 25, 37 <-> 26, 38 <-> 27, 39 <-> 19, 40 <-> 20, 41 <-> 21, 42 <-> 22, 43 <-> 23, 44 <-> 24, 45 <-> 31]]]> relevant sections of the henan government released their foreign cooperation projects concerning economy and technology . representatives attending the meeting consulted on information of some projects they interested in. the competent agencies of the henan provincial government released [ a list of ] foreign economic technological cooperation projects of the province at the meeting and held discussions regarding the purpose of cooperation in the related projects with representatives at the meeting . 1, 1 <-> 2, 2 <-> 3, 3 <-> 4, 4 <-> 5, 4 <-> 21, 5 <-> 7, 6 <-> 8, 8 <-> 14, 9 <-> 17, 9 <-> 32, 10 <-> 18, 12 <-> 15, 14 <-> 16, 16 <-> 38, 17 <-> 39, 18 <-> 40, 19 <-> 24, 19 <-> 41, 20 <-> 26, 20 <-> 27, 21 <-> 28, 21 <-> 29, 21 <-> 30, 21 <-> 31, 23 <-> 33, 24 <-> 34, 24 <-> 35, 25 <-> 36]]]> foreign minister duma expressed his thoughts that the un was born after world war ii , and that france and other permanent member countries were all the victors of that war . the foreign minister feels that the united nations was born out of the second world war , and that france and the other permanent members of the council were victors in that war . 1, 1 <-> 2, 3 <-> 3, 5 <-> 3, 6 <-> 4, 7 <-> 5, 8 <-> 6, 8 <-> 7, 9 <-> 8, 10 <-> 9, 11 <-> 10, 11 <-> 11, 12 <-> 14, 13 <-> 15, 14 <-> 13, 15 <-> 16, 16 <-> 17, 17 <-> 18, 18 <-> 19, 19 <-> 20, 20 <-> 22, 21 <-> 23, 22 <-> 24, 23 <-> 24, 24 <-> 28, 27 <-> 29, 28 <-> 30, 29 <-> 31, 30 <-> 32, 31 <-> 33]]]> moore said the australian military attache to jakarta will conduct free investigations after he arrives in east timor tomorrow . moore said , after the australian military attache to jakarta arrives in east timor tomorrow , he can conduct investigations freely in the area . 0, 1 <-> 1, 2 <-> 4, 3 <-> 5, 4 <-> 6, 5 <-> 7, 5 <-> 16, 6 <-> 8, 7 <-> 9, 8 <-> 17, 8 <-> 18, 9 <-> 17, 9 <-> 18, 10 <-> 20, 11 <-> 19, 14 <-> 10, 15 <-> 11, 16 <-> 12, 16 <-> 23, 17 <-> 13, 17 <-> 23, 18 <-> 14, 19 <-> 24]]]> at that time , jiang zeming indicated that china would consider joining the missile technology control regime . the indication has been taken as the principle result of the meeting between the two state heads . jiang zemin promised at that time that china would consider joining the missile technology control agreement , which was seen as one of the major achievements in the china-us summit meeting . 
3, 1 <-> 4, 2 <-> 5, 4 <-> 0, 5 <-> 1, 6 <-> 2, 7 <-> 6, 8 <-> 7, 9 <-> 8, 10 <-> 9, 11 <-> 10, 12 <-> 11, 13 <-> 12, 14 <-> 13, 15 <-> 14, 16 <-> 15, 17 <-> 16, 18 <-> 17, 19 <-> 17, 20 <-> 18, 20 <-> 19, 21 <-> 18, 21 <-> 19, 22 <-> 18, 22 <-> 19, 23 <-> 20, 24 <-> 21, 24 <-> 23, 25 <-> 24, 26 <-> 25, 27 <-> 26, 28 <-> 27, 29 <-> 29, 29 <-> 30, 30 <-> 28, 31 <-> 28, 32 <-> 28, 33 <-> 28, 34 <-> 29, 35 <-> 31]]]> more than 90% of china 's people centralize on one third of its territory . more than 90 percent of the population is compacted into one-third of the country 's territory . 0, 1 <-> 1, 2 <-> 2, 2 <-> 3, 3 <-> 4, 4 <-> 13, 6 <-> 6, 7 <-> 7, 7 <-> 8, 8 <-> 9, 9 <-> 10, 10 <-> 10, 11 <-> 11, 12 <-> 13, 12 <-> 14, 13 <-> 15, 14 <-> 16]]]> what 's more , next year la nina will be just as bad as el nino . and la nina will put on a frightening display next year no less devastating than that of el nino . 0, 1 <-> 0, 2 <-> 0, 4 <-> 9, 5 <-> 10, 6 <-> 1, 7 <-> 2, 8 <-> 3, 8 <-> 4, 8 <-> 5, 8 <-> 6, 8 <-> 7, 8 <-> 8, 8 <-> 11, 8 <-> 12, 8 <-> 13, 8 <-> 14, 9 <-> 3, 9 <-> 4, 9 <-> 5, 9 <-> 6, 9 <-> 7, 9 <-> 8, 9 <-> 11, 9 <-> 12, 9 <-> 13, 9 <-> 14, 10 <-> 3, 10 <-> 4, 10 <-> 6, 10 <-> 8, 10 <-> 11, 10 <-> 12, 10 <-> 13, 10 <-> 14, 11 <-> 3, 11 <-> 4, 11 <-> 6, 11 <-> 8, 11 <-> 11, 11 <-> 12, 11 <-> 13, 11 <-> 14, 12 <-> 3, 12 <-> 4, 12 <-> 6, 12 <-> 8, 12 <-> 11, 12 <-> 12, 12 <-> 13, 12 <-> 14, 13 <-> 3, 13 <-> 4, 13 <-> 6, 13 <-> 8, 13 <-> 11, 13 <-> 12, 13 <-> 13, 13 <-> 14, 14 <-> 17, 15 <-> 18, 16 <-> 19]]]> russian prime minister on domestic economy russian prime minister talked about russian economic situation . 0, 1 <-> 1, 2 <-> 2, 3 <-> 4, 4 <-> 5, 5 <-> 6, 5 <-> 7]]]> nowadays , there appeared a lot of enterprise groups with high technology and strong stamina . nowadays , there appeared a lot of enterprise groups with high technology and strong stamina . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 5 <-> 5, 6 <-> 6, 7 <-> 7, 8 <-> 8, 9 <-> 9, 10 <-> 10, 11 <-> 11, 12 <-> 12, 13 <-> 13, 14 <-> 14, 15 <-> 15]]]> the gnp created by the village and township enterprises occupied about 1\/3 of the total gnp in fujian province . the gross national product of the industry of villages and towns made up 1\/3 of the gnp of fujian province . 0, 1 <-> 1, 1 <-> 2, 1 <-> 3, 3 <-> 4, 4 <-> 5, 5 <-> 8, 6 <-> 9, 7 <-> 10, 8 <-> 6, 9 <-> 11, 9 <-> 12, 11 <-> 13, 12 <-> 14, 13 <-> 15, 15 <-> 16, 16 <-> 17, 17 <-> 18, 18 <-> 19, 19 <-> 20]]]> both parties expressed satisfaction for the friendly cooperation between the two armies in the past 30 years . both parties expressed satisfaction for the friendly cooperation between the two armies in the past 30 years . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 5 <-> 5, 6 <-> 6, 7 <-> 7, 8 <-> 8, 9 <-> 9, 10 <-> 10, 11 <-> 11, 12 <-> 12, 13 <-> 13, 14 <-> 14, 15 <-> 15, 16 <-> 16, 17 <-> 17]]]> also present at the seminar will be long yongtu , vice minister of china 's ministry of foreign trade and economic cooperation who will accompany rugerro to shanghai , and shen jueren , former chief negotiator of china in gatt negotiations . attendees will also include the vice minister of foreign trade & economic cooperation , yongtu long , who accompanied ruggiero to shanghai , and former chief representative for trade negotiations of the moftec . 
2, 1 <-> 0, 1 <-> 3, 5 <-> 1, 6 <-> 3, 7 <-> 15, 8 <-> 14, 9 <-> 16, 10 <-> 5, 11 <-> 6, 16 <-> 7, 17 <-> 8, 18 <-> 9, 19 <-> 10, 20 <-> 11, 21 <-> 12, 22 <-> 17, 23 <-> 18, 24 <-> 18, 25 <-> 19, 26 <-> 20, 27 <-> 21, 28 <-> 22, 29 <-> 23, 33 <-> 24, 34 <-> 25, 35 <-> 26, 35 <-> 27, 35 <-> 29, 36 <-> 30, 37 <-> 32, 38 <-> 27, 39 <-> 28, 40 <-> 29, 41 <-> 33]]]> xinhua news agency , nanjing , 16 december , by zhoufang the investment of foreign capital in agriculture of jiangsu increased . xinhua news agency , nanjing , dec. 16 . ( reporter zhou fang ) an increasing number of investments from foreign capital are being made on agriculture in jiangsu . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 5 <-> 5, 6 <-> 7, 7 <-> 6, 8 <-> 8, 10 <-> 11, 10 <-> 12, 12 <-> 18, 13 <-> 19, 14 <-> 20, 15 <-> 21, 16 <-> 25, 17 <-> 26, 18 <-> 27, 19 <-> 28, 20 <-> 15, 21 <-> 29]]]> france ministry of foreign affairs said that leaders of iraq refused to co-operate with united nations and have caused this deeply regretted outcome . the french foreign ministry said that because iraqi leader refuses to cooperate with the un caused this sad events . 1, 1 <-> 3, 2 <-> 2, 3 <-> 2, 4 <-> 2, 5 <-> 4, 6 <-> 5, 7 <-> 8, 8 <-> 7, 9 <-> 7, 10 <-> 9, 11 <-> 10, 12 <-> 11, 13 <-> 12, 14 <-> 14, 15 <-> 14, 17 <-> 15, 18 <-> 15, 19 <-> 16, 20 <-> 17, 21 <-> 17, 22 <-> 18, 23 <-> 19]]]> during the preliminaries , lan wei , a highly-skilled twenty-six year-old native of guangdong , scored 355.35 for first place . in the qualifiers , 26-year-old lan wei from guangdong displayed excellent skill , and ranked no. 1 with a total score of 355.3 . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 5, 5 <-> 6, 8 <-> 9, 8 <-> 10, 8 <-> 11, 9 <-> 4, 10 <-> 4, 11 <-> 7, 12 <-> 7, 13 <-> 8, 14 <-> 12, 15 <-> 20, 16 <-> 22, 18 <-> 15, 18 <-> 16, 19 <-> 14, 19 <-> 15, 20 <-> 23]]]> nanjing , december 16 ( xinhua ) foreign fund put into jiangsu 's agricultural sector is increasing xinhua news agency , nanjing , dec. 16 ( reporter zhou fang ) - foreign investment in jiangsu 's agriculture is on the rise . 4, 1 <-> 5, 2 <-> 6, 3 <-> 7, 4 <-> 8, 5 <-> 0, 7 <-> 14, 8 <-> 15, 8 <-> 16, 9 <-> 15, 9 <-> 16, 10 <-> 15, 10 <-> 16, 11 <-> 17, 12 <-> 18, 13 <-> 19, 14 <-> 19, 15 <-> 20, 16 <-> 21, 16 <-> 22, 16 <-> 23]]]> someone connected global warming to the phenomenon of el nino . some linked global warming with el nino . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 8 <-> 5, 9 <-> 6, 10 <-> 7]]]> xinhua news agency , beijing , january 16th , by xintang xu and yuhong qian- the president of the bank of china , xianglong dai said that , china will continue implementing the financial opening up policy . xinhua news agency , beijing , january 16th ( reporter : xu xingtang , qian yuhong ) . dai xianglong , the president of people 's bank of china , said that china will carry on with the open financial policy . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 5 <-> 5, 6 <-> 6, 7 <-> 7, 10 <-> 10, 11 <-> 9, 12 <-> 11, 13 <-> 13, 14 <-> 12, 15 <-> 19, 16 <-> 20, 17 <-> 21, 18 <-> 23, 19 <-> 24, 20 <-> 25, 21 <-> 26, 22 <-> 27, 23 <-> 17, 24 <-> 16, 25 <-> 28, 26 <-> 29, 28 <-> 30, 29 <-> 31, 30 <-> 32, 30 <-> 33, 31 <-> 34, 32 <-> 35, 33 <-> 37, 34 <-> 36, 35 <-> 36, 36 <-> 38, 37 <-> 39]]]> this kind of measure restrained the economic growth , leading to the rise of unemployment . such measures checked economic growth and caused unemployment to rise . 
0, 1 <-> 0, 2 <-> 0, 3 <-> 1, 4 <-> 2, 6 <-> 3, 7 <-> 4, 8 <-> 5, 9 <-> 6, 10 <-> 6, 12 <-> 9, 14 <-> 7, 15 <-> 10]]]> siemens germany is currently cooperating with a local thai company , and constructing another 23km long electric railway project in bangkok . the german shermans company is currently working together with a local company constructing a 23 kilometer electronic train engineering project . 2, 1 <-> 1, 2 <-> 4, 3 <-> 5, 4 <-> 6, 4 <-> 7, 5 <-> 8, 6 <-> 9, 7 <-> 10, 9 <-> 11, 12 <-> 12, 13 <-> 13, 14 <-> 14, 14 <-> 15, 16 <-> 16, 17 <-> 17, 17 <-> 18, 18 <-> 19, 21 <-> 20]]]> he said that the same resolve would lead to the success of the kyoto protocol . he said that the same resolve would lead to the success of the kyoto protocol . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 5 <-> 5, 6 <-> 6, 7 <-> 7, 8 <-> 8, 9 <-> 9, 10 <-> 10, 11 <-> 11, 12 <-> 12, 13 <-> 13, 14 <-> 14, 15 <-> 15]]]> however , the official who disclosed the above declined to elaborate what kind of missile technology is being exported and when china provided such technology to pakistan and iran . however , the official was not willing to explain what kind of missile technology that china provided to pakistan and iran , and when this technology was provided . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 8 <-> 4, 8 <-> 5, 8 <-> 6, 9 <-> 7, 10 <-> 8, 11 <-> 9, 12 <-> 10, 13 <-> 11, 14 <-> 12, 15 <-> 13, 19 <-> 22, 20 <-> 23, 21 <-> 15, 22 <-> 16, 22 <-> 26, 22 <-> 27, 23 <-> 24, 24 <-> 13, 24 <-> 25, 25 <-> 17, 26 <-> 18, 27 <-> 19, 28 <-> 20, 29 <-> 28]]]> yunnan gardens was developed by fujian company , there are 313 apartments in this residential project . the yunnan gardens project developed by the fujian society is a 313-unit housing project with land . 1, 1 <-> 2, 2 <-> 4, 3 <-> 4, 4 <-> 5, 5 <-> 7, 6 <-> 8, 9 <-> 9, 10 <-> 11, 11 <-> 11, 13 <-> 10, 14 <-> 12, 15 <-> 13, 16 <-> 16]]]> in the last two years , some foreign businessmen showed their interest in the construction of xiling city . during the past two years , a batch of foreign businessmen expressed their wishes to get involved in xining 's city construction one after another . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 4, 5 <-> 5, 6 <-> 6, 6 <-> 7, 6 <-> 8, 7 <-> 9, 8 <-> 10, 9 <-> 11, 10 <-> 12, 11 <-> 13, 12 <-> 17, 13 <-> 19, 14 <-> 21, 15 <-> 19, 16 <-> 18, 17 <-> 20, 18 <-> 25]]]> the dai yu xiang industrial consulting company said that spacious front door area is the biggest characteristic of yunnan gardens with land ownership . dai yuxiang real estate consulting firm said that the most special characteristic of the landed residences at yunnan gardens is their spacious appearance . 0, 2 <-> 1, 3 <-> 1, 4 <-> 2, 4 <-> 3, 5 <-> 4, 6 <-> 5, 7 <-> 6, 8 <-> 7, 9 <-> 21, 10 <-> 22, 11 <-> 22, 12 <-> 22, 13 <-> 19, 14 <-> 8, 15 <-> 9, 15 <-> 10, 16 <-> 11, 17 <-> 12, 18 <-> 17, 19 <-> 18, 20 <-> 14, 21 <-> 14, 22 <-> 15, 22 <-> 22, 23 <-> 23]]]> this measure restrained economic growth and raised the rate of unemployment . this curbs economic growth and increases unemployment rates . 0, 2 <-> 1, 3 <-> 2, 4 <-> 3, 5 <-> 4, 6 <-> 5, 8 <-> 7, 10 <-> 6, 11 <-> 8]]]> according to xinhua news agency , london , on february 3rd , bbc reporter bowen stayed in baghdad . the capital of iraq stated that there was no evidence to indicate the bomb-shelter destroyed by a us bomber was a military target . 
according to xinhua news report from london on february 13 , bonne , a reporter of bbc in iraqi capital baghdad , confirmed on 13th that no any sign showed the bomb shelter in baghdad destroyed by american bomber was a military blindage . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 6 <-> 6, 8 <-> 7, 9 <-> 8, 10 <-> 9, 11 <-> 10, 12 <-> 15, 12 <-> 16, 13 <-> 14, 14 <-> 11, 16 <-> 17, 17 <-> 20, 17 <-> 34, 20 <-> 19, 21 <-> 18, 22 <-> 18, 23 <-> 22, 24 <-> 25, 26 <-> 26, 27 <-> 26, 28 <-> 28, 30 <-> 29, 31 <-> 30, 32 <-> 31, 32 <-> 32, 33 <-> 35, 34 <-> 36, 36 <-> 37, 37 <-> 38, 38 <-> 39, 39 <-> 40, 40 <-> 41, 41 <-> 42, 42 <-> 43]]]> mr. siazon said , someone used to say , have n't we been burned once before ? when we were ready to accept them , it did n't take long before they fought agaion . siazon said : someone says , were we deceived before ? when we prepared to accept them but just after several days they began to fight again . 0, 1 <-> 0, 2 <-> 1, 3 <-> 2, 4 <-> 3, 5 <-> 4, 6 <-> 4, 7 <-> 4, 8 <-> 5, 9 <-> 6, 9 <-> 8, 11 <-> 7, 12 <-> 6, 12 <-> 8, 13 <-> 6, 13 <-> 8, 15 <-> 9, 16 <-> 10, 17 <-> 11, 18 <-> 12, 19 <-> 13, 20 <-> 13, 21 <-> 14, 22 <-> 15, 23 <-> 16, 24 <-> 17, 25 <-> 18, 25 <-> 19, 25 <-> 20, 25 <-> 21, 26 <-> 18, 26 <-> 19, 26 <-> 20, 26 <-> 21, 27 <-> 18, 27 <-> 19, 27 <-> 20, 27 <-> 21, 28 <-> 18, 28 <-> 19, 28 <-> 20, 28 <-> 21, 29 <-> 18, 29 <-> 19, 29 <-> 20, 29 <-> 21, 30 <-> 18, 30 <-> 19, 30 <-> 20, 30 <-> 21, 31 <-> 22, 32 <-> 23, 32 <-> 24, 32 <-> 25, 33 <-> 26, 34 <-> 27]]]> railroad officials blamed the bus passengers for the accident , the indian news agency india press trust report said . report of pti said that railway official charged the bus passengers upon the accident . 5, 1 <-> 6, 2 <-> 7, 3 <-> 8, 4 <-> 9, 5 <-> 10, 6 <-> 11, 7 <-> 12, 8 <-> 13, 14 <-> 2, 15 <-> 2, 16 <-> 2, 17 <-> 0, 18 <-> 3, 19 <-> 14]]]> we mentioned the problems australia is concerned with , such as human rights and dispatching army officers to east timor . we mentioned the problems that have attracted australia 's attention , such as the humam rights in east timor and the issue of sending military officers to east timor . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 7, 5 <-> 5, 5 <-> 6, 5 <-> 8, 5 <-> 9, 6 <-> 5, 6 <-> 6, 6 <-> 8, 6 <-> 9, 7 <-> 5, 7 <-> 6, 7 <-> 8, 7 <-> 9, 8 <-> 10, 9 <-> 11, 10 <-> 12, 11 <-> 14, 12 <-> 15, 13 <-> 19, 14 <-> 23, 15 <-> 24, 16 <-> 25, 17 <-> 26, 18 <-> 27, 19 <-> 17, 19 <-> 28, 20 <-> 18, 20 <-> 29]]]> the two prime ministers requested that un secretary-general prolong the un representative 's stay in cambodia to 6 months or longer . the two prime ministers of cambodia asked the un secretary general to extend the un representative 's term of stay in cambodia to six months or longer . 0, 1 <-> 1, 2 <-> 2, 3 <-> 3, 4 <-> 6, 6 <-> 8, 7 <-> 9, 7 <-> 10, 8 <-> 12, 9 <-> 13, 10 <-> 14, 11 <-> 15, 12 <-> 16, 13 <-> 19, 14 <-> 20, 15 <-> 5, 15 <-> 21, 16 <-> 22, 17 <-> 23, 18 <-> 24, 19 <-> 25, 20 <-> 26, 21 <-> 27]]]> ================================================ FILE: test/paraphrases.xml ================================================ chinese officials have repeatedly indicated that taiwan is a province of china and that china is a domestic chinese issue . 
w19:n:NN:issue w9:n:NN:province w18:n/n:NN:chinese w8:np/n:DT:a w17:n/n:JJ:domestic w7:s[dcl]\np/np:VBZ:be w16:np/n:DT:a w6:n:NN:taiwan w15:s[dcl]\np/np:VBZ:be w14:n:NN:china w4:s[pt]\np/s[em]:VBN:indicate w3:s\np\(s\np):RB:repeatedly w12:s[em]$\(s[em]$)/(s[em]$):CC:and w2:s[dcl]\np/(s[pt]\np):VBP:PERF w11:n:NN:china w1:n:NNS:official w10:np\np/np:IN:of w0:n/n:NN:chinese chinese:S-chinese:P-NN:T-n/n officials:S-official:P-NNS:T-n have:S-have:P-VBP:T-s[dcl]\np/(s[pt]\np) repeatedly:S-repeatedly:P-RB:T-s\np\(s\np) indicated:S-indicate:P-VBN:T-s[pt]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] taiwan:S-taiwan:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/np a:S-a:P-DT:T-np/n province:S-province:P-NN:T-n of:S-of:P-IN:T-np\np/np china:S-china:P-NN:T-n and:S-and:P-CC:T-s[em]$\(s[em]$)/(s[em]$) that:S-that:P-DT:T-s[em]/s[dcl] china:S-china:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/np a:S-a:P-DT:T-np/n domestic:S-domestic:P-JJ:T-n/n chinese:S-chinese:P-NN:T-n/n issue:S-issue:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the chinese government has always been signifying that taiwan is a province of china and taiwan issue is the internal issue of china . w9:s[dcl]\np/np:VBZ:be w19:n/n:JJ:internal w18:np/n:DT:the w8:n:NN:taiwan w17:s[dcl]\np/np:VBZ:be w16:n:NN:issue w6:s[ng]\np/s[em]:VBG:signify w15:n/n:JJ:taiwan w5:s[pt]\np/(s[ng]\np):VBN:PROG w14:s[dcl]$\(s[dcl]$)/(s[dcl]$):CC:and w4:s\np\(s\np):RB:always w13:n:NN:china w3:s[dcl]\np/(s[pt]\np):VBZ:PERF w12:np\np/np:IN:of w2:n:NN:government w11:n:NN:province w1:n/n:JJ:chinese w10:np/n:DT:a w0:np/n:DT:the w22:n:NN:china w21:np\np/np:IN:of w20:n:NN:issue the:S-the:P-DT:T-np/n chinese:S-chinese:P-JJ:T-n/n government:S-government:P-NN:T-n has:S-have:P-VBZ:T-s[dcl]\np/(s[pt]\np) always:S-always:P-RB:T-s\np\(s\np) been:S-be:P-VBN:T-s[pt]\np/(s[ng]\np) signifying:S-signify:P-VBG:T-s[ng]\np/s[em] that:S-that:P-DT:T-s[em]/s[dcl] taiwan:S-taiwan:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/np a:S-a:P-DT:T-np/n province:S-province:P-NN:T-n of:S-of:P-IN:T-np\np/np china:S-china:P-NN:T-n and:S-and:P-CC:T-s[dcl]$\(s[dcl]$)/(s[dcl]$) taiwan:S-taiwan:P-JJ:T-n/n issue:S-issue:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/np the:S-the:P-DT:T-np/n internal:S-internal:P-JJ:T-n/n issue:S-issue:P-NN:T-n of:S-of:P-IN:T-np\np/np china:S-china:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the government of qinghai province made the best use of the situation , coming up with an idea to commercialize the urban infrastructure construction , and ratified , at the beginning of this year , provisions for xining city to encourage and guide foreign investments . 
w36:s[ng]\np/np:VBG:xining w35:np\np/(s[ng]\np):IN:for w34:n:NNS:provision w33:np\np/np:,:, w32:n:NN:year w31:np/n:DT:this w30:np\np/np:IN:of w29:n:NN:beginning w28:np/n:DT:the w27:s\np\(s\np)/np:IN:at w26:s[dcl]\np:VBD:ratify w25:s[dcl]$\(s[dcl]$)/(s[dcl]$):CC:and w23:n:NN:construction w22:n/n:NN:infrastructure w21:n/n:JJ:urban w20:np/n:DT:the w9:np\np/np:IN:of w8:n:NN:use w7:n/n:JJS:best w6:np/n:DT:the w5:s[dcl]\np/np:VBD:make w4:n:NN:province w3:n/n:NNP:qinghai w19:s[b]\np/np:VB:commercialize w2:np\np/np:IN:of w1:n:NN:government w17:n:NN:idea w0:np/n:DT:the w16:np/n:DT:an w15:pp/np:IN:with w14:s\np\(s\np):RP:up w13:s[ng]\np/pp:VBG:come w11:n:NN:situation w43:n:NNS:investment w10:np/n:DT:the w42:n/n:JJ:foreign w41:s[b]\np/np:VB:guide w40:s[b]$\(s[b]$)/(s[b]$):CC:and w39:s[b]\np/np:VB:encourage w37:n:NN:city the:S-the:P-DT:T-np/n government:S-government:P-NN:T-n of:S-of:P-IN:T-np\np/np qinghai:S-qinghai:P-NNP:T-n/n province:S-province:P-NN:T-n made:S-make:P-VBD:T-s[dcl]\np/np the:S-the:P-DT:T-np/n best:S-best:P-JJS:T-n/n use:S-use:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n situation:S-situation:P-NN:T-n ,:S-,:P-,:T-s\np\(s\np)/punct[,]/(s\np) coming:S-come:P-VBG:T-s[ng]\np/pp up:S-up:P-RP:T-s\np\(s\np) with:S-with:P-IN:T-pp/np an:S-an:P-DT:T-np/n idea:S-idea:P-NN:T-n to:S-to:P-TO:T-s[to]\np/(s[b]\np) commercialize:S-commercialize:P-VB:T-s[b]\np/np the:S-the:P-DT:T-np/n urban:S-urban:P-JJ:T-n/n infrastructure:S-infrastructure:P-NN:T-n/n construction:S-construction:P-NN:T-n ,:S-,:P-,:T-punct[,] and:S-and:P-CC:T-s[dcl]$\(s[dcl]$)/(s[dcl]$) ratified:S-ratify:P-VBD:T-s[dcl]\np ,:S-,:P-,:T-punct[,] at:S-at:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n beginning:S-beginning:P-NN:T-n of:S-of:P-IN:T-np\np/np this:S-this:P-DT:T-np/n year:S-year:P-NN:T-n ,:S-,:P-,:T-np\np/np provisions:S-provision:P-NNS:T-n for:S-for:P-IN:T-np\np/(s[ng]\np) xining:S-xining:P-VBG:T-s[ng]\np/np city:S-city:P-NN:T-n to:S-to:P-TO:T-s[to]\np/(s[b]\np) encourage:S-encourage:P-VB:T-s[b]\np/np and:S-and:P-CC:T-s[b]$\(s[b]$)/(s[b]$) guide:S-guide:P-VB:T-s[b]\np/np foreign:S-foreign:P-JJ:T-n/n investments:S-investment:P-NNS:T-n .:S-.:P-.:T-sent\s[dcl] the government of qinghai province put forward the idea of commercialization of city infrastructure , and approved some regulations on encouraging and attracting foreign investment in xining city early this year . 
w30:n:NN:year w19:np\np/(s[ng]\np):IN:on w9:np\np/np:IN:of w18:n:NNS:regulation w8:n:NN:idea w17:np/n:DT:some w7:np/n:DT:the w16:s[dcl]\np/np:VBD:approve w6:s\np\(s\np):RP:forward w15:s[dcl]$\(s[dcl]$)\punct[,]/(s[dcl]$):CC:and w5:s[dcl]\np/np:VBD:put w4:n:NN:province w13:n:NN:infrastructure w3:n/n:NNP:qinghai w12:n/n:NN:city w2:np\np/np:IN:of w1:n:NN:government w11:np\np/np:IN:of w10:n:NN:commercialization w0:np/n:DT:the w29:s\np\(s\np)/n:DT:this w28:s\np\(s\np)/(s\np\(s\np)):RB:early w27:n:NN:city w26:s[ng]\np/np:VBG:xining w25:np\np/(s[ng]\np):IN:in w24:n:NN:investment w23:n/n:JJ:foreign w22:s[ng]\np/np:VBG:attract w21:s[ng]$\(s[ng]$)/(s[ng]$):CC:and w20:s[ng]\np/np:VBG:encourage the:S-the:P-DT:T-np/n government:S-government:P-NN:T-n of:S-of:P-IN:T-np\np/np qinghai:S-qinghai:P-NNP:T-n/n province:S-province:P-NN:T-n put:S-put:P-VBD:T-s[dcl]\np/np forward:S-forward:P-RP:T-s\np\(s\np) the:S-the:P-DT:T-np/n idea:S-idea:P-NN:T-n of:S-of:P-IN:T-np\np/np commercialization:S-commercialization:P-NN:T-n of:S-of:P-IN:T-np\np/np city:S-city:P-NN:T-n/n infrastructure:S-infrastructure:P-NN:T-n ,:S-,:P-,:T-punct[,] and:S-and:P-CC:T-s[dcl]$\(s[dcl]$)\punct[,]/(s[dcl]$) approved:S-approve:P-VBD:T-s[dcl]\np/np some:S-some:P-DT:T-np/n regulations:S-regulation:P-NNS:T-n on:S-on:P-IN:T-np\np/(s[ng]\np) encouraging:S-encourage:P-VBG:T-s[ng]\np/np and:S-and:P-CC:T-s[ng]$\(s[ng]$)/(s[ng]$) attracting:S-attract:P-VBG:T-s[ng]\np/np foreign:S-foreign:P-JJ:T-n/n investment:S-investment:P-NN:T-n in:S-in:P-IN:T-np\np/(s[ng]\np) xining:S-xining:P-VBG:T-s[ng]\np/np city:S-city:P-NN:T-n early:S-early:P-RB:T-s\np\(s\np)/(s\np\(s\np)) this:S-this:P-DT:T-s\np\(s\np)/n year:S-year:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] relevant sections of the henan government released their foreign cooperation projects concerning economy and technology . representatives attending the meeting consulted on information of some projects they interested in. w19:s[dcl]\np/np/pp:VBD:consulted w9:n/n:NN:cooperation w18:n:NN:meeting w8:n/n:JJ:foreign w17:np/n:DT:the w7:np/n:PRP$:their w6:s[dcl]\np/np:VBD:release w16:s[ng]\np/np:VBG:attend w15:n:NNS:representative w5:n:NN:government w14:n:NN:technology w4:n/n:JJ:henan w13:n\n/n:CC:and w3:np/n:DT:the w12:n:NN:economy w2:np\np/np:IN:of w11:n/n:VBG:concern w1:n:NNS:section w10:n/n:NNS:project w0:n/n:JJ:relevant w27:s[adj]\np\(s[adj]\np):NN:in. w26:s[adj]\np:JJ:interested w25:np:PRP:they w24:n:NNS:project w23:np/n:DT:some w22:np\np/np:IN:of w21:n:NN:information w20:pp/np:IN:on relevant:S-relevant:P-JJ:T-n/n sections:S-section:P-NNS:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n henan:S-henan:P-JJ:T-n/n government:S-government:P-NN:T-n released:S-release:P-VBD:T-s[dcl]\np/np their:S-their:P-PRP$:T-np/n foreign:S-foreign:P-JJ:T-n/n cooperation:S-cooperation:P-NN:T-n/n projects:S-project:P-NNS:T-n/n concerning:S-concern:P-VBG:T-n/n economy:S-economy:P-NN:T-n and:S-and:P-CC:T-n\n/n technology:S-technology:P-NN:T-n .:S-.:P-.:T-punct[.] 
representatives:S-representative:P-NNS:T-n attending:S-attend:P-VBG:T-s[ng]\np/np the:S-the:P-DT:T-np/n meeting:S-meeting:P-NN:T-n consulted:S-consulted:P-VBD:T-s[dcl]\np/np/pp on:S-on:P-IN:T-pp/np information:S-information:P-NN:T-n of:S-of:P-IN:T-np\np/np some:S-some:P-DT:T-np/n projects:S-project:P-NNS:T-n they:S-they:P-PRP:T-np interested:S-interested:P-JJ:T-s[adj]\np in.:S-in.:P-NN:T-s[adj]\np\(s[adj]\np) the competent agencies of the henan provincial government released [ a list of ] foreign economic technological cooperation projects of the province at the meeting and held discussions regarding the purpose of cooperation in the related projects with representatives at the meeting . w36:n:NNS:project w35:n/n:JJ:related w34:np/n:DT:the w33:np\np/np:IN:in w32:n:NN:cooperation w31:np\np/np:IN:of w30:n:NN:purpose w29:np/n:DT:the w28:s[ng]\np/np:VBG:regard w27:n:NNS:discussion w26:s[dcl]\np/np:VBD:hold w25:s[dcl]$\(s[dcl]$)/(s[dcl]$):CC:and w24:n:NN:meeting w23:np/n:DT:the w22:s\np\(s\np)/np:IN:at w21:n:NN:province w20:np/n:DT:the w9:s\np\(s\np)/np:IN:lsb w8:s[dcl]\np/np:VBD:release w7:n:NN:government w6:n/n:JJ:provincial w5:n/n:JJ:henan w4:np/n:DT:the w3:np\np/np:IN:of w19:np\np/np:IN:of w2:n:NNS:agency w18:n:NNS:project w1:n/n:JJ:competent w17:n/n:NN:cooperation w0:np/n:DT:the w16:n/n:JJ:technological w15:n/n:JJ:economic w14:n/n:JJ:foreign w13:n/n:JJ:rsb w12:np\np/np:IN:of w11:n:NN:list w10:np/n:DT:a w41:n:NN:meeting w40:np/n:DT:the w39:s\np\(s\np)/np:IN:at w38:n:NNS:representative w37:np\np/np:IN:with the:S-the:P-DT:T-np/n competent:S-competent:P-JJ:T-n/n agencies:S-agency:P-NNS:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n henan:S-henan:P-JJ:T-n/n provincial:S-provincial:P-JJ:T-n/n government:S-government:P-NN:T-n released:S-release:P-VBD:T-s[dcl]\np/np [:S-[:P-IN:T-s\np\(s\np)/np a:S-a:P-DT:T-np/n list:S-list:P-NN:T-n of:S-of:P-IN:T-np\np/np ]:S-]:P-JJ:T-n/n foreign:S-foreign:P-JJ:T-n/n economic:S-economic:P-JJ:T-n/n technological:S-technological:P-JJ:T-n/n cooperation:S-cooperation:P-NN:T-n/n projects:S-project:P-NNS:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n province:S-province:P-NN:T-n at:S-at:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n meeting:S-meeting:P-NN:T-n and:S-and:P-CC:T-s[dcl]$\(s[dcl]$)/(s[dcl]$) held:S-hold:P-VBD:T-s[dcl]\np/np discussions:S-discussion:P-NNS:T-n regarding:S-regard:P-VBG:T-s[ng]\np/np the:S-the:P-DT:T-np/n purpose:S-purpose:P-NN:T-n of:S-of:P-IN:T-np\np/np cooperation:S-cooperation:P-NN:T-n in:S-in:P-IN:T-np\np/np the:S-the:P-DT:T-np/n related:S-related:P-JJ:T-n/n projects:S-project:P-NNS:T-n with:S-with:P-IN:T-np\np/np representatives:S-representative:P-NNS:T-n at:S-at:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n meeting:S-meeting:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] foreign minister duma expressed his thoughts that the un was born after world war ii , and that france and other permanent member countries were all the victors of that war . 
w30:n:NN:war w9:s[dcl]\np/(s[pss]\np):VBD:PASS w19:n\(n/n)/n:CC:and w8:n:NN:un w18:n/n:NN:france w7:np/n:DT:the h1:n\(n/n)/n:CC:has-rel w16:s[em]$\(s[em]$)\punct[,]/(s[em]$):CC:and w5:n/s[em]:NNS:thought w14:n:NNS:ii w4:np/n:PRP$:his w3:s[dcl]\np/np:VBD:express w13:n/n:NN:war w12:n/n:NN:world w2:n:NN:duma w11:s\np\(s\np)/np:IN:after w1:n/n:NN:minister w10:s[pss]\np:VBN:bear w0:n/n:JJ:foreign w29:np/n:DT:that w28:np\np/np:IN:of w27:n:NNS:victors w26:np/n:DT:the w25:np/np:PDT:all w24:s[dcl]\np/np:VBD:be w23:n:NNS:country w22:n/n:NN:member w21:n/n:JJ:permanent w20:n/n:JJ:other foreign:S-foreign:P-JJ:T-n/n minister:S-minister:P-NN:T-n/n duma:S-duma:P-NN:T-n expressed:S-express:P-VBD:T-s[dcl]\np/np his:S-his:P-PRP$:T-np/n thoughts:S-thought:P-NNS:T-n/s[em] that:S-that:P-IN:T-s[em]/s[dcl] the:S-the:P-DT:T-np/n un:S-un:P-NN:T-n was:S-be:P-VBD:T-s[dcl]\np/(s[pss]\np) born:S-bear:P-VBN:T-s[pss]\np after:S-after:P-IN:T-s\np\(s\np)/np world:S-world:P-NN:T-n/n war:S-war:P-NN:T-n/n ii:S-ii:P-NNS:T-n ,:S-,:P-,:T-punct[,] and:S-and:P-CC:T-s[em]$\(s[em]$)\punct[,]/(s[em]$) that:S-that:P-DT:T-s[em]/s[dcl] france:S-france:P-NN:T-n/n and:S-and:P-CC:T-n\(n/n)/n other:S-other:P-JJ:T-n/n permanent:S-permanent:P-JJ:T-n/n member:S-member:P-NN:T-n/n countries:S-country:P-NNS:T-n were:S-be:P-VBD:T-s[dcl]\np/np all:S-all:P-PDT:T-np/np the:S-the:P-DT:T-np/n victors:S-victors:P-NNS:T-n of:S-of:P-IN:T-np\np/np that:S-that:P-DT:T-np/n war:S-war:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the foreign minister feels that the united nations was born out of the second world war , and that france and the other permanent members of the council were victors in that war . w32:n:NN:war w31:np/n:DT:that w30:s\np\(s\np)/np:IN:in w9:s[pss]\np:VBN:bear w19:n:NN:france w8:s[dcl]\np/(s[pss]\np):VBD:PASS w17:s[em]$\(s[em]$)\punct[,]/(s[em]$):CC:and w7:n:NNS:nation w6:n/n:JJ:united w15:n:NN:war w5:np/n:DT:the w14:n/n:NN:world w3:s[dcl]\np/s[em]:VBZ:feel w13:n/n:JJ:second w2:n:NN:minister w12:np/n:DT:the w11:pp/np:IN:of w1:n/n:JJ:foreign w10:s\np\(s\np)/pp:IN:out w0:np/n:DT:the w29:n:NNS:victors w28:s[dcl]\np/np:VBD:be w27:n:NN:council w26:np/n:DT:the w25:np\np/np:IN:of w24:n:NNS:member w23:n/n:JJ:permanent w22:n/n:JJ:other w21:np/n:DT:the w20:np\np/np:CC:and the:S-the:P-DT:T-np/n foreign:S-foreign:P-JJ:T-n/n minister:S-minister:P-NN:T-n feels:S-feel:P-VBZ:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] the:S-the:P-DT:T-np/n united:S-united:P-JJ:T-n/n nations:S-nation:P-NNS:T-n was:S-be:P-VBD:T-s[dcl]\np/(s[pss]\np) born:S-bear:P-VBN:T-s[pss]\np out:S-out:P-IN:T-s\np\(s\np)/pp of:S-of:P-IN:T-pp/np the:S-the:P-DT:T-np/n second:S-second:P-JJ:T-n/n world:S-world:P-NN:T-n/n war:S-war:P-NN:T-n ,:S-,:P-,:T-punct[,] and:S-and:P-CC:T-s[em]$\(s[em]$)\punct[,]/(s[em]$) that:S-that:P-DT:T-s[em]/s[dcl] france:S-france:P-NN:T-n and:S-and:P-CC:T-np\np/np the:S-the:P-DT:T-np/n other:S-other:P-JJ:T-n/n permanent:S-permanent:P-JJ:T-n/n members:S-member:P-NNS:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n council:S-council:P-NN:T-n were:S-be:P-VBD:T-s[dcl]\np/np victors:S-victors:P-NNS:T-n in:S-in:P-IN:T-s\np\(s\np)/np that:S-that:P-DT:T-np/n war:S-war:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] moore said the australian military attache to jakarta will conduct free investigations after he arrives in east timor tomorrow . 
w9:s[b]\np/np:VB:conduct w18:s\np\(s\np):NN:tomorrow w8:s[dcl]\np/(s[b]\np):MD:will w17:n:NN:timor w7:n:NNP:jakarta w16:n/n:JJ:east w6:np\np/np:TO:to w15:s\np\(s\np)/np:IN:in w5:n:NN:attache w14:s[dcl]\np:VBZ:arrive w4:n/n:JJ:military w13:np:PRP:he w3:n/n:JJ:australian w12:s\np\(s\np)/s[dcl]:IN:after w2:np/n:DT:the w11:n:NNS:investigation w1:s[dcl]\np/s[dcl]:VBD:say w10:n/n:JJ:free w0:n:NN:moore moore:S-moore:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\np/s[dcl] the:S-the:P-DT:T-np/n australian:S-australian:P-JJ:T-n/n military:S-military:P-JJ:T-n/n attache:S-attache:P-NN:T-n to:S-to:P-TO:T-np\np/np jakarta:S-jakarta:P-NNP:T-n will:S-will:P-MD:T-s[dcl]\np/(s[b]\np) conduct:S-conduct:P-VB:T-s[b]\np/np free:S-free:P-JJ:T-n/n investigations:S-investigation:P-NNS:T-n after:S-after:P-IN:T-s\np\(s\np)/s[dcl] he:S-he:P-PRP:T-np arrives:S-arrive:P-VBZ:T-s[dcl]\np in:S-in:P-IN:T-s\np\(s\np)/np east:S-east:P-JJ:T-n/n timor:S-timor:P-NN:T-n tomorrow:S-tomorrow:P-NN:T-s\np\(s\np) .:S-.:P-.:T-sent\s[dcl] moore said , after the australian military attache to jakarta arrives in east timor tomorrow , he can conduct investigations freely in the area . w19:n:NNS:investigation w9:n:NN:jakarta w18:s[b]\np/np:VB:conduct w8:np\np/np:TO:to w7:n:NN:attache w17:s[dcl]\np/(s[b]\np):MD:can w16:np:PRP:he w6:n/n:JJ:military w5:n/n:JJ:australian w14:s\np\(s\np):NN:tomorrow w4:np/n:DT:the w13:n:NN:timor w3:s/s/s[dcl]:IN:after w12:n/n:JJ:east w11:s\np\(s\np)/np:IN:in w1:s[dcl]\np/s[dcl]/punct[,]:VBD:say w10:s[dcl]\np:VBZ:arrive w0:n:NN:moore w23:n:NN:area w22:np/n:DT:the w21:s\np\(s\np)/np:IN:in w20:s\np\(s\np):RB:freely moore:S-moore:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\np/s[dcl]/punct[,] ,:S-,:P-,:T-punct[,] after:S-after:P-IN:T-s/s/s[dcl] the:S-the:P-DT:T-np/n australian:S-australian:P-JJ:T-n/n military:S-military:P-JJ:T-n/n attache:S-attache:P-NN:T-n to:S-to:P-TO:T-np\np/np jakarta:S-jakarta:P-NN:T-n arrives:S-arrive:P-VBZ:T-s[dcl]\np in:S-in:P-IN:T-s\np\(s\np)/np east:S-east:P-JJ:T-n/n timor:S-timor:P-NN:T-n tomorrow:S-tomorrow:P-NN:T-s\np\(s\np) ,:S-,:P-,:T-s/s\(s/s) he:S-he:P-PRP:T-np can:S-can:P-MD:T-s[dcl]\np/(s[b]\np) conduct:S-conduct:P-VB:T-s[b]\np/np investigations:S-investigation:P-NNS:T-n freely:S-freely:P-RB:T-s\np\(s\np) in:S-in:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n area:S-area:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] at that time , jiang zeming indicated that china would consider joining the missile technology control regime . the indication has been taken as the principle result of the meeting between the two state heads . 
w33:n:NNS:head w32:n/n:NN:state w31:n/n:CD:two w30:np/n:DT:the w9:s[dcl]\np/(s[b]\np):MD:would w19:s[dcl]\np/(s[pt]\np):VBZ:PERF w18:n:NN:indication w8:n:NN:china w17:np/n:DT:the w6:s[dcl]\np/s[em]:VBD:indicate w16:n:NN:regime w5:n:VBG:zeming w15:n/n:NN:control w14:n/n:NN:technology w4:n/n:VBG:jiang w13:n/n:NN:missile w2:n:NN:time w12:np/n:DT:the w11:s[ng]\np/np:VBG:join w1:np/n:DT:that w10:s[b]\np/np:VB:consider w0:s/s/np:IN:at w29:np\np/np:IN:between w28:n:NN:meeting w27:np/n:DT:the w26:np\np/np:IN:of w25:n:NN:result w24:n:NN:principle w23:np/n:DT:the w22:pp/np:IN:as w21:s[pss]\np/pp:VBN:take w20:s[pt]\np/(s[pss]\np):VBN:PASS at:S-at:P-IN:T-s/s/np that:S-that:P-DT:T-np/n time:S-time:P-NN:T-n ,:S-,:P-,:T-s/s\(s/s) jiang:S-jiang:P-VBG:T-n/n zeming:S-zeming:P-VBG:T-n indicated:S-indicate:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] china:S-china:P-NN:T-n would:S-would:P-MD:T-s[dcl]\np/(s[b]\np) consider:S-consider:P-VB:T-s[b]\np/np joining:S-join:P-VBG:T-s[ng]\np/np the:S-the:P-DT:T-np/n missile:S-missile:P-NN:T-n/n technology:S-technology:P-NN:T-n/n control:S-control:P-NN:T-n/n regime:S-regime:P-NN:T-n .:S-.:P-.:T-punct[.] the:S-the:P-DT:T-np/n indication:S-indication:P-NN:T-n has:S-have:P-VBZ:T-s[dcl]\np/(s[pt]\np) been:S-be:P-VBN:T-s[pt]\np/(s[pss]\np) taken:S-take:P-VBN:T-s[pss]\np/pp as:S-as:P-IN:T-pp/np the:S-the:P-DT:T-np/n principle:S-principle:P-NN:T-n result:S-result:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n meeting:S-meeting:P-NN:T-n between:S-between:P-IN:T-np\np/np the:S-the:P-DT:T-np/n two:S-two:P-CD:T-n/n state:S-state:P-NN:T-n/n heads:S-head:P-NNS:T-n .:S-.:P-.:T-sent\s[dcl] jiang zemin promised at that time that china would consider joining the missile technology control agreement , which was seen as one of the major achievements in the china-us summit meeting . w30:n:NN:meeting w19:s[pss]\np/pp:VBN:see w9:s[b]\np/(s[ng]\np):VB:consider w18:s[dcl]\np/(s[pss]\np):VBD:PASS w8:s[dcl]\np/(s[b]\np):MD:would w7:n:NN:china w15:n:NN:agreement w5:n:NN:time w14:n/n:NN:control w4:np/n:DT:that w13:n/n:NN:technology w3:s\np\(s\np)/np:IN:at w2:s[dcl]\np/np:VBD:promise w12:n/n:NN:missile w11:np/n:DT:the w1:n:NNP:zemin w10:s[ng]\np/np:VBG:join w0:n/n:NNP:jiang w29:n/n:NN:summit w28:n/n:JJ:china-us w27:np/n:DT:the w26:np\np/np:IN:in w25:n:NNS:achievement w24:n/n:JJ:major w23:np/n:DT:the w22:np\np/np:IN:of w21:n:CD:one w20:pp/np:IN:as jiang:S-jiang:P-NNP:T-n/n zemin:S-zemin:P-NNP:T-n promised:S-promise:P-VBD:T-s[dcl]\np/np at:S-at:P-IN:T-s\np\(s\np)/np that:S-that:P-DT:T-np/n time:S-time:P-NN:T-n that:S-that:P-IN:T-np\np/(s[dcl]/np) china:S-china:P-NN:T-n would:S-would:P-MD:T-s[dcl]\np/(s[b]\np) consider:S-consider:P-VB:T-s[b]\np/(s[ng]\np) joining:S-join:P-VBG:T-s[ng]\np/np the:S-the:P-DT:T-np/n missile:S-missile:P-NN:T-n/n technology:S-technology:P-NN:T-n/n control:S-control:P-NN:T-n/n agreement:S-agreement:P-NN:T-n ,:S-,:P-,:T-punct[,] which:S-which:P-WDT:T-np\np\punct[,]/(s[dcl]\np) was:S-be:P-VBD:T-s[dcl]\np/(s[pss]\np) seen:S-see:P-VBN:T-s[pss]\np/pp as:S-as:P-IN:T-pp/np one:S-one:P-CD:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n major:S-major:P-JJ:T-n/n achievements:S-achievement:P-NNS:T-n in:S-in:P-IN:T-np\np/np the:S-the:P-DT:T-np/n china-us:S-china-us:P-JJ:T-n/n summit:S-summit:P-NN:T-n/n meeting:S-meeting:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] more than 90% of china 's people centralize on one third of its territory . 
w9:n/n:CD:one w8:pp/np:IN:on w7:s[dcl]\np/pp:VBP:centralize w6:n:NNS:people w4:n:NN:china w3:np\np/np:IN:of w2:n:CD:90% w1:n/n\(s[adj]\np):IN:than w0:s[adj]\np:JJR:more w13:n:NN:territory w12:np/n:PRP$:its w11:np\np/np:IN:of w10:n:NN:third more:S-more:P-JJR:T-s[adj]\np than:S-than:P-IN:T-n/n\(s[adj]\np) 90%:S-90%:P-CD:T-n of:S-of:P-IN:T-np\np/np china:S-china:P-NN:T-n 's:S-'s:P-POS:T-np/n\np people:S-people:P-NNS:T-n centralize:S-centralize:P-VBP:T-s[dcl]\np/pp on:S-on:P-IN:T-pp/np one:S-one:P-CD:T-n/n third:S-third:P-NN:T-n of:S-of:P-IN:T-np\np/np its:S-its:P-PRP$:T-np/n territory:S-territory:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] more than 90 percent of the population is compacted into one-third of the country 's territory . w9:pp/np:IN:into w8:s[pss]\np/pp:VBN:compact w7:s[dcl]\np/(s[pss]\np):VBZ:PASS w6:n:NN:population w5:np/n:DT:the w4:np\np/np:IN:of w3:n:NN:percent w2:n/n:CD:90 w1:n/n/(n/n)\(s[adj]\np):IN:than w15:n:NN:territory w0:s[adj]\np:JJR:more w13:n:NN:country w12:np/n:DT:the w11:np\np/np:IN:of w10:n:NN:one-third more:S-more:P-JJR:T-s[adj]\np than:S-than:P-IN:T-n/n/(n/n)\(s[adj]\np) 90:S-90:P-CD:T-n/n percent:S-percent:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n population:S-population:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/(s[pss]\np) compacted:S-compact:P-VBN:T-s[pss]\np/pp into:S-into:P-IN:T-pp/np one-third:S-one-third:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n country:S-country:P-NN:T-n 's:S-'s:P-POS:T-np/n\np territory:S-territory:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] what 's more , next year la nina will be just as bad as el nino . w9:s[b]\np/(s[adj]\np):VB:be w8:s[dcl]\np/(s[b]\np):MD:will w7:n:NN:nina w6:np/n:DT:la w5:s/s:NN:year w4:s/s/(s/s):JJ:next w2:n:JJR:more w1:s[dcl]\np/np:VBZ:be w15:n:NN:nino w14:n/n:JJ:el w13:s[adj]\np\(s[adj]\np)/np:IN:as w12:s[adj]\np:JJ:bad w11:s[adj]\np/(s[adj]\np):RB:as w10:s[adj]\np/(s[adj]\np):RB:just what:S-what:P-WP:T-np/(s[dcl]\np) 's:S-be:P-VBZ:T-s[dcl]\np/np more:S-more:P-JJR:T-n ,:S-,:P-,:T-s/s\np next:S-next:P-JJ:T-s/s/(s/s) year:S-year:P-NN:T-s/s la:S-la:P-DT:T-np/n nina:S-nina:P-NN:T-n will:S-will:P-MD:T-s[dcl]\np/(s[b]\np) be:S-be:P-VB:T-s[b]\np/(s[adj]\np) just:S-just:P-RB:T-s[adj]\np/(s[adj]\np) as:S-as:P-RB:T-s[adj]\np/(s[adj]\np) bad:S-bad:P-JJ:T-s[adj]\np as:S-as:P-IN:T-s[adj]\np\(s[adj]\np)/np el:S-el:P-JJ:T-n/n nino:S-nino:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] and la nina will put on a frightening display next year no less devastating than that of el nino . 
w9:np\np/(np\np):JJ:next w18:n:NN:nino w8:n:NN:display w17:n/n:JJ:el w7:n/n:JJ:frightening w16:np\np/np:IN:of w6:np/n:DT:a w15:np:DT:that w5:pp/np:IN:on w14:s[adj]\np\(s[adj]\np)/np:IN:than w4:s[b]\np/pp:VB:put w13:s[adj]\np:JJ:devastating w3:s[dcl]\np/(s[b]\np):MD:will w12:s[adj]\np/(s[adj]\np):RBR:less w2:n:NN:nina w11:s[adj]\np/(s[adj]\np):RB:no w1:np/n:DT:la w10:np\np:NN:year w0:s/s:CC:and and:S-and:P-CC:T-s/s la:S-la:P-DT:T-np/n nina:S-nina:P-NN:T-n will:S-will:P-MD:T-s[dcl]\np/(s[b]\np) put:S-put:P-VB:T-s[b]\np/pp on:S-on:P-IN:T-pp/np a:S-a:P-DT:T-np/n frightening:S-frightening:P-JJ:T-n/n display:S-display:P-NN:T-n next:S-next:P-JJ:T-np\np/(np\np) year:S-year:P-NN:T-np\np no:S-no:P-RB:T-s[adj]\np/(s[adj]\np) less:S-less:P-RBR:T-s[adj]\np/(s[adj]\np) devastating:S-devastating:P-JJ:T-s[adj]\np than:S-than:P-IN:T-s[adj]\np\(s[adj]\np)/np that:S-that:P-DT:T-np of:S-of:P-IN:T-np\np/np el:S-el:P-JJ:T-n/n nino:S-nino:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] russian prime minister on domestic economy w2:n:NN:minister w1:n/n:JJ:prime w0:n/n:JJ:russian w5:n:NN:economy w4:n/n:JJ:domestic w3:np\np/np:IN:on russian:S-russian:P-JJ:T-n/n prime:S-prime:P-JJ:T-n/n minister:S-minister:P-NN:T-n on:S-on:P-IN:T-np\np/np domestic:S-domestic:P-JJ:T-n/n economy:S-economy:P-NN:T-n russian prime minister talked about russian economic situation . w2:n:NN:minister w1:n/n:JJ:prime w0:n/n:JJ:russian w7:n:NN:situation w6:n/n:JJ:economic w5:n/n:JJ:russian w4:pp/np:IN:about w3:s[dcl]\np/pp:VBD:talk russian:S-russian:P-JJ:T-n/n prime:S-prime:P-JJ:T-n/n minister:S-minister:P-NN:T-n talked:S-talk:P-VBD:T-s[dcl]\np/pp about:S-about:P-IN:T-pp/np russian:S-russian:P-JJ:T-n/n economic:S-economic:P-JJ:T-n/n situation:S-situation:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] nowadays , there appeared a lot of enterprise groups with high technology and strong stamina . w9:np\np/np:IN:with w8:n:NNS:group w7:n/n:NN:enterprise w6:s\np\(s\np)\(s\np\(s\np))/np:IN:of w5:n:NN:lot w4:s\np\(s\np)/n:DT:a w3:s[dcl]\np:VBD:appear w0:s/s:RB:nowadays w14:n:NN:stamina w13:n/n:JJ:strong w12:n\n/n:CC:and w11:n:NN:technology w10:n/n:JJ:high nowadays:S-nowadays:P-RB:T-s/s ,:S-,:P-,:T-s/s\(s/s) there:S-there:P-EX:T-np[thr] appeared:S-appear:P-VBD:T-s[dcl]\np a:S-a:P-DT:T-s\np\(s\np)/n lot:S-lot:P-NN:T-n of:S-of:P-IN:T-s\np\(s\np)\(s\np\(s\np))/np enterprise:S-enterprise:P-NN:T-n/n groups:S-group:P-NNS:T-n with:S-with:P-IN:T-np\np/np high:S-high:P-JJ:T-n/n technology:S-technology:P-NN:T-n and:S-and:P-CC:T-n\n/n strong:S-strong:P-JJ:T-n/n stamina:S-stamina:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] nowadays , there appeared a lot of enterprise groups with high technology and strong stamina . w9:np\np/np:IN:with w8:n:NNS:group w7:n/n:NN:enterprise w6:s\np\(s\np)\(s\np\(s\np))/np:IN:of w5:n:NN:lot w4:s\np\(s\np)/n:DT:a w3:s[dcl]\np:VBD:appear w0:s/s:RB:nowadays w14:n:NN:stamina w13:n/n:JJ:strong w12:n\n/n:CC:and w11:n:NN:technology w10:n/n:JJ:high nowadays:S-nowadays:P-RB:T-s/s ,:S-,:P-,:T-s/s\(s/s) there:S-there:P-EX:T-np[thr] appeared:S-appear:P-VBD:T-s[dcl]\np a:S-a:P-DT:T-s\np\(s\np)/n lot:S-lot:P-NN:T-n of:S-of:P-IN:T-s\np\(s\np)\(s\np\(s\np))/np enterprise:S-enterprise:P-NN:T-n/n groups:S-group:P-NNS:T-n with:S-with:P-IN:T-np\np/np high:S-high:P-JJ:T-n/n technology:S-technology:P-NN:T-n and:S-and:P-CC:T-n\n/n strong:S-strong:P-JJ:T-n/n stamina:S-stamina:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the gnp created by the village and township enterprises occupied about 1\/3 of the total gnp in fujian province . 
w9:s[dcl]\np/np:VBD:occupy w8:n:NNS:enterprise w18:n:NN:province w17:n/n:JJ:fujian w7:n/n:NN:township h1:n\(n/n)/n:CC:has-rel w16:np\np/np:IN:in w6:n\(n/n)/n:CC:and w15:n:NN:gnp w5:n/n:NN:village w14:n/n:JJ:total w4:np/n:DT:the w13:np/n:DT:the w3:s\np\(s\np)/np:IN:by w12:np\np/np:IN:of w2:s[pss]\np:VBN:create w11:n:CD:1\/3 w1:n:NN:gnp w10:n/n:IN:about w0:np/n:DT:the the:S-the:P-DT:T-np/n gnp:S-gnp:P-NN:T-n created:S-create:P-VBN:T-s[pss]\np by:S-by:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n village:S-village:P-NN:T-n/n and:S-and:P-CC:T-n\(n/n)/n township:S-township:P-NN:T-n/n enterprises:S-enterprise:P-NNS:T-n occupied:S-occupy:P-VBD:T-s[dcl]\np/np about:S-about:P-IN:T-n/n 1\/3:S-1\/3:P-CD:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n total:S-total:P-JJ:T-n/n gnp:S-gnp:P-NN:T-n in:S-in:P-IN:T-np\np/np fujian:S-fujian:P-JJ:T-n/n province:S-province:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the gross national product of the industry of villages and towns made up 1\/3 of the gnp of fujian province . w9:np\np/np:CC:and w19:n:NN:province w18:n/n:JJ:fujian w8:n:NNS:village w17:np\np/np:IN:of w7:np\np/np:IN:of w16:n:NN:gnp w6:n:NN:industry w15:np/n:DT:the w5:np/n:DT:the w14:np\np/np:IN:of w4:np\np/np:IN:of w13:n:CD:1\/3 w3:n:NN:product w12:s\np\(s\np):RP:up w2:n/n:JJ:national w11:s[dcl]\np/np:VBD:make w1:n/n:JJ:gross w10:n:NNS:town w0:np/n:DT:the the:S-the:P-DT:T-np/n gross:S-gross:P-JJ:T-n/n national:S-national:P-JJ:T-n/n product:S-product:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n industry:S-industry:P-NN:T-n of:S-of:P-IN:T-np\np/np villages:S-village:P-NNS:T-n and:S-and:P-CC:T-np\np/np towns:S-town:P-NNS:T-n made:S-make:P-VBD:T-s[dcl]\np/np up:S-up:P-RP:T-s\np\(s\np) 1\/3:S-1\/3:P-CD:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n gnp:S-gnp:P-NN:T-n of:S-of:P-IN:T-np\np/np fujian:S-fujian:P-JJ:T-n/n province:S-province:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] both parties expressed satisfaction for the friendly cooperation between the two armies in the past 30 years . w9:np/n:DT:the w8:np\np/np:IN:between w7:n:NN:cooperation w6:n/n:JJ:friendly w5:np/n:DT:the w4:np\np/np:IN:for w3:n:NN:satisfaction w2:s[dcl]\np/np:VBD:express w16:n:NNS:year w1:n:NNS:party w15:n/n:CD:30 w0:np/n:DT:both w14:n/n:JJ:past w13:np/n:DT:the w12:np\np/np:IN:in w11:n:NNS:army w10:n/n:CD:two both:S-both:P-DT:T-np/n parties:S-party:P-NNS:T-n expressed:S-express:P-VBD:T-s[dcl]\np/np satisfaction:S-satisfaction:P-NN:T-n for:S-for:P-IN:T-np\np/np the:S-the:P-DT:T-np/n friendly:S-friendly:P-JJ:T-n/n cooperation:S-cooperation:P-NN:T-n between:S-between:P-IN:T-np\np/np the:S-the:P-DT:T-np/n two:S-two:P-CD:T-n/n armies:S-army:P-NNS:T-n in:S-in:P-IN:T-np\np/np the:S-the:P-DT:T-np/n past:S-past:P-JJ:T-n/n 30:S-30:P-CD:T-n/n years:S-year:P-NNS:T-n .:S-.:P-.:T-sent\s[dcl] both parties expressed satisfaction for the friendly cooperation between the two armies in the past 30 years . 
w9:np/n:DT:the w8:np\np/np:IN:between w7:n:NN:cooperation w6:n/n:JJ:friendly w5:np/n:DT:the w4:np\np/np:IN:for w3:n:NN:satisfaction w2:s[dcl]\np/np:VBD:express w16:n:NNS:year w1:n:NNS:party w15:n/n:CD:30 w0:np/n:DT:both w14:n/n:JJ:past w13:np/n:DT:the w12:np\np/np:IN:in w11:n:NNS:army w10:n/n:CD:two both:S-both:P-DT:T-np/n parties:S-party:P-NNS:T-n expressed:S-express:P-VBD:T-s[dcl]\np/np satisfaction:S-satisfaction:P-NN:T-n for:S-for:P-IN:T-np\np/np the:S-the:P-DT:T-np/n friendly:S-friendly:P-JJ:T-n/n cooperation:S-cooperation:P-NN:T-n between:S-between:P-IN:T-np\np/np the:S-the:P-DT:T-np/n two:S-two:P-CD:T-n/n armies:S-army:P-NNS:T-n in:S-in:P-IN:T-np\np/np the:S-the:P-DT:T-np/n past:S-past:P-JJ:T-n/n 30:S-30:P-CD:T-n/n years:S-year:P-NNS:T-n .:S-.:P-.:T-sent\s[dcl] also present at the seminar will be long yongtu , vice minister of china 's ministry of foreign trade and economic cooperation who will accompany rugerro to shanghai , and shen jueren , former chief negotiator of china in gatt negotiations . attendees will also include the vice minister of foreign trade & economic cooperation , yongtu long , who accompanied ruggiero to shanghai , and former chief representative for trade negotiations of the moftec . xinhua news agency , nanjing , 16 december , by zhoufang the investment of foreign capital in agriculture of jiangsu increased . xinhua news agency , nanjing , dec. 16 . ( reporter zhou fang ) an increasing number of investments from foreign capital are being made on agriculture in jiangsu . france ministry of foreign affairs said that leaders of iraq refused to co-operate with united nations and have caused this deeply regretted outcome . w19:np/n:DT:this w9:n:NNS:iraq w18:s[pt]\np/np:VBN:cause w8:np\np/np:IN:of w17:s[dcl]\np/(s[pt]\np):VBP:PERF w7:n:NNS:leader w16:s[dcl]$\(s[dcl]$)/(s[dcl]$):CC:and w6:np/n:DT:that w15:n:NNS:nation w5:s[dcl]\np/s[dcl]:VBD:say w14:n/n:JJ:united w4:n:NNS:affair w13:pp/np:IN:with w3:n/n:JJ:foreign w12:s[b]\np/pp:VB:co-operate w2:np\np/np:IN:of w1:n:NN:ministry w10:s[dcl]\np/(s[to]\np):VBD:refuse w0:n/n:NN:france w22:n:NN:outcome w21:n/n:JJ:regretted w20:n/n/(n/n):RB:deeply france:S-france:P-NN:T-n/n ministry:S-ministry:P-NN:T-n of:S-of:P-IN:T-np\np/np foreign:S-foreign:P-JJ:T-n/n affairs:S-affair:P-NNS:T-n said:S-say:P-VBD:T-s[dcl]\np/s[dcl] that:S-that:P-DT:T-np/n leaders:S-leader:P-NNS:T-n of:S-of:P-IN:T-np\np/np iraq:S-iraq:P-NNS:T-n refused:S-refuse:P-VBD:T-s[dcl]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) co-operate:S-co-operate:P-VB:T-s[b]\np/pp with:S-with:P-IN:T-pp/np united:S-united:P-JJ:T-n/n nations:S-nation:P-NNS:T-n and:S-and:P-CC:T-s[dcl]$\(s[dcl]$)/(s[dcl]$) have:S-have:P-VBP:T-s[dcl]\np/(s[pt]\np) caused:S-cause:P-VBN:T-s[pt]\np/np this:S-this:P-DT:T-np/n deeply:S-deeply:P-RB:T-n/n/(n/n) regretted:S-regretted:P-JJ:T-n/n outcome:S-outcome:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the french foreign ministry said that because iraqi leader refuses to cooperate with the un caused this sad events . 
w9:s[dcl]\np/(s[to]\np):VBZ:refuse w8:n:NN:leader w7:n/n:JJ:iraqi w6:s/s/s[dcl]:IN:because w4:s[dcl]\np/s[em]:VBD:say w18:n:NNS:event w3:n:NN:ministry w17:n/n:JJ:sad w2:n/n:JJ:foreign w16:np/n:DT:this w1:n/n:JJ:french w15:s[dcl]\pp/np:VBD:cause w0:np/n:DT:the w14:n:NN:un w13:np/n:DT:the w12:pp/np:IN:with w11:s[b]\np:VB:cooperate the:S-the:P-DT:T-np/n french:S-french:P-JJ:T-n/n foreign:S-foreign:P-JJ:T-n/n ministry:S-ministry:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] because:S-because:P-IN:T-s/s/s[dcl] iraqi:S-iraqi:P-JJ:T-n/n leader:S-leader:P-NN:T-n refuses:S-refuse:P-VBZ:T-s[dcl]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) cooperate:S-cooperate:P-VB:T-s[b]\np with:S-with:P-IN:T-pp/np the:S-the:P-DT:T-np/n un:S-un:P-NN:T-n caused:S-cause:P-VBD:T-s[dcl]\pp/np this:S-this:P-DT:T-np/n sad:S-sad:P-JJ:T-n/n events:S-event:P-NNS:T-n .:S-.:P-.:T-sent\s[dcl] during the preliminaries , lan wei , a highly-skilled twenty-six year-old native of guangdong , scored 355.35 for first place . w9:n/n:JJ:twenty-six w8:n/n:JJ:highly-skilled w7:np/n:DT:a w5:n:NNS:wei w19:n:NN:place w4:n/n:JJ:lan w18:n/n:JJ:first w2:n:NNS:preliminaries w17:s\np\(s\np)/np:IN:for w16:s\np\(s\np):CD:355.35 w1:np/n:DT:the w15:s[dcl]\np:VBD:score w0:s/s/np:IN:during w13:n:NN:guangdong w12:np\np/np:IN:of w11:n:NN:native w10:n/n:JJ:year-old during:S-during:P-IN:T-s/s/np the:S-the:P-DT:T-np/n preliminaries:S-preliminaries:P-NNS:T-n ,:S-,:P-,:T-s/s\(s/s) lan:S-lan:P-JJ:T-n/n wei:S-wei:P-NNS:T-n ,:S-,:P-,:T-np\np/punct[,]/np a:S-a:P-DT:T-np/n highly-skilled:S-highly-skilled:P-JJ:T-n/n twenty-six:S-twenty-six:P-JJ:T-n/n year-old:S-year-old:P-JJ:T-n/n native:S-native:P-NN:T-n of:S-of:P-IN:T-np\np/np guangdong:S-guangdong:P-NN:T-n ,:S-,:P-,:T-punct[,] scored:S-score:P-VBD:T-s[dcl]\np 355.35:S-355.35:P-CD:T-s\np\(s\np) for:S-for:P-IN:T-s\np\(s\np)/np first:S-first:P-JJ:T-n/n place:S-place:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] in the qualifiers , 26-year-old lan wei from guangdong displayed excellent skill , and ranked no. 1 with a total score of 355.3 . w9:s[pss]\np:VBN:display w19:n/n:JJ:total w18:np/n:DT:a w8:s[ng]\np/(s[pss]\np):VBG:guangdong w17:s\np\(s\np)/np:IN:with w7:np\np/(s[ng]\np):IN:from w16:n[num]:CD:1 w6:n:NNS:wei w15:s\np\(s\np)/n[num]:NNP:no. 
w5:n/n:JJ:lan w14:s[pss]\np:VBD:rank w4:n/n:JJ:26-year-old w13:s[pss]$\(s[pss]$)\punct[,]/(s[pss]$):CC:and w3:np\np/np:,:, w2:n:NNS:qualifiers w11:s\np\(s\np):NN:skill w1:np/n:DT:the w10:s\np\(s\np)/(s\np\(s\np)):JJ:excellent w0:s/s/np:IN:in w22:n:CD:355.3 w21:np\np/np:IN:of w20:n:NN:score in:S-in:P-IN:T-s/s/np the:S-the:P-DT:T-np/n qualifiers:S-qualifiers:P-NNS:T-n ,:S-,:P-,:T-np\np/np 26-year-old:S-26-year-old:P-JJ:T-n/n lan:S-lan:P-JJ:T-n/n wei:S-wei:P-NNS:T-n from:S-from:P-IN:T-np\np/(s[ng]\np) guangdong:S-guangdong:P-VBG:T-s[ng]\np/(s[pss]\np) displayed:S-display:P-VBN:T-s[pss]\np excellent:S-excellent:P-JJ:T-s\np\(s\np)/(s\np\(s\np)) skill:S-skill:P-NN:T-s\np\(s\np) ,:S-,:P-,:T-punct[,] and:S-and:P-CC:T-s[pss]$\(s[pss]$)\punct[,]/(s[pss]$) ranked:S-rank:P-VBD:T-s[pss]\np no.:S-no.:P-NNP:T-s\np\(s\np)/n[num] 1:S-1:P-CD:T-n[num] with:S-with:P-IN:T-s\np\(s\np)/np a:S-a:P-DT:T-np/n total:S-total:P-JJ:T-n/n score:S-score:P-NN:T-n of:S-of:P-IN:T-np\np/np 355.3:S-355.3:P-CD:T-n .:S-.:P-.:T-sent\(s/s) nanjing , december 16 ( xinhua ) foreign fund put into jiangsu 's agricultural sector is increasing w9:s[pss]\np/pp:VBD:put w8:n:NN:fund w7:n/n:JJ:foreign w6:np/n:DT:rrb w5:np\np\(np\np)/np:IN:xinhua w4:np\np:NNS:lrb w3:np\np/(np\np):CD:16 w2:n:NN:december w16:s[ng]\np:VBG:increase w15:s[dcl]\np/(s[ng]\np):VBZ:PROG w0:s/s:NN:nanjing w14:n:NN:sector w13:n/n:JJ:agricultural w11:n:NNP:jiangsu w10:pp/np:IN:into nanjing:S-nanjing:P-NN:T-s/s ,:S-,:P-,:T-s/s\(s/s) december:S-december:P-NN:T-n 16:S-16:P-CD:T-np\np/(np\np) (:S-(:P-NNS:T-np\np xinhua:S-xinhua:P-IN:T-np\np\(np\np)/np ):S-):P-DT:T-np/n foreign:S-foreign:P-JJ:T-n/n fund:S-fund:P-NN:T-n put:S-put:P-VBD:T-s[pss]\np/pp into:S-into:P-IN:T-pp/np jiangsu:S-jiangsu:P-NNP:T-n 's:S-'s:P-POS:T-np/n\np agricultural:S-agricultural:P-JJ:T-n/n sector:S-sector:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/(s[ng]\np) increasing:S-increase:P-VBG:T-s[ng]\np xinhua news agency , nanjing , dec. 16 ( reporter zhou fang ) - foreign investment in jiangsu 's agriculture is on the rise . w19:s[dcl]\np/pp:VBZ:be w9:n/n:NN:reporter w18:n:NN:agriculture w8:n/n:NN:lrb w7:n/n:CD:16 w16:n:NNP:jiangsu w6:s[dcl]\np/np:VBZ:dec. w15:np\np/np:IN:in w14:n:NN:investment w4:n:NN:nanjing w13:n/n:JJ:foreign w12:n:SYM:rrb w2:n:NN:agency w11:n/n:VBG:fang w1:n/n:NN:news w10:n/n:NN:zhou w0:np/n:DT:xinhua w22:n:NN:rise w21:np/n:DT:the w20:pp/np:IN:on xinhua:S-xinhua:P-DT:T-np/n news:S-news:P-NN:T-n/n agency:S-agency:P-NN:T-n ,:S-,:P-,:T-np\np/punct[,]/np nanjing:S-nanjing:P-NN:T-n ,:S-,:P-,:T-punct[,] dec.:S-dec.:P-VBZ:T-s[dcl]\np/np 16:S-16:P-CD:T-n/n (:S-(:P-NN:T-n/n reporter:S-reporter:P-NN:T-n/n zhou:S-zhou:P-NN:T-n/n fang:S-fang:P-VBG:T-n/n ):S-):P-SYM:T-n -:S--:P--:T-punct[-] foreign:S-foreign:P-JJ:T-n/n investment:S-investment:P-NN:T-n in:S-in:P-IN:T-np\np/np jiangsu:S-jiangsu:P-NNP:T-n 's:S-'s:P-POS:T-np/n\np agriculture:S-agriculture:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/pp on:S-on:P-IN:T-pp/np the:S-the:P-DT:T-np/n rise:S-rise:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] someone connected global warming to the phenomenon of el nino . 
w9:n:NN:nino w8:n/n:JJ:el w7:np\np/np:IN:of w6:n:NN:phenomenon w5:np/n:DT:the w4:np\np/np:TO:to w3:n:NN:warming w2:n/n:JJ:global w1:n/n:JJ:connected w0:np/n:DT:someone someone:S-someone:P-DT:T-np/n connected:S-connected:P-JJ:T-n/n global:S-global:P-JJ:T-n/n warming:S-warming:P-NN:T-n to:S-to:P-TO:T-np\np/np the:S-the:P-DT:T-np/n phenomenon:S-phenomenon:P-NN:T-n of:S-of:P-IN:T-np\np/np el:S-el:P-JJ:T-n/n nino:S-nino:P-NN:T-n .:S-.:P-.:T-sent\np some linked global warming with el nino . w2:n/n:JJ:global w1:n/n:VBN:link w0:np/n:DT:some w6:n:NN:nino w5:n/n:JJ:el w4:np\np/np:IN:with w3:n:NN:warming some:S-some:P-DT:T-np/n linked:S-link:P-VBN:T-n/n global:S-global:P-JJ:T-n/n warming:S-warming:P-NN:T-n with:S-with:P-IN:T-np\np/np el:S-el:P-JJ:T-n/n nino:S-nino:P-NN:T-n .:S-.:P-.:T-sent\np xinhua news agency , beijing , january 16th , by xintang xu and yuhong qian- the president of the bank of china , xianglong dai said that , china will continue implementing the financial opening up policy . w34:n:NN:policy w33:np\np/np:IN:up w32:n:NN:opening w31:n/n:JJ:financial w30:np/n:DT:the w9:s[ng]\np/np:VBG:xintang w19:np\np/np:IN:of w18:n:NN:bank w8:s\np\(s\np)/(s[ng]\np):IN:by w17:np/n:DT:the w16:np\np/np:IN:of w6:s\np\(s\np):JJ:16th w15:n:NN:president w5:s\np\(s\np)/(s\np\(s\np)):JJ:january w4:s[ng]\np:VBG:beijing w14:np/n:DT:the w13:s\np\(s\np)/np:IN:qian- w12:n:NN:yuhong w2:n:NN:agency w11:n\n/n:CC:and w1:n/n:NN:news w10:n:NN:xu w0:np/n:DT:xinhua w29:s[ng]\np/np:VBG:implement w28:s[b]\np/(s[ng]\np):VB:continue w27:s[dcl]\np/(s[b]\np):MD:will w26:n:NN:china w24:s[dcl]\np/s[em]:VBD:say w23:s\np/(s\np):NN:dai w22:s\np/(s\np)/(s\np/(s\np)):JJ:xianglong w20:n:NN:china xinhua:S-xinhua:P-DT:T-np/n news:S-news:P-NN:T-n/n agency:S-agency:P-NN:T-n ,:S-,:P-,:T-s\np/(s\np)/punct[,]/(s\np) beijing:S-beijing:P-VBG:T-s[ng]\np ,:S-,:P-,:T-punct[,] january:S-january:P-JJ:T-s\np\(s\np)/(s\np\(s\np)) 16th:S-16th:P-JJ:T-s\np\(s\np) ,:S-,:P-,:T-s\np\(s\np)/(s\np\(s\np)) by:S-by:P-IN:T-s\np\(s\np)/(s[ng]\np) xintang:S-xintang:P-VBG:T-s[ng]\np/np xu:S-xu:P-NN:T-n and:S-and:P-CC:T-n\n/n yuhong:S-yuhong:P-NN:T-n qian-:S-qian-:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n president:S-president:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n bank:S-bank:P-NN:T-n of:S-of:P-IN:T-np\np/np china:S-china:P-NN:T-n ,:S-,:P-,:T-punct[,] xianglong:S-xianglong:P-JJ:T-s\np/(s\np)/(s\np/(s\np)) dai:S-dai:P-NN:T-s\np/(s\np) said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] ,:S-,:P-,:T-punct[,] china:S-china:P-NN:T-n will:S-will:P-MD:T-s[dcl]\np/(s[b]\np) continue:S-continue:P-VB:T-s[b]\np/(s[ng]\np) implementing:S-implement:P-VBG:T-s[ng]\np/np the:S-the:P-DT:T-np/n financial:S-financial:P-JJ:T-n/n opening:S-opening:P-NN:T-n up:S-up:P-IN:T-np\np/np policy:S-policy:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] xinhua news agency , beijing , january 16th ( reporter : xu xingtang , qian yuhong ) . dai xianglong , the president of people 's bank of china , said that china will carry on with the open financial policy . 
w37:n/n:JJ:open w36:np/n:DT:the w35:s\np\(s\np)/np:IN:with w34:s\np\(s\np):RP:on w33:s[b]\np:VB:carry w32:s[dcl]\np/(s[b]\np):MD:will w31:n:NN:china w9:n:NN:reporter w18:n:NNP:xianglong w8:n/n:NN:lrb w17:n/n:NNP:dai w7:n/n:JJ:16th w16:n:NN:rrb w6:n/n:JJ:january w15:n/n:JJ:yuhong w14:n/n:JJ:qian w4:s[ng]\np/np:VBG:beijing w13:np\np/np:,:, w2:n:NN:agency w12:n:NNP:xingtang w11:n/n:NNP:xu w1:n/n:NN:news w10:np\np/np:IN:| w0:np/n:DT:xinhua w29:s[dcl]\np/s[em]:VBD:say w27:n:NN:china w26:np\np/np:IN:of w25:n:NN:bank w23:n:NNS:people w22:np\np/np:IN:of w21:n:NN:president w20:np/n:DT:the w39:n:NN:policy w38:n/n:JJ:financial xinhua:S-xinhua:P-DT:T-np/n news:S-news:P-NN:T-n/n agency:S-agency:P-NN:T-n ,:S-,:P-,:T-s[dcl]\np/(s[dcl]\np) beijing:S-beijing:P-VBG:T-s[ng]\np/np ,:S-,:P-,:T-s\np\(s\np)/np january:S-january:P-JJ:T-n/n 16th:S-16th:P-JJ:T-n/n (:S-(:P-NN:T-n/n reporter:S-reporter:P-NN:T-n &#58;:S-&#58;:P-IN:T-np\np/np xu:S-xu:P-NNP:T-n/n xingtang:S-xingtang:P-NNP:T-n ,:S-,:P-,:T-np\np/np qian:S-qian:P-JJ:T-n/n yuhong:S-yuhong:P-JJ:T-n/n ):S-):P-NN:T-n .:S-.:P-.:T-punct[.] dai:S-dai:P-NNP:T-n/n xianglong:S-xianglong:P-NNP:T-n ,:S-,:P-,:T-np\np/punct[,]/np the:S-the:P-DT:T-np/n president:S-president:P-NN:T-n of:S-of:P-IN:T-np\np/np people:S-people:P-NNS:T-n 's:S-'s:P-POS:T-np/n\np bank:S-bank:P-NN:T-n of:S-of:P-IN:T-np\np/np china:S-china:P-NN:T-n ,:S-,:P-,:T-punct[,] said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-DT:T-s[em]/s[dcl] china:S-china:P-NN:T-n will:S-will:P-MD:T-s[dcl]\np/(s[b]\np) carry:S-carry:P-VB:T-s[b]\np on:S-on:P-RP:T-s\np\(s\np) with:S-with:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n open:S-open:P-JJ:T-n/n financial:S-financial:P-JJ:T-n/n policy:S-policy:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] this kind of measure restrained the economic growth , leading to the rise of unemployment . w9:s[ng]\np:VBG:lead w7:n:NN:growth w6:n/n:JJ:economic w5:np/n:DT:the w4:s[dcl]\np/np:VBD:restrain w3:n:NN:measure w2:np\np/np:IN:of w1:n:NN:kind w0:np/n:DT:this w14:n:NN:unemployment w13:np\np/np:IN:of w12:n:NN:rise w11:np/n:DT:the w10:s\np\(s\np)/np:TO:to this:S-this:P-DT:T-np/n kind:S-kind:P-NN:T-n of:S-of:P-IN:T-np\np/np measure:S-measure:P-NN:T-n restrained:S-restrain:P-VBD:T-s[dcl]\np/np the:S-the:P-DT:T-np/n economic:S-economic:P-JJ:T-n/n growth:S-growth:P-NN:T-n ,:S-,:P-,:T-s\np\(s\np)/(s\np) leading:S-lead:P-VBG:T-s[ng]\np to:S-to:P-TO:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n rise:S-rise:P-NN:T-n of:S-of:P-IN:T-np\np/np unemployment:S-unemployment:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] such measures checked economic growth and caused unemployment to rise . w9:s[b]\np:VB:rise w7:n:NN:unemployment w6:s[dcl]\np/(s[to]\np)/np:VBD:cause w5:s[dcl]$\(s[dcl]$)/(s[dcl]$):CC:and w4:n:NN:growth w3:n/n:JJ:economic w2:s[dcl]\np/np:VBD:check w1:n:NNS:measure w0:np/np:JJ:such such:S-such:P-JJ:T-np/np measures:S-measure:P-NNS:T-n checked:S-check:P-VBD:T-s[dcl]\np/np economic:S-economic:P-JJ:T-n/n growth:S-growth:P-NN:T-n and:S-and:P-CC:T-s[dcl]$\(s[dcl]$)/(s[dcl]$) caused:S-cause:P-VBD:T-s[dcl]\np/(s[to]\np)/np unemployment:S-unemployment:P-NN:T-n to:S-to:P-TO:T-s[to]\np/(s[b]\np) rise:S-rise:P-VB:T-s[b]\np .:S-.:P-.:T-sent\s[dcl] siemens germany is currently cooperating with a local thai company , and constructing another 23km long electric railway project in bangkok . 
w19:s\np\(s\np)/np:IN:in w9:n:NN:company w18:n:NN:project w8:n/n:NN:thai w17:n/n:NN:railway w7:n/n:JJ:local w16:n/n:JJ:electric w6:np/n:DT:a w15:n/n:JJ:long w5:pp/np:IN:with w14:n/n:JJ:23km w4:s[ng]\np/pp:VBG:cooperate w13:np/n:DT:another w3:s\np\(s\np):RB:currently w12:s[ng]\np/np:VBG:construct w2:s[dcl]\np/(s[ng]\np):VBZ:PROG w11:s[ng]$\(s[ng]$)\punct[,]/(s[ng]$):CC:and w1:n:NN:germany w0:n/n:NNS:siemens w20:n:NN:bangkok siemens:S-siemens:P-NNS:T-n/n germany:S-germany:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/(s[ng]\np) currently:S-currently:P-RB:T-s\np\(s\np) cooperating:S-cooperate:P-VBG:T-s[ng]\np/pp with:S-with:P-IN:T-pp/np a:S-a:P-DT:T-np/n local:S-local:P-JJ:T-n/n thai:S-thai:P-NN:T-n/n company:S-company:P-NN:T-n ,:S-,:P-,:T-punct[,] and:S-and:P-CC:T-s[ng]$\(s[ng]$)\punct[,]/(s[ng]$) constructing:S-construct:P-VBG:T-s[ng]\np/np another:S-another:P-DT:T-np/n 23km:S-23km:P-JJ:T-n/n long:S-long:P-JJ:T-n/n electric:S-electric:P-JJ:T-n/n railway:S-railway:P-NN:T-n/n project:S-project:P-NN:T-n in:S-in:P-IN:T-s\np\(s\np)/np bangkok:S-bangkok:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the german shermans company is currently working together with a local company constructing a 23 kilometer electronic train engineering project . w19:n:NN:project w9:np/n:DT:a w18:n/n:NN:engineering w8:s\np\(s\np)/np:IN:with w17:n/n:NN:train w7:s\np\(s\np):RB:together w6:s[ng]\np:VBG:work w16:n/n:JJ:electronic w15:n/n:NN:kilometer w5:s\np\(s\np):RB:currently w4:s[dcl]\np/(s[ng]\np):VBZ:PROG w14:n/n:CD:23 w13:np/n:DT:a w3:n:NN:company w12:s[ng]\np/np:VBG:construct w2:n/n:NNS:shermans w11:n:NN:company w1:n/n:NN:german w10:n/n:JJ:local w0:np/n:DT:the the:S-the:P-DT:T-np/n german:S-german:P-NN:T-n/n shermans:S-shermans:P-NNS:T-n/n company:S-company:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/(s[ng]\np) currently:S-currently:P-RB:T-s\np\(s\np) working:S-work:P-VBG:T-s[ng]\np together:S-together:P-RB:T-s\np\(s\np) with:S-with:P-IN:T-s\np\(s\np)/np a:S-a:P-DT:T-np/n local:S-local:P-JJ:T-n/n company:S-company:P-NN:T-n constructing:S-construct:P-VBG:T-s[ng]\np/np a:S-a:P-DT:T-np/n 23:S-23:P-CD:T-n/n kilometer:S-kilometer:P-NN:T-n/n electronic:S-electronic:P-JJ:T-n/n train:S-train:P-NN:T-n/n engineering:S-engineering:P-NN:T-n/n project:S-project:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] he said that the same resolve would lead to the success of the kyoto protocol . w9:np/n:DT:the w8:pp/np:TO:to w7:s[b]\np/pp:VB:lead w6:s[dcl]\np/(s[b]\np):MD:would w5:n:NN:resolve w4:n/n:JJ:same w3:np/n:DT:the w1:s[dcl]\np/s[em]:VBD:say w0:np:PRP:he w14:n:NN:protocol w13:n/n:NN:kyoto w12:np/n:DT:the w11:np\np/np:IN:of w10:n:NN:success he:S-he:P-PRP:T-np said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] the:S-the:P-DT:T-np/n same:S-same:P-JJ:T-n/n resolve:S-resolve:P-NN:T-n would:S-would:P-MD:T-s[dcl]\np/(s[b]\np) lead:S-lead:P-VB:T-s[b]\np/pp to:S-to:P-TO:T-pp/np the:S-the:P-DT:T-np/n success:S-success:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n kyoto:S-kyoto:P-NN:T-n/n protocol:S-protocol:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] he said that the same resolve would lead to the success of the kyoto protocol . 
w9:np/n:DT:the w8:pp/np:TO:to w7:s[b]\np/pp:VB:lead w6:s[dcl]\np/(s[b]\np):MD:would w5:n:NN:resolve w4:n/n:JJ:same w3:np/n:DT:the w1:s[dcl]\np/s[em]:VBD:say w0:np:PRP:he w14:n:NN:protocol w13:n/n:NN:kyoto w12:np/n:DT:the w11:np\np/np:IN:of w10:n:NN:success he:S-he:P-PRP:T-np said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] the:S-the:P-DT:T-np/n same:S-same:P-JJ:T-n/n resolve:S-resolve:P-NN:T-n would:S-would:P-MD:T-s[dcl]\np/(s[b]\np) lead:S-lead:P-VB:T-s[b]\np/pp to:S-to:P-TO:T-pp/np the:S-the:P-DT:T-np/n success:S-success:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n kyoto:S-kyoto:P-NN:T-n/n protocol:S-protocol:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] however , the official who disclosed the above declined to elaborate what kind of missile technology is being exported and when china provided such technology to pakistan and iran . w19:np\np\(np\np)/(np\np):CC:and w18:s[pss]\np:VBN:export w8:s[dcl]\np/(s[to]\np):VBD:decline w17:s[ng]\np/(s[pss]\np):VBG:PASS w7:n:NN:above w16:s[dcl]\np/(s[ng]\np):VBZ:PROG w6:np/n:DT:the w15:n:NN:technology w5:s[dcl]\np/np:VBD:disclose w14:n/n:NN:missile w13:np\np/np:IN:of w3:n:NN:official w12:n:NN:kind w2:np/n:DT:the w11:np/n:WP:what w10:s[b]\np:VB:elaborate w0:s/s:RB:however w28:s[b]\np:VB:iran w27:s[b]$\(s[b]$)/(s[b]$):CC:and w26:s[b]\np:VB:pakistan w24:n:NN:technology w23:np/np:JJ:such w22:s[dcl]\np/np:VBD:provide w21:n:NN:china w20:np\np/s[dcl]:WRB:when however:S-however:P-RB:T-s/s ,:S-,:P-,:T-s/s\(s/s) the:S-the:P-DT:T-np/n official:S-official:P-NN:T-n who:S-who:P-WP:T-np\np/(s[dcl]\np) disclosed:S-disclose:P-VBD:T-s[dcl]\np/np the:S-the:P-DT:T-np/n above:S-above:P-NN:T-n declined:S-decline:P-VBD:T-s[dcl]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) elaborate:S-elaborate:P-VB:T-s[b]\np what:S-what:P-WP:T-np/n kind:S-kind:P-NN:T-n of:S-of:P-IN:T-np\np/np missile:S-missile:P-NN:T-n/n technology:S-technology:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/(s[ng]\np) being:S-be:P-VBG:T-s[ng]\np/(s[pss]\np) exported:S-export:P-VBN:T-s[pss]\np and:S-and:P-CC:T-np\np\(np\np)/(np\np) when:S-when:P-WRB:T-np\np/s[dcl] china:S-china:P-NN:T-n provided:S-provide:P-VBD:T-s[dcl]\np/np such:S-such:P-JJ:T-np/np technology:S-technology:P-NN:T-n to:S-to:P-TO:T-s[to]\np/(s[b]\np) pakistan:S-pakistan:P-VB:T-s[b]\np and:S-and:P-CC:T-s[b]$\(s[b]$)/(s[b]$) iran:S-iran:P-VB:T-s[b]\np .:S-.:P-.:T-sent\(np\np) however , the official was not willing to explain what kind of missile technology that china provided to pakistan and iran , and when this technology was provided . 
w19:n:NN:iran w9:n:NN:kind w18:n\n/n:CC:and w8:np/n:WP:what w17:n:NN:pakistan w7:s[b]\np/np:VB:explain w16:pp/np:TO:to w15:s[pss]\np/pp:VBN:provide w5:s[adj]\np/(s[to]\np):JJ:willing w14:s[dcl]\np/(s[pss]\np):VBZ:china w4:s\np\(s\np):RB:not w3:s[dcl]\np/(s[adj]\np):VBD:be w12:n:NN:technology w2:n:NN:official w11:n/n:NN:missile w1:np/n:DT:the w10:np\np/np:IN:of w0:s/s:RB:however w26:s[pss]\np:VBN:provide w25:s[dcl]\np/(s[pss]\np):VBD:PASS w24:n:NN:technology w23:np/n:DT:this w22:s/s/s[dcl]:WRB:when w21:s$\(s$)/(s$):CC:and however:S-however:P-RB:T-s/s ,:S-,:P-,:T-punct[,] the:S-the:P-DT:T-np/n official:S-official:P-NN:T-n was:S-be:P-VBD:T-s[dcl]\np/(s[adj]\np) not:S-not:P-RB:T-s\np\(s\np) willing:S-willing:P-JJ:T-s[adj]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) explain:S-explain:P-VB:T-s[b]\np/np what:S-what:P-WP:T-np/n kind:S-kind:P-NN:T-n of:S-of:P-IN:T-np\np/np missile:S-missile:P-NN:T-n/n technology:S-technology:P-NN:T-n that:S-that:P-WDT:T-np\np/(s[dcl]\np) china:S-china:P-VBZ:T-s[dcl]\np/(s[pss]\np) provided:S-provide:P-VBN:T-s[pss]\np/pp to:S-to:P-TO:T-pp/np pakistan:S-pakistan:P-NN:T-n and:S-and:P-CC:T-n\n/n iran:S-iran:P-NN:T-n ,:S-,:P-,:T-s/s\np and:S-and:P-CC:T-s$\(s$)/(s$) when:S-when:P-WRB:T-s/s/s[dcl] this:S-this:P-DT:T-np/n technology:S-technology:P-NN:T-n was:S-be:P-VBD:T-s[dcl]\np/(s[pss]\np) provided:S-provide:P-VBN:T-s[pss]\np .:S-.:P-.:T-sent\(s/s) yunnan gardens was developed by fujian company , there are 313 apartments in this residential project . w9:n/n:CD:313 w8:s[dcl]\np[thr]/np:VBP:be w6:n:NN:company w5:n/n:JJ:fujian w4:s\np\(s\np)/np:IN:by w3:s[pss]\np:VBN:develop w2:s[dcl]\np/(s[pss]\np):VBD:PASS w1:n:NNS:garden w0:n/n:JJ:yunnan w14:n:NN:project w13:n/n:JJ:residential w12:np/n:DT:this w11:np\np/np:IN:in w10:n:NNS:apartment yunnan:S-yunnan:P-JJ:T-n/n gardens:S-garden:P-NNS:T-n was:S-be:P-VBD:T-s[dcl]\np/(s[pss]\np) developed:S-develop:P-VBN:T-s[pss]\np by:S-by:P-IN:T-s\np\(s\np)/np fujian:S-fujian:P-JJ:T-n/n company:S-company:P-NN:T-n ,:S-,:P-,:T-punct[,] there:S-there:P-EX:T-np[thr] are:S-be:P-VBP:T-s[dcl]\np[thr]/np 313:S-313:P-CD:T-n/n apartments:S-apartment:P-NNS:T-n in:S-in:P-IN:T-np\np/np this:S-this:P-DT:T-np/n residential:S-residential:P-JJ:T-n/n project:S-project:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the yunnan gardens project developed by the fujian society is a 313-unit housing project with land . w9:s[dcl]\np/np:VBZ:be w8:n:NN:society w7:n/n:JJ:fujian w6:np/n:DT:the w5:s\np\(s\np)/np:IN:by w4:s[pss]\np:VBN:develop w3:n:NN:project w2:n/n:NNS:garden w1:n/n:JJ:yunnan w15:n:NN:land w0:np/n:DT:the w14:np\np/np:IN:with w13:n:NN:project w12:n/n:NN:housing w11:n/n:JJ:313-unit w10:np/n:DT:a the:S-the:P-DT:T-np/n yunnan:S-yunnan:P-JJ:T-n/n gardens:S-garden:P-NNS:T-n/n project:S-project:P-NN:T-n developed:S-develop:P-VBN:T-s[pss]\np by:S-by:P-IN:T-s\np\(s\np)/np the:S-the:P-DT:T-np/n fujian:S-fujian:P-JJ:T-n/n society:S-society:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/np a:S-a:P-DT:T-np/n 313-unit:S-313-unit:P-JJ:T-n/n housing:S-housing:P-NN:T-n/n project:S-project:P-NN:T-n with:S-with:P-IN:T-np\np/np land:S-land:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] in the last two years , some foreign businessmen showed their interest in the construction of xiling city . 
w9:s[dcl]\np/np:VBD:show w8:n:NNS:businessman w7:n/n:JJ:foreign w6:np/n:DT:some w4:n:NNS:year w3:n/n:CD:two w17:n:NN:city w2:n/n:JJ:last w16:s[ng]\np/np:VBG:xiling w1:np/n:DT:the w15:np\np/(s[ng]\np):IN:of w0:s/s/np:IN:in w14:n:NN:construction w13:np/n:DT:the w12:np\np/np:IN:in w11:n:NN:interest w10:np/n:PRP$:their in:S-in:P-IN:T-s/s/np the:S-the:P-DT:T-np/n last:S-last:P-JJ:T-n/n two:S-two:P-CD:T-n/n years:S-year:P-NNS:T-n ,:S-,:P-,:T-s/s\(s/s) some:S-some:P-DT:T-np/n foreign:S-foreign:P-JJ:T-n/n businessmen:S-businessman:P-NNS:T-n showed:S-show:P-VBD:T-s[dcl]\np/np their:S-their:P-PRP$:T-np/n interest:S-interest:P-NN:T-n in:S-in:P-IN:T-np\np/np the:S-the:P-DT:T-np/n construction:S-construction:P-NN:T-n of:S-of:P-IN:T-np\np/(s[ng]\np) xiling:S-xiling:P-VBG:T-s[ng]\np/np city:S-city:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] during the past two years , a batch of foreign businessmen expressed their wishes to get involved in xining 's city construction one after another . w9:n/n:JJ:foreign w18:n:NN:xining w8:np\np/np:IN:of w17:pp/np:IN:in w7:n:NN:batch w16:s[adj]\np/pp:VBN:involve w6:np/n:DT:a w15:s[b]\np/(s[adj]\np):VB:get w4:n:NNS:year w13:n:NNS:wish w3:n/n:CD:two w12:np/n:PRP$:their w2:n/n:JJ:past w11:s[dcl]\np/np:VBD:express w1:np/n:DT:the w10:n:NNS:businessman w0:s/s/np:IN:during w24:np:DT:another w23:np\np/np:IN:after w22:n:NN:one w21:n/n:NN:construction w20:n/n:NN:city during:S-during:P-IN:T-s/s/np the:S-the:P-DT:T-np/n past:S-past:P-JJ:T-n/n two:S-two:P-CD:T-n/n years:S-year:P-NNS:T-n ,:S-,:P-,:T-s/s\(s/s) a:S-a:P-DT:T-np/n batch:S-batch:P-NN:T-n of:S-of:P-IN:T-np\np/np foreign:S-foreign:P-JJ:T-n/n businessmen:S-businessman:P-NNS:T-n expressed:S-express:P-VBD:T-s[dcl]\np/np their:S-their:P-PRP$:T-np/n wishes:S-wish:P-NNS:T-n to:S-to:P-TO:T-s[to]\np/(s[b]\np) get:S-get:P-VB:T-s[b]\np/(s[adj]\np) involved:S-involve:P-VBN:T-s[adj]\np/pp in:S-in:P-IN:T-pp/np xining:S-xining:P-NN:T-n 's:S-'s:P-POS:T-np/n\np city:S-city:P-NN:T-n/n construction:S-construction:P-NN:T-n/n one:S-one:P-NN:T-n after:S-after:P-IN:T-np\np/np another:S-another:P-DT:T-np .:S-.:P-.:T-sent\s[dcl] the dai yu xiang industrial consulting company said that spacious front door area is the biggest characteristic of yunnan gardens with land ownership . w19:n:NNS:garden w9:n/n:JJ:spacious w18:n/n:JJ:yunnan w7:s[dcl]\np/s[em]:VBD:say w17:np\np/np:IN:of w16:n:NN:characteristic w6:n:NN:company w15:n/n:JJS:biggest w5:n/n:NN:consulting w14:np/n:DT:the w4:n/n:JJ:industrial w13:s[dcl]\np/np:VBZ:be w3:n/n:VBG:xiang w12:n:NN:area w2:n/n:NN:yu w11:n/n:NN:door w1:n/n:JJ:dai w10:n/n:JJ:front w0:np/n:DT:the w22:n:NN:ownership w21:n/n:NN:land w20:np\np/np:IN:with the:S-the:P-DT:T-np/n dai:S-dai:P-JJ:T-n/n yu:S-yu:P-NN:T-n/n xiang:S-xiang:P-VBG:T-n/n industrial:S-industrial:P-JJ:T-n/n consulting:S-consulting:P-NN:T-n/n company:S-company:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] spacious:S-spacious:P-JJ:T-n/n front:S-front:P-JJ:T-n/n door:S-door:P-NN:T-n/n area:S-area:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/np the:S-the:P-DT:T-np/n biggest:S-biggest:P-JJS:T-n/n characteristic:S-characteristic:P-NN:T-n of:S-of:P-IN:T-np\np/np yunnan:S-yunnan:P-JJ:T-n/n gardens:S-garden:P-NNS:T-n with:S-with:P-IN:T-np\np/np land:S-land:P-NN:T-n/n ownership:S-ownership:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] dai yuxiang real estate consulting firm said that the most special characteristic of the landed residences at yunnan gardens is their spacious appearance . 
w19:s[dcl]\np/np:VBZ:be w9:n/n/(n/n):RBS:most w18:n:NNS:garden w8:np/n:DT:the w17:n/n:JJ:yunnan w6:s[dcl]\np/s[em]:VBD:say w16:np\np/np:IN:at w15:n:NNS:residence w5:n:NN:firm w14:n/n:VBN:land w4:n/n:NN:consulting w13:np/n:DT:the w3:n/n:NN:estate w12:np\np/np:IN:of w2:n/n:JJ:real w11:n:NN:characteristic w1:n/n:NNP:yuxiang w10:n/n:JJ:special w0:n/n:NNP:dai w22:n:NN:appearance w21:n/n:JJ:spacious w20:np/n:PRP$:their dai:S-dai:P-NNP:T-n/n yuxiang:S-yuxiang:P-NNP:T-n/n real:S-real:P-JJ:T-n/n estate:S-estate:P-NN:T-n/n consulting:S-consulting:P-NN:T-n/n firm:S-firm:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] the:S-the:P-DT:T-np/n most:S-most:P-RBS:T-n/n/(n/n) special:S-special:P-JJ:T-n/n characteristic:S-characteristic:P-NN:T-n of:S-of:P-IN:T-np\np/np the:S-the:P-DT:T-np/n landed:S-land:P-VBN:T-n/n residences:S-residence:P-NNS:T-n at:S-at:P-IN:T-np\np/np yunnan:S-yunnan:P-JJ:T-n/n gardens:S-garden:P-NNS:T-n is:S-be:P-VBZ:T-s[dcl]\np/np their:S-their:P-PRP$:T-np/n spacious:S-spacious:P-JJ:T-n/n appearance:S-appearance:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] this measure restrained economic growth and raised the rate of unemployment . w9:np\np/np:IN:of w8:n:NN:rate w7:np/n:DT:the w6:s[dcl]\np/np:VBD:raise w5:s[dcl]$\(s[dcl]$)/(s[dcl]$):CC:and w4:n:NN:growth w3:n/n:JJ:economic w2:s[dcl]\np/np:VBD:restrain w1:n:NN:measure w0:np/n:DT:this w10:n:NN:unemployment this:S-this:P-DT:T-np/n measure:S-measure:P-NN:T-n restrained:S-restrain:P-VBD:T-s[dcl]\np/np economic:S-economic:P-JJ:T-n/n growth:S-growth:P-NN:T-n and:S-and:P-CC:T-s[dcl]$\(s[dcl]$)/(s[dcl]$) raised:S-raise:P-VBD:T-s[dcl]\np/np the:S-the:P-DT:T-np/n rate:S-rate:P-NN:T-n of:S-of:P-IN:T-np\np/np unemployment:S-unemployment:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] this curbs economic growth and increases unemployment rates . according to xinhua news agency , london , on february 3rd , bbc reporter bowen stayed in baghdad . the capital of iraq stated that there was no evidence to indicate the bomb-shelter destroyed by a us bomber was a military target . w36:s[dcl]\np/np:VBD:be w35:s\np/(s\np):NN:bomber w34:np:PRP:us w33:np:DT:a w32:s\np\(s\np)/np:IN:by w31:s[pss]\np:VBN:destroy w30:n:NN:bomb-shelter w29:np/n:DT:the w28:s[b]\np/np:VB:indicate w26:n:NN:evidence w25:np/n:DT:no w24:s[dcl]\np[thr]/np:VBD:be w21:s[dcl]\np/s[em]:VBD:state w20:n:NNP:iraq h1:n\(n/n)/n:,:has-rel w9:n/n:JJ:3rd w8:n/n:JJ:february w7:s\np/(s\np)/np:IN:on w5:n:NN:london w4:n:NN:agency w3:n/n:NN:news w19:np\np/np:IN:of w2:n/n:NNP:xinhua w18:n:NN:capital w1:pp/np:TO:to w17:np/n:DT:the w0:s/s/pp:VBG:accord w16:n:NN:baghdad w15:pp/np:IN:in w14:s[dcl]\np/pp:VBD:stay w13:n:NN:bowen w12:n/n:NN:reporter w11:n/n:JJ:bbc w10:n\(n/n)/n:,:, w39:n:NN:target w38:n/n:JJ:military w37:np/n:DT:a according:S-accord:P-VBG:T-s/s/pp to:S-to:P-TO:T-pp/np xinhua:S-xinhua:P-NNP:T-n/n news:S-news:P-NN:T-n/n agency:S-agency:P-NN:T-n ,:S-,:P-,:T-punct[,] london:S-london:P-NN:T-n ,:S-,:P-,:T-s[dcl]\np/(s[dcl]\np) on:S-on:P-IN:T-s\np/(s\np)/np february:S-february:P-JJ:T-n/n 3rd:S-3rd:P-JJ:T-n/n ,:S-,:P-,:T-n\(n/n)/n bbc:S-bbc:P-JJ:T-n/n reporter:S-reporter:P-NN:T-n/n bowen:S-bowen:P-NN:T-n stayed:S-stay:P-VBD:T-s[dcl]\np/pp in:S-in:P-IN:T-pp/np baghdad:S-baghdad:P-NN:T-n .:S-.:P-.:T-punct[.] 
the:S-the:P-DT:T-np/n capital:S-capital:P-NN:T-n of:S-of:P-IN:T-np\np/np iraq:S-iraq:P-NNP:T-n stated:S-state:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-IN:T-s[em]/s[dcl] there:S-there:P-EX:T-np[thr] was:S-be:P-VBD:T-s[dcl]\np[thr]/np no:S-no:P-DT:T-np/n evidence:S-evidence:P-NN:T-n to:S-to:P-TO:T-s[to]\np/(s[b]\np) indicate:S-indicate:P-VB:T-s[b]\np/np the:S-the:P-DT:T-np/n bomb-shelter:S-bomb-shelter:P-NN:T-n destroyed:S-destroy:P-VBN:T-s[pss]\np by:S-by:P-IN:T-s\np\(s\np)/np a:S-a:P-DT:T-np us:S-us:P-PRP:T-np bomber:S-bomber:P-NN:T-s\np/(s\np) was:S-be:P-VBD:T-s[dcl]\np/np a:S-a:P-DT:T-np/n military:S-military:P-JJ:T-n/n target:S-target:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] according to xinhua news report from london on february 13 , bonne , a reporter of bbc in iraqi capital baghdad , confirmed on 13th that no any sign showed the bomb shelter in baghdad destroyed by american bomber was a military blindage . w36:s\np\(s\np)/np:IN:by w35:s[pss]\np:VBN:destroy w34:n:NN:baghdad w33:np\np/np:IN:in w32:n:NN:shelter w31:n/n:NN:bomb w30:np/n:DT:the w29:s[dcl]\np/s[dcl]:VBD:show w28:n:NN:sign w27:n/n:DT:any w26:np/n:DT:no w24:n:NN:13th w23:s\np\(s\np)/np:IN:on w22:s[dcl]\np/np:VBD:confirm w20:n:NN:baghdad w9:n:CD:13 w8:n/n:JJ:february w7:np\np/np:IN:on w6:n:NN:london w5:np\np/np:IN:from w4:n:NN:report w3:n/n:NN:news w19:n/n:NN:capital w2:np/n:DT:xinhua w18:n/n:JJ:iraqi w1:pp/np:TO:to w17:np\np/np:IN:in w0:s/s/pp:VBG:accord w16:n:NN:bbc w15:np\np/np:IN:of w14:n:NN:reporter w13:np/n:DT:a w11:n:NN:bonne w42:n:NN:blindage w41:n/n:JJ:military w40:np/n:DT:a w39:s[dcl]\np/np:VBD:be w38:n:NN:bomber w37:n/n:JJ:american according:S-accord:P-VBG:T-s/s/pp to:S-to:P-TO:T-pp/np xinhua:S-xinhua:P-DT:T-np/n news:S-news:P-NN:T-n/n report:S-report:P-NN:T-n from:S-from:P-IN:T-np\np/np london:S-london:P-NN:T-n on:S-on:P-IN:T-np\np/np february:S-february:P-JJ:T-n/n 13:S-13:P-CD:T-n ,:S-,:P-,:T-np\np/punct[,]/np bonne:S-bonne:P-NN:T-n ,:S-,:P-,:T-punct[,] a:S-a:P-DT:T-np/n reporter:S-reporter:P-NN:T-n of:S-of:P-IN:T-np\np/np bbc:S-bbc:P-NN:T-n in:S-in:P-IN:T-np\np/np iraqi:S-iraqi:P-JJ:T-n/n capital:S-capital:P-NN:T-n/n baghdad:S-baghdad:P-NN:T-n ,:S-,:P-,:T-s[dcl]\np/(s[dcl]\np) confirmed:S-confirm:P-VBD:T-s[dcl]\np/np on:S-on:P-IN:T-s\np\(s\np)/np 13th:S-13th:P-NN:T-n that:S-that:P-IN:T-np\np/(s[dcl]/np) no:S-no:P-DT:T-np/n any:S-any:P-DT:T-n/n sign:S-sign:P-NN:T-n showed:S-show:P-VBD:T-s[dcl]\np/s[dcl] the:S-the:P-DT:T-np/n bomb:S-bomb:P-NN:T-n/n shelter:S-shelter:P-NN:T-n in:S-in:P-IN:T-np\np/np baghdad:S-baghdad:P-NN:T-n destroyed:S-destroy:P-VBN:T-s[pss]\np by:S-by:P-IN:T-s\np\(s\np)/np american:S-american:P-JJ:T-n/n bomber:S-bomber:P-NN:T-n was:S-be:P-VBD:T-s[dcl]\np/np a:S-a:P-DT:T-np/n military:S-military:P-JJ:T-n/n blindage:S-blindage:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] mr. siazon said , someone used to say , have n't we been burned once before ? when we were ready to accept them , it did n't take long before they fought agaion . w32:s\np\(s\np):NN:agaion w31:s[dcl]\np:VBD:fight w30:np:PRP:they w19:s[adj]\np/(s[to]\np):JJ:ready w9:s[dcl]\np/np:VBP:have w18:s[dcl]\np/(s[adj]\np):VBD:be w17:np:PRP:we w7:s[b]\np:VB:say w16:s/s/s[dcl]:WRB:when w15:s\np\(s\np):RB:before w5:s[pss]\np/(s[to]\np):VBD:use w14:s\np\(s\np)/(s\np\(s\np)):RB:once w4:np:DT:someone w13:s[pss]\np:VBN:burn w12:s[dcl]\np/(s[pss]\np):VBD:been w2:s[dcl]\np/s[dcl]/punct[,]:VBD:say w11:np:PRP:we w1:n:NN:siazon w10:s\np\(s\np):RB:n't w0:np/n:DT:mr. 
w29:s\np\(s\np)/s[dcl]:IN:before w28:s\np\(s\np):RB:long w27:s[b]\np:VB:take w26:s\np\(s\np):RB:n't w25:s[dcl]\np/(s[b]\np):VBD:do w24:np:PRP:it w22:np:PRP:them w21:s[b]\np/np:VB:accept mr.:S-mr.:P-DT:T-np/n siazon:S-siazon:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\np/s[dcl]/punct[,] ,:S-,:P-,:T-punct[,] someone:S-someone:P-DT:T-np used:S-use:P-VBD:T-s[pss]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) say:S-say:P-VB:T-s[b]\np ,:S-,:P-,:T-s[pss]\np\(s[pss]\np) have:S-have:P-VBP:T-s[dcl]\np/np n't:S-n't:P-RB:T-s\np\(s\np) we:S-we:P-PRP:T-np been:S-been:P-VBD:T-s[dcl]\np/(s[pss]\np) burned:S-burn:P-VBN:T-s[pss]\np once:S-once:P-RB:T-s\np\(s\np)/(s\np\(s\np)) before:S-before:P-RB:T-s\np\(s\np) ?:S-?:P-?:T-punct[?] when:S-when:P-WRB:T-s/s/s[dcl] we:S-we:P-PRP:T-np were:S-be:P-VBD:T-s[dcl]\np/(s[adj]\np) ready:S-ready:P-JJ:T-s[adj]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) accept:S-accept:P-VB:T-s[b]\np/np them:S-them:P-PRP:T-np ,:S-,:P-,:T-s/s\(s/s) it:S-it:P-PRP:T-np did:S-do:P-VBD:T-s[dcl]\np/(s[b]\np) n't:S-n't:P-RB:T-s\np\(s\np) take:S-take:P-VB:T-s[b]\np long:S-long:P-RB:T-s\np\(s\np) before:S-before:P-IN:T-s\np\(s\np)/s[dcl] they:S-they:P-PRP:T-np fought:S-fight:P-VBD:T-s[dcl]\np agaion:S-agaion:P-NN:T-s\np\(s\np) .:S-.:P-.:T-sent\s[dcl] siazon said : someone says , were we deceived before ? when we prepared to accept them but just after several days they began to fight again . w19:n/n:JJ:several w9:s\np\(s\np):IN:before w18:s/s/np:IN:after w8:s[dcl]\np:VBD:deceive w17:s/s/(s/s):RB:just w7:np:PRP:we w16:s$\(s$)/(s$):CC:but w6:s[dcl]\np/np:VBD:be w15:np:PRP:them w5:s[dcl]$\(s[dcl]$)/(s[dcl]$):,:, w14:s[b]\np/np:VB:accept w4:s[dcl]\np:VBZ:say w3:np:DT:someone w12:s[dcl]\np/(s[to]\np):VBD:prepare w2:s\s/s[dcl]:IN:| w11:np:PRP:we w1:s[dcl]\np/s[dcl]:VBD:say w10:s/s/s[dcl]:WRB:when w0:n:NN:siazon w25:s\np\(s\np):RB:again w24:s[b]\np:VB:fight w22:s[dcl]\np/(s[to]\np):VBD:begin w21:np:PRP:they w20:n:NNS:day siazon:S-siazon:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\np/s[dcl] &#58;:S-&#58;:P-IN:T-s\s/s[dcl] someone:S-someone:P-DT:T-np says:S-say:P-VBZ:T-s[dcl]\np ,:S-,:P-,:T-s[dcl]$\(s[dcl]$)/(s[dcl]$) were:S-be:P-VBD:T-s[dcl]\np/np we:S-we:P-PRP:T-np deceived:S-deceive:P-VBD:T-s[dcl]\np before:S-before:P-IN:T-s\np\(s\np) ?:S-?:P-?:T-punct[?] when:S-when:P-WRB:T-s/s/s[dcl] we:S-we:P-PRP:T-np prepared:S-prepare:P-VBD:T-s[dcl]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) accept:S-accept:P-VB:T-s[b]\np/np them:S-them:P-PRP:T-np but:S-but:P-CC:T-s$\(s$)/(s$) just:S-just:P-RB:T-s/s/(s/s) after:S-after:P-IN:T-s/s/np several:S-several:P-JJ:T-n/n days:S-day:P-NNS:T-n they:S-they:P-PRP:T-np began:S-begin:P-VBD:T-s[dcl]\np/(s[to]\np) to:S-to:P-TO:T-s[to]\np/(s[b]\np) fight:S-fight:P-VB:T-s[b]\np again:S-again:P-RB:T-s\np\(s\np) .:S-.:P-.:T-sent\s[dcl] railroad officials blamed the bus passengers for the accident , the indian news agency india press trust report said . 
w18:s[dcl]\s[dcl]\np:VBD:say w8:n:NN:accident w17:n:NN:report w7:np/n:DT:the w16:n/n:NN:trust w6:pp/np:IN:for w15:n/n:NN:press w5:n:NNS:passenger w14:n/n:NN:india w4:n/n:NN:bus w13:n/n:NN:agency w3:np/n:DT:the w12:n/n:NN:news w2:s[dcl]\np/pp/np:VBD:blame w11:n/n:JJ:indian w1:n:NNS:official w10:np/n:DT:the w0:n/n:NN:railroad railroad:S-railroad:P-NN:T-n/n officials:S-official:P-NNS:T-n blamed:S-blame:P-VBD:T-s[dcl]\np/pp/np the:S-the:P-DT:T-np/n bus:S-bus:P-NN:T-n/n passengers:S-passenger:P-NNS:T-n for:S-for:P-IN:T-pp/np the:S-the:P-DT:T-np/n accident:S-accident:P-NN:T-n ,:S-,:P-,:T-s\s/(s\s) the:S-the:P-DT:T-np/n indian:S-indian:P-JJ:T-n/n news:S-news:P-NN:T-n/n agency:S-agency:P-NN:T-n/n india:S-india:P-NN:T-n/n press:S-press:P-NN:T-n/n trust:S-trust:P-NN:T-n/n report:S-report:P-NN:T-n said:S-say:P-VBD:T-s[dcl]\s[dcl]\np .:S-.:P-.:T-sent\s[dcl] report of pti said that railway official charged the bus passengers upon the accident . w9:n/n:NN:bus w8:np/n:DT:the w7:s[dcl]\np/pp/np:VBD:charge w6:n:NN:official w5:n/n:NN:railway w3:s[dcl]\np/s[em]:VBD:say w2:n:NNS:pti w1:np\np/np:IN:of w0:n:NN:report w13:n:NN:accident w12:np/n:DT:the w11:pp/np:IN:upon w10:n:NNS:passenger report:S-report:P-NN:T-n of:S-of:P-IN:T-np\np/np pti:S-pti:P-NNS:T-n said:S-say:P-VBD:T-s[dcl]\np/s[em] that:S-that:P-DT:T-s[em]/s[dcl] railway:S-railway:P-NN:T-n/n official:S-official:P-NN:T-n charged:S-charge:P-VBD:T-s[dcl]\np/pp/np the:S-the:P-DT:T-np/n bus:S-bus:P-NN:T-n/n passengers:S-passenger:P-NNS:T-n upon:S-upon:P-IN:T-pp/np the:S-the:P-DT:T-np/n accident:S-accident:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] we mentioned the problems australia is concerned with , such as human rights and dispatching army officers to east timor . w9:np\np/np:IN:as w18:n:NN:timor w8:np\np/(np\np):JJ:such w7:pp/np:IN:with w17:n/n:JJ:east w16:np\np/np:TO:to w6:s[pss]\np/pp:VBN:concern w15:n:NNS:officer w5:s[dcl]\np/(s[pss]\np):VBZ:PASS w14:n/n:NN:army w4:n:NN:australia w13:n/n:VBG:dispatch w3:n/n:NNS:problem w12:np\np/np:CC:and w2:np/n:DT:the w11:n:NNS:rights w1:s[dcl]\np/np:VBD:mention w10:n/n:JJ:human w0:np:PRP:we we:S-we:P-PRP:T-np mentioned:S-mention:P-VBD:T-s[dcl]\np/np the:S-the:P-DT:T-np/n problems:S-problem:P-NNS:T-n/n australia:S-australia:P-NN:T-n is:S-be:P-VBZ:T-s[dcl]\np/(s[pss]\np) concerned:S-concern:P-VBN:T-s[pss]\np/pp with:S-with:P-IN:T-pp/np ,:S-,:P-,:T-punct[,] such:S-such:P-JJ:T-np\np/(np\np) as:S-as:P-IN:T-np\np/np human:S-human:P-JJ:T-n/n rights:S-rights:P-NNS:T-n and:S-and:P-CC:T-np\np/np dispatching:S-dispatch:P-VBG:T-n/n army:S-army:P-NN:T-n/n officers:S-officer:P-NNS:T-n to:S-to:P-TO:T-np\np/np east:S-east:P-JJ:T-n/n timor:S-timor:P-NN:T-n .:S-.:P-.:T-sent\(np\np) we mentioned the problems that have attracted australia 's attention , such as the humam rights in east timor and the issue of sending military officers to east timor . 
w19:np\np/np:CC:and w9:n:NN:attention w18:n:NN:timor w17:n/n:JJ:east w7:n:NN:australia w6:s[pt]\np/np:VBN:attract w16:np\np/np:IN:in w15:n:NNS:rights w5:s[dcl]\np/(s[pt]\np):VBP:PERF w14:n/n:NN:humam w3:n:NNS:problem w13:np/n:DT:the w12:np\np/np:IN:as w2:np/n:DT:the w11:np\np/(np\np):JJ:such w1:s[dcl]\np/np:VBD:mention w0:np:PRP:we w28:n:NN:timor w27:n/n:JJ:east w26:s\np\(s\np)/np:TO:to w25:n:NNS:officer w24:n/n:JJ:military w23:s[ng]\np/np:VBG:send w22:np\np/(s[ng]\np):IN:of w21:n:NN:issue w20:np/n:DT:the we:S-we:P-PRP:T-np mentioned:S-mention:P-VBD:T-s[dcl]\np/np the:S-the:P-DT:T-np/n problems:S-problem:P-NNS:T-n that:S-that:P-WDT:T-np\np/(s[dcl]\np) have:S-have:P-VBP:T-s[dcl]\np/(s[pt]\np) attracted:S-attract:P-VBN:T-s[pt]\np/np australia:S-australia:P-NN:T-n 's:S-'s:P-POS:T-np/n\np attention:S-attention:P-NN:T-n ,:S-,:P-,:T-s[dcl]\np\(s[dcl]\np) such:S-such:P-JJ:T-np\np/(np\np) as:S-as:P-IN:T-np\np/np the:S-the:P-DT:T-np/n humam:S-humam:P-NN:T-n/n rights:S-rights:P-NNS:T-n in:S-in:P-IN:T-np\np/np east:S-east:P-JJ:T-n/n timor:S-timor:P-NN:T-n and:S-and:P-CC:T-np\np/np the:S-the:P-DT:T-np/n issue:S-issue:P-NN:T-n of:S-of:P-IN:T-np\np/(s[ng]\np) sending:S-send:P-VBG:T-s[ng]\np/np military:S-military:P-JJ:T-n/n officers:S-officer:P-NNS:T-n to:S-to:P-TO:T-s\np\(s\np)/np east:S-east:P-JJ:T-n/n timor:S-timor:P-NN:T-n .:S-.:P-.:T-sent\s[dcl] the two prime ministers requested that un secretary-general prolong the un representative 's stay in cambodia to 6 months or longer . w19:np\np\(np\np)/(np\np):CC:or w9:np/n:DT:the w18:n:NNS:month w8:s[ng]\np/np:VBG:prolong w7:n:NN:secretary-general w17:n/n:CD:6 w16:np\np/np:TO:to w6:n/n:JJ:un w15:n:NN:cambodia w5:np/n:DT:that w14:np\np/np:IN:in w4:s[dcl]\np/np:VBD:request w13:n:NN:stay w3:n:NNS:minister w2:n/n:JJ:prime w11:n:NN:representative w1:n/n:CD:two w10:n/n:JJ:un w0:np/n:DT:the w20:np\np:JJR:longer the:S-the:P-DT:T-np/n two:S-two:P-CD:T-n/n prime:S-prime:P-JJ:T-n/n ministers:S-minister:P-NNS:T-n requested:S-request:P-VBD:T-s[dcl]\np/np that:S-that:P-DT:T-np/n un:S-un:P-JJ:T-n/n secretary-general:S-secretary-general:P-NN:T-n prolong:S-prolong:P-VBG:T-s[ng]\np/np the:S-the:P-DT:T-np/n un:S-un:P-JJ:T-n/n representative:S-representative:P-NN:T-n 's:S-'s:P-POS:T-np/n\np stay:S-stay:P-NN:T-n in:S-in:P-IN:T-np\np/np cambodia:S-cambodia:P-NN:T-n to:S-to:P-TO:T-np\np/np 6:S-6:P-CD:T-n/n months:S-month:P-NNS:T-n or:S-or:P-CC:T-np\np\(np\np)/(np\np) longer:S-longer:P-JJR:T-np\np .:S-.:P-.:T-sent\s[dcl] the two prime ministers of cambodia asked the un secretary general to extend the un representative 's term of stay in cambodia to six months or longer . 
w19:n:NN:stay w9:n:NN:secretary w18:np\np/np:IN:of w8:n/n:NN:un w17:n:NN:term w7:np/n:DT:the w6:s[dcl]\np/(s[to]\np)/np:VBD:ask w15:n:NN:representative w5:n:NN:cambodia w14:n/n:JJ:un w4:np\np/np:IN:of w3:n:NNS:minister w13:np/n:DT:the w12:s[b]\np/pp/np:VB:extend w2:n/n:JJ:prime w1:n/n:CD:two w10:s[adj]\np:JJ:general w0:np/n:DT:the w26:np\np:JJR:longer w25:np\np/(np\np):CC:or w24:n:NNS:month w23:n/n:CD:six w22:pp/np:TO:to w21:n:NN:cambodia w20:np\np/np:IN:in the:S-the:P-DT:T-np/n two:S-two:P-CD:T-n/n prime:S-prime:P-JJ:T-n/n ministers:S-minister:P-NNS:T-n of:S-of:P-IN:T-np\np/np cambodia:S-cambodia:P-NN:T-n asked:S-ask:P-VBD:T-s[dcl]\np/(s[to]\np)/np the:S-the:P-DT:T-np/n un:S-un:P-NN:T-n/n secretary:S-secretary:P-NN:T-n general:S-general:P-JJ:T-s[adj]\np to:S-to:P-TO:T-s[to]\np/(s[b]\np) extend:S-extend:P-VB:T-s[b]\np/pp/np the:S-the:P-DT:T-np/n un:S-un:P-JJ:T-n/n representative:S-representative:P-NN:T-n 's:S-'s:P-POS:T-np/n\np term:S-term:P-NN:T-n of:S-of:P-IN:T-np\np/np stay:S-stay:P-NN:T-n in:S-in:P-IN:T-np\np/np cambodia:S-cambodia:P-NN:T-n to:S-to:P-TO:T-pp/np six:S-six:P-CD:T-n/n months:S-month:P-NNS:T-n or:S-or:P-CC:T-np\np/(np\np) longer:S-longer:P-JJR:T-np\np .:S-.:P-.:T-sent\s[dcl]

================================================
FILE: test/rules.xml
================================================

================================================
FILE: test/testlf.xml
================================================