Repository: chrislit/abydos
Branch: master
Commit: 344346a5fceb
Files: 895
Total size: 86.4 MB
Directory structure:
gitextract_pif85cg4/
├── .circleci/
│ └── config.yml
├── .codeclimate.yml
├── .coveragerc
├── .github/
│ └── CODEOWNERS
├── .gitignore
├── .gitmodules
├── .project
├── .pypirc
├── .pyup.yml
├── .travis.yml
├── AUTHORS.rst
├── CODE_OF_CONDUCT.rst
├── CODING_STANDARDS.rst
├── FAQ.rst
├── HISTORY.rst
├── LICENSE
├── MANIFEST.in
├── Pipfile
├── README.rst
├── VERSION.rst
├── abydos/
│ ├── __init__.py
│ ├── compression/
│ │ ├── __init__.py
│ │ ├── _arithmetic.py
│ │ ├── _bwt.py
│ │ └── _rle.py
│ ├── corpus/
│ │ ├── __init__.py
│ │ ├── _corpus.py
│ │ ├── _ngram_corpus.py
│ │ └── _unigram_corpus.py
│ ├── distance/
│ │ ├── __init__.py
│ │ ├── _aline.py
│ │ ├── _ample.py
│ │ ├── _anderberg.py
│ │ ├── _andres_marzo_delta.py
│ │ ├── _average_linkage.py
│ │ ├── _azzoo.py
│ │ ├── _bag.py
│ │ ├── _baroni_urbani_buser_i.py
│ │ ├── _baroni_urbani_buser_ii.py
│ │ ├── _batagelj_bren.py
│ │ ├── _baulieu_i.py
│ │ ├── _baulieu_ii.py
│ │ ├── _baulieu_iii.py
│ │ ├── _baulieu_iv.py
│ │ ├── _baulieu_ix.py
│ │ ├── _baulieu_v.py
│ │ ├── _baulieu_vi.py
│ │ ├── _baulieu_vii.py
│ │ ├── _baulieu_viii.py
│ │ ├── _baulieu_x.py
│ │ ├── _baulieu_xi.py
│ │ ├── _baulieu_xii.py
│ │ ├── _baulieu_xiii.py
│ │ ├── _baulieu_xiv.py
│ │ ├── _baulieu_xv.py
│ │ ├── _baystat.py
│ │ ├── _benini_i.py
│ │ ├── _benini_ii.py
│ │ ├── _bennet.py
│ │ ├── _bhattacharyya.py
│ │ ├── _bisim.py
│ │ ├── _bleu.py
│ │ ├── _block_levenshtein.py
│ │ ├── _brainerd_robinson.py
│ │ ├── _braun_blanquet.py
│ │ ├── _canberra.py
│ │ ├── _cao.py
│ │ ├── _chao_dice.py
│ │ ├── _chao_jaccard.py
│ │ ├── _chebyshev.py
│ │ ├── _chord.py
│ │ ├── _clark.py
│ │ ├── _clement.py
│ │ ├── _cohen_kappa.py
│ │ ├── _cole.py
│ │ ├── _complete_linkage.py
│ │ ├── _consonni_todeschini_i.py
│ │ ├── _consonni_todeschini_ii.py
│ │ ├── _consonni_todeschini_iii.py
│ │ ├── _consonni_todeschini_iv.py
│ │ ├── _consonni_todeschini_v.py
│ │ ├── _cormode_lz.py
│ │ ├── _cosine.py
│ │ ├── _covington.py
│ │ ├── _damerau_levenshtein.py
│ │ ├── _dennis.py
│ │ ├── _dice.py
│ │ ├── _dice_asymmetric_i.py
│ │ ├── _dice_asymmetric_ii.py
│ │ ├── _digby.py
│ │ ├── _discounted_levenshtein.py
│ │ ├── _dispersion.py
│ │ ├── _distance.py
│ │ ├── _doolittle.py
│ │ ├── _dunning.py
│ │ ├── _editex.py
│ │ ├── _euclidean.py
│ │ ├── _eudex.py
│ │ ├── _eyraud.py
│ │ ├── _fager_mcgowan.py
│ │ ├── _faith.py
│ │ ├── _fellegi_sunter.py
│ │ ├── _fidelity.py
│ │ ├── _fleiss.py
│ │ ├── _fleiss_levin_paik.py
│ │ ├── _flexmetric.py
│ │ ├── _forbes_i.py
│ │ ├── _forbes_ii.py
│ │ ├── _fossum.py
│ │ ├── _fuzzywuzzy_partial_string.py
│ │ ├── _fuzzywuzzy_token_set.py
│ │ ├── _fuzzywuzzy_token_sort.py
│ │ ├── _generalized_fleiss.py
│ │ ├── _gilbert.py
│ │ ├── _gilbert_wells.py
│ │ ├── _gini_i.py
│ │ ├── _gini_ii.py
│ │ ├── _goodall.py
│ │ ├── _goodman_kruskal_lambda.py
│ │ ├── _goodman_kruskal_lambda_r.py
│ │ ├── _goodman_kruskal_tau_a.py
│ │ ├── _goodman_kruskal_tau_b.py
│ │ ├── _gotoh.py
│ │ ├── _gower_legendre.py
│ │ ├── _guth.py
│ │ ├── _guttman_lambda_a.py
│ │ ├── _guttman_lambda_b.py
│ │ ├── _gwet_ac.py
│ │ ├── _hamann.py
│ │ ├── _hamming.py
│ │ ├── _harris_lahey.py
│ │ ├── _hassanat.py
│ │ ├── _hawkins_dotson.py
│ │ ├── _hellinger.py
│ │ ├── _henderson_heron.py
│ │ ├── _higuera_mico.py
│ │ ├── _horn_morisita.py
│ │ ├── _hurlbert.py
│ │ ├── _ident.py
│ │ ├── _inclusion.py
│ │ ├── _indel.py
│ │ ├── _isg.py
│ │ ├── _iterative_substring.py
│ │ ├── _jaccard.py
│ │ ├── _jaccard_nm.py
│ │ ├── _jaro_winkler.py
│ │ ├── _jensen_shannon.py
│ │ ├── _johnson.py
│ │ ├── _kendall_tau.py
│ │ ├── _kent_foster_i.py
│ │ ├── _kent_foster_ii.py
│ │ ├── _koppen_i.py
│ │ ├── _koppen_ii.py
│ │ ├── _kuder_richardson.py
│ │ ├── _kuhns_i.py
│ │ ├── _kuhns_ii.py
│ │ ├── _kuhns_iii.py
│ │ ├── _kuhns_iv.py
│ │ ├── _kuhns_ix.py
│ │ ├── _kuhns_v.py
│ │ ├── _kuhns_vi.py
│ │ ├── _kuhns_vii.py
│ │ ├── _kuhns_viii.py
│ │ ├── _kuhns_x.py
│ │ ├── _kuhns_xi.py
│ │ ├── _kuhns_xii.py
│ │ ├── _kulczynski_i.py
│ │ ├── _kulczynski_ii.py
│ │ ├── _lcprefix.py
│ │ ├── _lcsseq.py
│ │ ├── _lcsstr.py
│ │ ├── _lcsuffix.py
│ │ ├── _length.py
│ │ ├── _levenshtein.py
│ │ ├── _lig3.py
│ │ ├── _lorentzian.py
│ │ ├── _maarel.py
│ │ ├── _manhattan.py
│ │ ├── _marking.py
│ │ ├── _marking_metric.py
│ │ ├── _masi.py
│ │ ├── _matusita.py
│ │ ├── _maxwell_pilliner.py
│ │ ├── _mcconnaughey.py
│ │ ├── _mcewen_michael.py
│ │ ├── _meta_levenshtein.py
│ │ ├── _michelet.py
│ │ ├── _millar.py
│ │ ├── _minhash.py
│ │ ├── _minkowski.py
│ │ ├── _mlipns.py
│ │ ├── _monge_elkan.py
│ │ ├── _morisita.py
│ │ ├── _mountford.py
│ │ ├── _mra.py
│ │ ├── _ms_contingency.py
│ │ ├── _mutual_information.py
│ │ ├── _ncd_arith.py
│ │ ├── _ncd_bwtrle.py
│ │ ├── _ncd_bz2.py
│ │ ├── _ncd_lzma.py
│ │ ├── _ncd_lzss.py
│ │ ├── _ncd_paq9a.py
│ │ ├── _ncd_rle.py
│ │ ├── _ncd_zlib.py
│ │ ├── _needleman_wunsch.py
│ │ ├── _overlap.py
│ │ ├── _ozbay.py
│ │ ├── _pattern.py
│ │ ├── _pearson_chi_squared.py
│ │ ├── _pearson_heron_ii.py
│ │ ├── _pearson_ii.py
│ │ ├── _pearson_iii.py
│ │ ├── _pearson_phi.py
│ │ ├── _peirce.py
│ │ ├── _phonetic_distance.py
│ │ ├── _phonetic_edit_distance.py
│ │ ├── _positional_q_gram_dice.py
│ │ ├── _positional_q_gram_jaccard.py
│ │ ├── _positional_q_gram_overlap.py
│ │ ├── _prefix.py
│ │ ├── _q_gram.py
│ │ ├── _quantitative_cosine.py
│ │ ├── _quantitative_dice.py
│ │ ├── _quantitative_jaccard.py
│ │ ├── _ratcliff_obershelp.py
│ │ ├── _raup_crick.py
│ │ ├── _rees_levenshtein.py
│ │ ├── _relaxed_hamming.py
│ │ ├── _roberts.py
│ │ ├── _rogers_tanimoto.py
│ │ ├── _rogot_goldberg.py
│ │ ├── _rouge_l.py
│ │ ├── _rouge_s.py
│ │ ├── _rouge_su.py
│ │ ├── _rouge_w.py
│ │ ├── _russell_rao.py
│ │ ├── _saps.py
│ │ ├── _scott_pi.py
│ │ ├── _shape.py
│ │ ├── _shapira_storer_i.py
│ │ ├── _sift4.py
│ │ ├── _sift4_extended.py
│ │ ├── _sift4_simplest.py
│ │ ├── _single_linkage.py
│ │ ├── _size.py
│ │ ├── _smith_waterman.py
│ │ ├── _soft_cosine.py
│ │ ├── _softtf_idf.py
│ │ ├── _sokal_michener.py
│ │ ├── _sokal_sneath_i.py
│ │ ├── _sokal_sneath_ii.py
│ │ ├── _sokal_sneath_iii.py
│ │ ├── _sokal_sneath_iv.py
│ │ ├── _sokal_sneath_v.py
│ │ ├── _sorgenfrei.py
│ │ ├── _ssk.py
│ │ ├── _steffensen.py
│ │ ├── _stiles.py
│ │ ├── _strcmp95.py
│ │ ├── _stuart_tau.py
│ │ ├── _suffix.py
│ │ ├── _synoname.py
│ │ ├── _tarantula.py
│ │ ├── _tarwid.py
│ │ ├── _tetrachoric.py
│ │ ├── _tf_idf.py
│ │ ├── _tichy.py
│ │ ├── _token_distance.py
│ │ ├── _tulloss_r.py
│ │ ├── _tulloss_s.py
│ │ ├── _tulloss_t.py
│ │ ├── _tulloss_u.py
│ │ ├── _tversky.py
│ │ ├── _typo.py
│ │ ├── _unigram_subtuple.py
│ │ ├── _unknown_a.py
│ │ ├── _unknown_b.py
│ │ ├── _unknown_c.py
│ │ ├── _unknown_d.py
│ │ ├── _unknown_e.py
│ │ ├── _unknown_f.py
│ │ ├── _unknown_g.py
│ │ ├── _unknown_h.py
│ │ ├── _unknown_i.py
│ │ ├── _unknown_j.py
│ │ ├── _unknown_k.py
│ │ ├── _unknown_l.py
│ │ ├── _unknown_m.py
│ │ ├── _upholt.py
│ │ ├── _vps.py
│ │ ├── _warrens_i.py
│ │ ├── _warrens_ii.py
│ │ ├── _warrens_iii.py
│ │ ├── _warrens_iv.py
│ │ ├── _warrens_v.py
│ │ ├── _weighted_jaccard.py
│ │ ├── _whittaker.py
│ │ ├── _yates_chi_squared.py
│ │ ├── _yjhhr.py
│ │ ├── _yujian_bo.py
│ │ ├── _yule_q.py
│ │ ├── _yule_q_ii.py
│ │ └── _yule_y.py
│ ├── fingerprint/
│ │ ├── __init__.py
│ │ ├── _bwtf.py
│ │ ├── _bwtrlef.py
│ │ ├── _consonant.py
│ │ ├── _count.py
│ │ ├── _extract.py
│ │ ├── _extract_position_frequency.py
│ │ ├── _fingerprint.py
│ │ ├── _lacss.py
│ │ ├── _lc_cutter.py
│ │ ├── _occurrence.py
│ │ ├── _occurrence_halved.py
│ │ ├── _omission_key.py
│ │ ├── _phonetic.py
│ │ ├── _position.py
│ │ ├── _qgram.py
│ │ ├── _skeleton_key.py
│ │ ├── _string.py
│ │ └── _synoname_toolcode.py
│ ├── phones/
│ │ ├── __init__.py
│ │ └── _phones.py
│ ├── phonetic/
│ │ ├── __init__.py
│ │ ├── _ainsworth.py
│ │ ├── _alpha_sis.py
│ │ ├── _beider_morse.py
│ │ ├── _beider_morse_data.py
│ │ ├── _caverphone.py
│ │ ├── _daitch_mokotoff.py
│ │ ├── _davidson.py
│ │ ├── _dolby.py
│ │ ├── _double_metaphone.py
│ │ ├── _eudex.py
│ │ ├── _fonem.py
│ │ ├── _fuzzy_soundex.py
│ │ ├── _haase.py
│ │ ├── _henry_early.py
│ │ ├── _koelner.py
│ │ ├── _lein.py
│ │ ├── _meta_soundex.py
│ │ ├── _metaphone.py
│ │ ├── _mra.py
│ │ ├── _norphone.py
│ │ ├── _nrl.py
│ │ ├── _nysiis.py
│ │ ├── _onca.py
│ │ ├── _parmar_kumbharana.py
│ │ ├── _phonem.py
│ │ ├── _phonet.py
│ │ ├── _phonetic.py
│ │ ├── _phonetic_spanish.py
│ │ ├── _phonex.py
│ │ ├── _phonic.py
│ │ ├── _phonix.py
│ │ ├── _pshp_soundex_first.py
│ │ ├── _pshp_soundex_last.py
│ │ ├── _refined_soundex.py
│ │ ├── _reth_schek.py
│ │ ├── _roger_root.py
│ │ ├── _russell_index.py
│ │ ├── _sfinx_bis.py
│ │ ├── _sound_d.py
│ │ ├── _soundex.py
│ │ ├── _soundex_br.py
│ │ ├── _spanish_metaphone.py
│ │ ├── _spfc.py
│ │ ├── _statistics_canada.py
│ │ └── _waahlin.py
│ ├── stats/
│ │ ├── __init__.py
│ │ ├── _confusion_table.py
│ │ ├── _mean.py
│ │ └── _pairwise.py
│ ├── stemmer/
│ │ ├── __init__.py
│ │ ├── _caumanns.py
│ │ ├── _clef_german.py
│ │ ├── _clef_german_plus.py
│ │ ├── _clef_swedish.py
│ │ ├── _lovins.py
│ │ ├── _paice_husk.py
│ │ ├── _porter.py
│ │ ├── _porter2.py
│ │ ├── _s_stemmer.py
│ │ ├── _schinke.py
│ │ ├── _snowball.py
│ │ ├── _snowball_danish.py
│ │ ├── _snowball_dutch.py
│ │ ├── _snowball_german.py
│ │ ├── _snowball_norwegian.py
│ │ ├── _snowball_swedish.py
│ │ ├── _stemmer.py
│ │ └── _uea_lite.py
│ ├── tokenizer/
│ │ ├── __init__.py
│ │ ├── _c_or_v_cluster.py
│ │ ├── _character.py
│ │ ├── _cv_cluster.py
│ │ ├── _legalipy.py
│ │ ├── _nltk.py
│ │ ├── _q_grams.py
│ │ ├── _q_skipgrams.py
│ │ ├── _regexp.py
│ │ ├── _saps.py
│ │ ├── _sonoripy.py
│ │ ├── _tokenizer.py
│ │ ├── _vc_cluster.py
│ │ ├── _whitespace.py
│ │ └── _wordpunct.py
│ └── util/
│ ├── __init__.py
│ ├── _data.py
│ ├── _ncr.py
│ └── _prod.py
├── abydos.xcf
├── azure-pipelines.yml
├── badge_update.py
├── binder/
│ ├── Basic Examples.ipynb
│ ├── Reversed Metaphone using Keras seq2seq.ipynb
│ ├── Text Classification of Drug Reviews.ipynb
│ └── requirements.txt
├── data/
│ └── features/
│ ├── features_csv_to_dict.py
│ ├── features_symbols.csv
│ └── features_terms.csv
├── docs/
│ ├── Makefile
│ ├── _build/
│ │ └── .gitignore
│ ├── _static/
│ │ └── .gitignore
│ ├── _templates/
│ │ └── .gitignore
│ ├── abydos.bib
│ ├── abydos.compression.rst
│ ├── abydos.corpus.rst
│ ├── abydos.distance.rst
│ ├── abydos.fingerprint.rst
│ ├── abydos.phones.rst
│ ├── abydos.phonetic.rst
│ ├── abydos.rst
│ ├── abydos.stats.rst
│ ├── abydos.stemmer.rst
│ ├── abydos.tokenizer.rst
│ ├── abydos.util.rst
│ ├── conf.py
│ ├── faq.rst
│ ├── history.rst
│ ├── index.rst
│ ├── intro.rst
│ ├── make.bat
│ ├── modules.rst
│ └── requirements.txt
├── helpers/
│ ├── bm_php2py.py
│ └── call_and_write_log.py
├── pyproject.toml
├── requirements-dev.txt
├── requirements-test.txt
├── requirements.txt
├── setup.cfg
├── setup.py
├── stubs/
│ ├── lzss/
│ │ └── __init__.pyi
│ ├── numpy/
│ │ ├── __init__.pyi
│ │ └── core/
│ │ ├── __init__.pyi
│ │ ├── _internal.pyi
│ │ ├── numeric.pyi
│ │ └── numerictypes.pyi
│ ├── paq/
│ │ └── __init__.pyi
│ └── syllabipy/
│ ├── __init__.pyi
│ ├── legalipy.pyi
│ └── sonoripy.pyi
├── tests/
│ ├── __init__.py
│ ├── compression/
│ │ ├── __init__.py
│ │ ├── test_compression_arithmetic.py
│ │ ├── test_compression_bwt.py
│ │ └── test_compression_rle.py
│ ├── corpora/
│ │ ├── fake_words.csv
│ │ ├── googlebooks-ger-all-1gram-20120701-y
│ │ ├── googlebooks-ger-all-2gram-20120701-yp
│ │ ├── googlebooks-ger-all-3gram-20120701-yp
│ │ ├── homophones.csv
│ │ ├── misspellings.csv
│ │ ├── nachnamen.bm.cc.csv
│ │ ├── nachnamen.bm.csv
│ │ ├── nachnamen.csv
│ │ ├── ngerman.csv
│ │ ├── paicehusk.csv
│ │ ├── php_caverphone.csv
│ │ ├── simple-ngrams-pos.txt
│ │ ├── simple-ngrams.txt
│ │ ├── snowball_danish.csv
│ │ ├── snowball_dutch.csv
│ │ ├── snowball_german.csv
│ │ ├── snowball_lovins.csv
│ │ ├── snowball_norwegian.csv
│ │ ├── snowball_porter.csv
│ │ ├── snowball_porter2.csv
│ │ ├── snowball_schinke.csv
│ │ ├── snowball_swedish.csv
│ │ ├── uea-lite_wsj.csv
│ │ ├── uscensus2000.bm.cc.csv
│ │ ├── uscensus2000.bm.csv
│ │ ├── uscensus2000.csv
│ │ ├── variantNames.csv
│ │ └── wikipediaCommonMisspellings.csv
│ ├── corpus/
│ │ ├── __init__.py
│ │ ├── test_corpus_corpus.py
│ │ ├── test_corpus_n_gram_corpus.py
│ │ └── test_corpus_unigram_corpus.py
│ ├── distance/
│ │ ├── __init__.py
│ │ ├── test_distance__distance.py
│ │ ├── test_distance__token_distance.py
│ │ ├── test_distance_aline.py
│ │ ├── test_distance_ample.py
│ │ ├── test_distance_anderberg.py
│ │ ├── test_distance_andres_marzo_delta.py
│ │ ├── test_distance_average_linkage.py
│ │ ├── test_distance_azzoo.py
│ │ ├── test_distance_bag.py
│ │ ├── test_distance_baroni_urbani_buser_i.py
│ │ ├── test_distance_baroni_urbani_buser_ii.py
│ │ ├── test_distance_batagelj_bren.py
│ │ ├── test_distance_baulieu_i.py
│ │ ├── test_distance_baulieu_ii.py
│ │ ├── test_distance_baulieu_iii.py
│ │ ├── test_distance_baulieu_iv.py
│ │ ├── test_distance_baulieu_ix.py
│ │ ├── test_distance_baulieu_v.py
│ │ ├── test_distance_baulieu_vi.py
│ │ ├── test_distance_baulieu_vii.py
│ │ ├── test_distance_baulieu_viii.py
│ │ ├── test_distance_baulieu_x.py
│ │ ├── test_distance_baulieu_xi.py
│ │ ├── test_distance_baulieu_xii.py
│ │ ├── test_distance_baulieu_xiii.py
│ │ ├── test_distance_baulieu_xiv.py
│ │ ├── test_distance_baulieu_xv.py
│ │ ├── test_distance_baystat.py
│ │ ├── test_distance_benini_i.py
│ │ ├── test_distance_benini_ii.py
│ │ ├── test_distance_bennet.py
│ │ ├── test_distance_bhattacharyya.py
│ │ ├── test_distance_bisim.py
│ │ ├── test_distance_bleu.py
│ │ ├── test_distance_block_levenshtein.py
│ │ ├── test_distance_brainerd_robinson.py
│ │ ├── test_distance_braun_blanquet.py
│ │ ├── test_distance_canberra.py
│ │ ├── test_distance_cao.py
│ │ ├── test_distance_chao_dice.py
│ │ ├── test_distance_chao_jaccard.py
│ │ ├── test_distance_chebyshev.py
│ │ ├── test_distance_chord.py
│ │ ├── test_distance_clark.py
│ │ ├── test_distance_clement.py
│ │ ├── test_distance_cohen_kappa.py
│ │ ├── test_distance_cole.py
│ │ ├── test_distance_complete_linkage.py
│ │ ├── test_distance_consonni_todeschini_i.py
│ │ ├── test_distance_consonni_todeschini_ii.py
│ │ ├── test_distance_consonni_todeschini_iii.py
│ │ ├── test_distance_consonni_todeschini_iv.py
│ │ ├── test_distance_consonni_todeschini_v.py
│ │ ├── test_distance_cormode_lz.py
│ │ ├── test_distance_cosine.py
│ │ ├── test_distance_covington.py
│ │ ├── test_distance_damerau_levenshtein.py
│ │ ├── test_distance_dennis.py
│ │ ├── test_distance_dice.py
│ │ ├── test_distance_dice_asymmetric_i.py
│ │ ├── test_distance_dice_asymmetric_ii.py
│ │ ├── test_distance_digby.py
│ │ ├── test_distance_discounted_levenshtein.py
│ │ ├── test_distance_dispersion.py
│ │ ├── test_distance_doolittle.py
│ │ ├── test_distance_dunning.py
│ │ ├── test_distance_editex.py
│ │ ├── test_distance_euclidean.py
│ │ ├── test_distance_eudex.py
│ │ ├── test_distance_eyraud.py
│ │ ├── test_distance_fager_mcgowan.py
│ │ ├── test_distance_faith.py
│ │ ├── test_distance_fellegi_sunter.py
│ │ ├── test_distance_fidelity.py
│ │ ├── test_distance_fleiss.py
│ │ ├── test_distance_fleiss_levin_paik.py
│ │ ├── test_distance_flexmetric.py
│ │ ├── test_distance_forbes_i.py
│ │ ├── test_distance_forbes_ii.py
│ │ ├── test_distance_fossum.py
│ │ ├── test_distance_fuzzywuzzy_partial_string.py
│ │ ├── test_distance_fuzzywuzzy_token_set.py
│ │ ├── test_distance_fuzzywuzzy_token_sort.py
│ │ ├── test_distance_generalized_fleiss.py
│ │ ├── test_distance_gilbert.py
│ │ ├── test_distance_gilbert_wells.py
│ │ ├── test_distance_gini_i.py
│ │ ├── test_distance_gini_ii.py
│ │ ├── test_distance_goodall.py
│ │ ├── test_distance_goodman_kruskal_lambda.py
│ │ ├── test_distance_goodman_kruskal_lambda_r.py
│ │ ├── test_distance_goodman_kruskal_tau_a.py
│ │ ├── test_distance_goodman_kruskal_tau_b.py
│ │ ├── test_distance_gotoh.py
│ │ ├── test_distance_gower_legendre.py
│ │ ├── test_distance_guth.py
│ │ ├── test_distance_guttman_lambda_a.py
│ │ ├── test_distance_guttman_lambda_b.py
│ │ ├── test_distance_gwet_ac.py
│ │ ├── test_distance_hamann.py
│ │ ├── test_distance_hamming.py
│ │ ├── test_distance_harris_lahey.py
│ │ ├── test_distance_hassanat.py
│ │ ├── test_distance_hawkins_dotson.py
│ │ ├── test_distance_hellinger.py
│ │ ├── test_distance_henderson_heron.py
│ │ ├── test_distance_higuera_mico.py
│ │ ├── test_distance_horn_morisita.py
│ │ ├── test_distance_hurlbert.py
│ │ ├── test_distance_ident.py
│ │ ├── test_distance_inclusion.py
│ │ ├── test_distance_indel.py
│ │ ├── test_distance_isg.py
│ │ ├── test_distance_iterative_substring.py
│ │ ├── test_distance_jaccard.py
│ │ ├── test_distance_jaccard_nm.py
│ │ ├── test_distance_jaro_winkler.py
│ │ ├── test_distance_jensen_shannon.py
│ │ ├── test_distance_johnson.py
│ │ ├── test_distance_kendall_tau.py
│ │ ├── test_distance_kent_foster_i.py
│ │ ├── test_distance_kent_foster_ii.py
│ │ ├── test_distance_koppen_i.py
│ │ ├── test_distance_koppen_ii.py
│ │ ├── test_distance_kuder_richardson.py
│ │ ├── test_distance_kuhns_i.py
│ │ ├── test_distance_kuhns_ii.py
│ │ ├── test_distance_kuhns_iii.py
│ │ ├── test_distance_kuhns_iv.py
│ │ ├── test_distance_kuhns_ix.py
│ │ ├── test_distance_kuhns_v.py
│ │ ├── test_distance_kuhns_vi.py
│ │ ├── test_distance_kuhns_vii.py
│ │ ├── test_distance_kuhns_viii.py
│ │ ├── test_distance_kuhns_x.py
│ │ ├── test_distance_kuhns_xi.py
│ │ ├── test_distance_kuhns_xii.py
│ │ ├── test_distance_kulczynski_i.py
│ │ ├── test_distance_kulczynski_ii.py
│ │ ├── test_distance_lcprefix.py
│ │ ├── test_distance_lcsseq.py
│ │ ├── test_distance_lcsstr.py
│ │ ├── test_distance_lcsuffix.py
│ │ ├── test_distance_length.py
│ │ ├── test_distance_levenshtein.py
│ │ ├── test_distance_lig3.py
│ │ ├── test_distance_lorentzian.py
│ │ ├── test_distance_maarel.py
│ │ ├── test_distance_manhattan.py
│ │ ├── test_distance_marking.py
│ │ ├── test_distance_marking_metric.py
│ │ ├── test_distance_masi.py
│ │ ├── test_distance_matusita.py
│ │ ├── test_distance_maxwell_pilliner.py
│ │ ├── test_distance_mcconnaughey.py
│ │ ├── test_distance_mcewen_michael.py
│ │ ├── test_distance_meta_levenshtein.py
│ │ ├── test_distance_michelet.py
│ │ ├── test_distance_millar.py
│ │ ├── test_distance_minhash.py
│ │ ├── test_distance_minkowski.py
│ │ ├── test_distance_mlipns.py
│ │ ├── test_distance_monge_elkan.py
│ │ ├── test_distance_morisita.py
│ │ ├── test_distance_mountford.py
│ │ ├── test_distance_mra.py
│ │ ├── test_distance_ms_contingency.py
│ │ ├── test_distance_mutual_information.py
│ │ ├── test_distance_ncd_arith.py
│ │ ├── test_distance_ncd_bwtrle.py
│ │ ├── test_distance_ncd_bz2.py
│ │ ├── test_distance_ncd_lzma.py
│ │ ├── test_distance_ncd_lzss.py
│ │ ├── test_distance_ncd_paq9a.py
│ │ ├── test_distance_ncd_rle.py
│ │ ├── test_distance_ncd_zlib.py
│ │ ├── test_distance_needleman_wunsch.py
│ │ ├── test_distance_overlap.py
│ │ ├── test_distance_ozbay.py
│ │ ├── test_distance_pattern.py
│ │ ├── test_distance_pearson_chi_squared.py
│ │ ├── test_distance_pearson_heron_ii.py
│ │ ├── test_distance_pearson_ii.py
│ │ ├── test_distance_pearson_iii.py
│ │ ├── test_distance_pearson_phi.py
│ │ ├── test_distance_peirce.py
│ │ ├── test_distance_phonetic_distance.py
│ │ ├── test_distance_phonetic_edit_distance.py
│ │ ├── test_distance_positional_q_gram_dice.py
│ │ ├── test_distance_positional_q_gram_jaccard.py
│ │ ├── test_distance_positional_q_gram_overlap.py
│ │ ├── test_distance_prefix.py
│ │ ├── test_distance_q_gram.py
│ │ ├── test_distance_quantitative_cosine.py
│ │ ├── test_distance_quantitative_dice.py
│ │ ├── test_distance_quantitative_jaccard.py
│ │ ├── test_distance_ratcliff_obershelp.py
│ │ ├── test_distance_raup_crick.py
│ │ ├── test_distance_rees_levenshtein.py
│ │ ├── test_distance_relaxed_hamming.py
│ │ ├── test_distance_roberts.py
│ │ ├── test_distance_rogers_tanimoto.py
│ │ ├── test_distance_rogot_goldberg.py
│ │ ├── test_distance_rouge_l.py
│ │ ├── test_distance_rouge_s.py
│ │ ├── test_distance_rouge_su.py
│ │ ├── test_distance_rouge_w.py
│ │ ├── test_distance_russell_rao.py
│ │ ├── test_distance_saps.py
│ │ ├── test_distance_scott_pi.py
│ │ ├── test_distance_shape.py
│ │ ├── test_distance_shapira_storer_i.py
│ │ ├── test_distance_sift4.py
│ │ ├── test_distance_sift4_extended.py
│ │ ├── test_distance_sift4_simplest.py
│ │ ├── test_distance_single_linkage.py
│ │ ├── test_distance_size.py
│ │ ├── test_distance_smith_waterman.py
│ │ ├── test_distance_soft_cosine.py
│ │ ├── test_distance_softtf_idf.py
│ │ ├── test_distance_sokal_michener.py
│ │ ├── test_distance_sokal_sneath_i.py
│ │ ├── test_distance_sokal_sneath_ii.py
│ │ ├── test_distance_sokal_sneath_iii.py
│ │ ├── test_distance_sokal_sneath_iv.py
│ │ ├── test_distance_sokal_sneath_v.py
│ │ ├── test_distance_sorgenfrei.py
│ │ ├── test_distance_ssk.py
│ │ ├── test_distance_steffensen.py
│ │ ├── test_distance_stiles.py
│ │ ├── test_distance_strcmp95.py
│ │ ├── test_distance_stuart_tau.py
│ │ ├── test_distance_suffix.py
│ │ ├── test_distance_synoname.py
│ │ ├── test_distance_tarantula.py
│ │ ├── test_distance_tarwid.py
│ │ ├── test_distance_tetrachoric.py
│ │ ├── test_distance_tf_idf.py
│ │ ├── test_distance_tichy.py
│ │ ├── test_distance_tulloss_r.py
│ │ ├── test_distance_tulloss_s.py
│ │ ├── test_distance_tulloss_t.py
│ │ ├── test_distance_tulloss_u.py
│ │ ├── test_distance_tversky.py
│ │ ├── test_distance_typo.py
│ │ ├── test_distance_unigram_subtuple.py
│ │ ├── test_distance_unknown_a.py
│ │ ├── test_distance_unknown_b.py
│ │ ├── test_distance_unknown_c.py
│ │ ├── test_distance_unknown_d.py
│ │ ├── test_distance_unknown_e.py
│ │ ├── test_distance_unknown_f.py
│ │ ├── test_distance_unknown_g.py
│ │ ├── test_distance_unknown_h.py
│ │ ├── test_distance_unknown_i.py
│ │ ├── test_distance_unknown_j.py
│ │ ├── test_distance_unknown_k.py
│ │ ├── test_distance_unknown_l.py
│ │ ├── test_distance_unknown_m.py
│ │ ├── test_distance_upholt.py
│ │ ├── test_distance_vps.py
│ │ ├── test_distance_warrens_i.py
│ │ ├── test_distance_warrens_ii.py
│ │ ├── test_distance_warrens_iii.py
│ │ ├── test_distance_warrens_iv.py
│ │ ├── test_distance_warrens_v.py
│ │ ├── test_distance_weighted_jaccard.py
│ │ ├── test_distance_whittaker.py
│ │ ├── test_distance_yates_chi_squared.py
│ │ ├── test_distance_yjhhr.py
│ │ ├── test_distance_yujian_bo.py
│ │ ├── test_distance_yule_q.py
│ │ ├── test_distance_yule_q_ii.py
│ │ └── test_distance_yule_y.py
│ ├── fingerprint/
│ │ ├── __init__.py
│ │ ├── test_fingerprint__fingerprint.py
│ │ ├── test_fingerprint_bwtf.py
│ │ ├── test_fingerprint_bwtrlef.py
│ │ ├── test_fingerprint_consonant.py
│ │ ├── test_fingerprint_count.py
│ │ ├── test_fingerprint_extract.py
│ │ ├── test_fingerprint_extract_position_frequency.py
│ │ ├── test_fingerprint_lacss.py
│ │ ├── test_fingerprint_lc_cutter.py
│ │ ├── test_fingerprint_occurrence.py
│ │ ├── test_fingerprint_occurrence_halved.py
│ │ ├── test_fingerprint_omission_key.py
│ │ ├── test_fingerprint_phonetic.py
│ │ ├── test_fingerprint_position.py
│ │ ├── test_fingerprint_qgram.py
│ │ ├── test_fingerprint_skeleton_key.py
│ │ ├── test_fingerprint_string.py
│ │ └── test_fingerprint_synoname_toolcode.py
│ ├── fuzz/
│ │ ├── __init__.py
│ │ ├── corpora/
│ │ │ ├── basewords.txt
│ │ │ └── blns.txt
│ │ ├── fuzz_test_distance.py
│ │ ├── fuzz_test_fingerprint.py
│ │ ├── fuzz_test_phonetic.py
│ │ ├── fuzz_test_stemmer.py
│ │ └── fuzz_test_tokenizer.py
│ ├── phones/
│ │ ├── __init__.py
│ │ └── test_phones.py
│ ├── phonetic/
│ │ ├── __init__.py
│ │ ├── test_phonetic__phonetic.py
│ │ ├── test_phonetic_ainsworth.py
│ │ ├── test_phonetic_alpha_sis.py
│ │ ├── test_phonetic_beider_morse.py
│ │ ├── test_phonetic_caverphone.py
│ │ ├── test_phonetic_daitch_mokotoff.py
│ │ ├── test_phonetic_davidson.py
│ │ ├── test_phonetic_dolby.py
│ │ ├── test_phonetic_double_metaphone.py
│ │ ├── test_phonetic_eudex.py
│ │ ├── test_phonetic_fonem.py
│ │ ├── test_phonetic_fuzzy_soundex.py
│ │ ├── test_phonetic_haase.py
│ │ ├── test_phonetic_henry_early.py
│ │ ├── test_phonetic_koelner.py
│ │ ├── test_phonetic_lein.py
│ │ ├── test_phonetic_meta_soundex.py
│ │ ├── test_phonetic_metaphone.py
│ │ ├── test_phonetic_mra.py
│ │ ├── test_phonetic_norphone.py
│ │ ├── test_phonetic_nrl.py
│ │ ├── test_phonetic_nysiis.py
│ │ ├── test_phonetic_onca.py
│ │ ├── test_phonetic_parmar_kumbharana.py
│ │ ├── test_phonetic_phonem.py
│ │ ├── test_phonetic_phonet.py
│ │ ├── test_phonetic_phonetic_spanish.py
│ │ ├── test_phonetic_phonex.py
│ │ ├── test_phonetic_phonic.py
│ │ ├── test_phonetic_phonix.py
│ │ ├── test_phonetic_pshp_soundex_first.py
│ │ ├── test_phonetic_pshp_soundex_last.py
│ │ ├── test_phonetic_refined_soundex.py
│ │ ├── test_phonetic_reth_schek.py
│ │ ├── test_phonetic_roger_root.py
│ │ ├── test_phonetic_russell_index.py
│ │ ├── test_phonetic_sfinxbis.py
│ │ ├── test_phonetic_sound_d.py
│ │ ├── test_phonetic_soundex.py
│ │ ├── test_phonetic_soundex_br.py
│ │ ├── test_phonetic_spanish_metaphone.py
│ │ ├── test_phonetic_spfc.py
│ │ ├── test_phonetic_statistics_canada.py
│ │ └── test_phonetic_waahlin.py
│ ├── stats/
│ │ ├── __init__.py
│ │ ├── test_stats_confusion_table.py
│ │ ├── test_stats_mean.py
│ │ └── test_stats_pairwise.py
│ ├── stemmer/
│ │ ├── __init__.py
│ │ ├── test_stemmer__snowball.py
│ │ ├── test_stemmer__stemmer.py
│ │ ├── test_stemmer_caumanns.py
│ │ ├── test_stemmer_clef_german.py
│ │ ├── test_stemmer_clef_german_plus.py
│ │ ├── test_stemmer_clef_swedish.py
│ │ ├── test_stemmer_lovins.py
│ │ ├── test_stemmer_paice_husk.py
│ │ ├── test_stemmer_porter.py
│ │ ├── test_stemmer_porter2.py
│ │ ├── test_stemmer_s_stemmer.py
│ │ ├── test_stemmer_schinke.py
│ │ ├── test_stemmer_snowball_danish.py
│ │ ├── test_stemmer_snowball_dutch.py
│ │ ├── test_stemmer_snowball_german.py
│ │ ├── test_stemmer_snowball_norwegian.py
│ │ ├── test_stemmer_snowball_swedish.py
│ │ └── test_stemmer_uealite.py
│ ├── tokenizer/
│ │ ├── __init__.py
│ │ ├── test_tokenizer__tokenizer.py
│ │ ├── test_tokenizer_c_or_v_cluster.py
│ │ ├── test_tokenizer_character.py
│ │ ├── test_tokenizer_cv_cluster.py
│ │ ├── test_tokenizer_legalipy.py
│ │ ├── test_tokenizer_nltk.py
│ │ ├── test_tokenizer_q_grams.py
│ │ ├── test_tokenizer_q_skipgrams.py
│ │ ├── test_tokenizer_regexp.py
│ │ ├── test_tokenizer_saps.py
│ │ ├── test_tokenizer_sonoripy.py
│ │ ├── test_tokenizer_vc_cluster.py
│ │ ├── test_tokenizer_whitespace.py
│ │ └── test_tokenizer_wordpunct.py
│ └── util/
│ ├── __init__.py
│ ├── test_data.py
│ ├── test_ncr.py
│ └── test_prod.py
└── tox.ini
================================================
FILE CONTENTS
================================================
================================================
FILE: .circleci/config.yml
================================================
# Python CircleCI 2.0 configuration file
#
# Check https://circleci.com/docs/2.0/language-python/ for more details
#
version: 2
jobs:
build:
docker:
# specify the version you desire here
# use `-browsers` prefix for selenium tests, e.g. `3.6.1-browsers`
- image: circleci/python:3.7
# Specify service dependencies here if necessary
# CircleCI maintains a library of pre-built images
# documented at https://circleci.com/docs/2.0/circleci-images/
# - image: circleci/postgres:9.4
working_directory: ~/repo
steps:
- checkout
# Download and cache dependencies
- restore_cache:
keys:
- v1-dependencies-{{ checksum "requirements.txt" }}
# fallback to using the latest cache if no exact match is found
- v1-dependencies-
- run:
name: install dependencies
command: |
python3 -m venv venv
. venv/bin/activate
python3 -m pip install --upgrade pip
pip install cython
echo "tox" >> requirements.txt
pip install -r requirements.txt -r requirements-test.txt
- save_cache:
paths:
- ./venv
key: v1-dependencies-{{ checksum "requirements.txt" }}
# Run the test suite.
# This project drives its tests through tox (see tox.ini), which in turn
# invokes the configured Python test runner:
# https://tox.readthedocs.io
- run:
name: run tests
command: |
. venv/bin/activate
tox -e py37
- store_artifacts:
path: test-reports
destination: test-reports
================================================
FILE: .codeclimate.yml
================================================
---
version: "2"
plugins:
sonar-python:
enabled: true
fixme:
enabled: true
pylint:
enabled: true
radon:
enabled: true
duplication:
enabled: true
git-legal:
enabled: true
shellcheck:
enabled: true
exclude_patterns:
- "tests/"
- "data/"
- "helpers/"
- "docs/"
- "setup.py"
- "badge_update.py"
- "_beider_morse_data.py"
================================================
FILE: .coveragerc
================================================
[run]
source = abydos
omit =
*/tests/*
parallel = True
branch = True
[report]
exclude_lines =
pragma: no cover
if __name__ == .__main__.:
================================================
FILE: .github/CODEOWNERS
================================================
# Lines starting with '#' are comments.
# Each line is a file pattern followed by one or more owners.
# These owners will be the default owners for everything in the repo.
* @chrislit
# Order is important. The last matching pattern has the most precedence.
# So if a pull request only touches javascript files, only these owners
# will be requested to review.
# You can also use email addresses if you prefer.
================================================
FILE: .gitignore
================================================
build/
cover/
dist/
flake8/
mypy/
abydos.egg-info/
venv/
.settings/
*.bak
*.pyc
*.log
.coverage
.idea/
.mypy_cache/
.pyre/
.tox/
.ipynb_checkpoints
Untitled*.ipynb
================================================
FILE: .gitmodules
================================================
[submodule "tests/regtests"]
path = tests/regression
url = https://github.com/chrislit/abydos-regtests.git
================================================
FILE: .project
================================================
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>abydos</name>
	<comment></comment>
	<projects>
	</projects>
	<buildSpec>
		<buildCommand>
			<name>org.python.pydev.PyDevBuilder</name>
			<arguments>
			</arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.python.pydev.pythonNature</nature>
	</natures>
</projectDescription>
================================================
FILE: .pypirc
================================================
[distutils]
index-servers=
pypi
testpypi
[testpypi]
repository: https://test.pypi.org/legacy/
username: chrislit
#password:
[pypi]
username: chrislit
#password:
================================================
FILE: .pyup.yml
================================================
# autogenerated pyup.io config file
# see https://pyup.io/docs/configuration/ for all available options
update: "insecure"
schedule: "every day"
pin: False
================================================
FILE: .travis.yml
================================================
language: python
jobs:
include:
- os: linux
arch: arm64
python: 3.7
- os: linux
arch: ppc64le
python: 3.7
- os: linux
dist: xenial
python: 3.5
- os: linux
dist: xenial
python: 3.6
- os: linux
dist: xenial
python: 3.7
- os: linux
dist: xenial
python: 3.8
notifications:
email: false
# Install packages
install:
- travis_retry python -m pip install --upgrade pip
- travis_retry pip install cython
- travis_retry pip install paq lzss coveralls -r requirements.txt -r requirements-test.txt
- travis_retry python setup.py install
# Run test
script:
- nosetests --verbose --with-coverage --cover-erase --cover-branches --cover-package=abydos --logging-level=INFO --process-timeout=60 --process-restartworker
# Calculate coverage
after_success:
- coveralls --verbose --rcfile=.coveragerc
================================================
FILE: AUTHORS.rst
================================================
Creator & Maintainer
````````````````````
- Christopher C. Little (`@chrislit <https://github.com/chrislit>`_)
Contributors
````````````
- Szolár Balázs (`@LEFTazs <https://github.com/LEFTazs>`_)
================================================
FILE: CODE_OF_CONDUCT.rst
================================================
Contributor Covenant Code of Conduct
====================================
Our Pledge
----------
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age,
body size, disability, ethnicity, gender identity and expression, level of
experience, nationality, personal appearance, race, religion, or sexual
identity and orientation.
Our Standards
-------------
Examples of behavior that contributes to creating a positive environment
include:
- Using welcoming and inclusive language
- Being respectful of differing viewpoints and experiences
- Gracefully accepting constructive criticism
- Focusing on what is best for the community
- Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
- The use of sexualized language or imagery and unwelcome sexual attention or
advances
- Trolling, insulting/derogatory comments, and personal or political attacks
- Public or private harassment
- Publishing others' private information, such as a physical or electronic
address, without explicit permission
- Other conduct which could reasonably be considered inappropriate in a
professional setting
Our Responsibilities
--------------------
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
Scope
-----
This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an
appointed representative at an online or offline event. Representation of a
project may be further defined and clarified by project maintainers.
Enforcement
-----------
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at chrisclittle+abydos@gmail.com. The
project team will review and investigate all complaints, and will respond in a
way that it deems appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an
incident. Further details of specific enforcement policies may be posted
separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
Attribution
-----------
This Code of Conduct is adapted from the `Contributor Covenant`_,
version 1.4, available at `http://contributor-covenant.org/version/1/4`_
.. _Contributor Covenant: http://contributor-covenant.org
.. _`http://contributor-covenant.org/version/1/4`: http://contributor-covenant.org/version/1/4/
================================================
FILE: CODING_STANDARDS.rst
================================================
CODING STANDARDS
----------------
- Nosetest will be used for testing.
- Flake8 will be used for best practice conformance.
- Pydocstyle will be used to ensure documentation style conformance to PEP257
(for the most part) and NumPy documentation style.
- Black will be used to keep code style consistent.
- 3rd party packages may be used, but must be present in both PyPI and conda
or conda-forge. They must also support all supported Python versions.
----
git commits
~~~~~~~~~~~
Each commit should be a minimal unit of code or represent minimal changes.
Avoid doing multiple things in a single commit, but describe them in separate
lines of the commit log if this does occur.
git pushes
~~~~~~~~~~
A git push should be performed only under the following conditions:
- library is syntactically correct (compiling correctly) in Python 3
- library passes all tests and doctests according to nosetests in Python 3
- test coverage is 100% according to nosetests
- flake8 and pydocstyle should report 0 issues
- black code styling has been applied
Notes on architecture
~~~~~~~~~~~~~~~~~~~~~
As of the 0.3.6 release, each major algorithm of the compression, distance,
fingerprint, phonetic, & stemmer subpackages has been moved into a class of its
own. The distance, fingerprint, phonetic, & stemmer classes each inherit from
respectively common classes that define basic methods for these four major
types of classes.
The old functional API for these subpackages has been retained for backwards
compatibility until the release of version 0.6, but its use is deprecated as
of version 0.4. New classes (those not present at the release of version 0.3.6)
will not be given functional API wrappers.
Although, as of the 0.3.6 release, many of the classes that are pre-0.3.6
functions encapsulated in a class simply consist of a single method that
could be a static method, making these methods static is generally avoided.
As development continues, these classes will take more advantage of object
architecture to store parameters between calls and inherit from base classes.
================================================
FILE: FAQ.rst
================================================
FAQ
===
Why is the library licensed under GPL3+? Can you change the license?
--------------------------------------------------------------------
GPL3 is the only license compatible with all of the various parts of
Abydos that have been ported to Python from other languages. For example,
the Beider-Morse Phonetic Matching algorithm implementation included in
Abydos was ported from their reference implementation in PHP, which is
itself licensed under GPL3.
Accordingly, it's not possible to change to a different license without
removing parts of the library. However, if you have a need for a specific
part of the library and can't use GPL3+ code, contact us and we may be able
to provide it separately or can give guidance on its underlying licensing
status.
What is the purpose of this library?
------------------------------------
Abydos is intended to facilitate any manner of string transformation and
comparison that might be useful for string matching or record linkage. The two
most significant parts of the library are string distance/similarity
measures and phonetic algorithms/string fingerprint algorithms, but a large
collection of tokenizers, corpus classes, compression algorithms, &
phonetics functions support these and afford greater customization.
Can you add this new feature?
-----------------------------
Maybe. Open an issue at https://github.com/chrislit/abydos/issues and
propose your new feature.
Additional string distance/similarity measures,
phonetic algorithms, string fingerprint algorithms, and string tokenizers
will certainly be added if possible -- but it's helpful to point them
out since we may not be aware of them.
Can I contribute to the project?
--------------------------------
Absolutely. You can take on an unclaimed issue, report bugs, add new
classes, or whatever piques your interest. You are welcome to open an
issue at https://github.com/chrislit/abydos/issues proposing what you'd
like to work on, or you can submit a pull request if you have something
ready to contribute to the repository.
Will you add Metaphone 3?
-------------------------
No. Although Lawrence Philips (author of Metaphone, Double Metaphone, and
Metaphone 3) released Metaphone 3 version 2.1.3 under the BSD 3-clause
license as part of Google Refine, which became OpenRefine
(https://github.com/OpenRefine/OpenRefine/blob/master/main/src/com/google/refine/clustering/binning/Metaphone3.java),
he doesn't want that code used for ports to other languages or used in any
way outside of OpenRefine. In accordance with his wishes, no one has
released Metaphone 3 ports to other languages or included it in other
libraries.
Why have you included algorithm X when it is already a part of NLTK/SciPy/...?
------------------------------------------------------------------------------
Abydos is a collection of algorithms with common class & function
interfaces and options. So, while NLTK has Levenshtein & Jaccard string
similarity measures, they don't allow for tunable edit costs or using
the tokenizer of your choice.
Are there similar projects for languages other than Python?
-----------------------------------------------------------
Yes, there are libraries such as:
- Talisman_ for JavaScript
- Phonics_ for R (phonetic algorithms)
- stringmetric_ for Scala
.. _Talisman: https://github.com/Yomguithereal/talisman
.. _Phonics: https://github.com/howardjp/phonics
.. _stringmetric: https://github.com/rockymadden/stringmetric
What is the process for adding a new class to the library?
----------------------------------------------------------
The process of adding a new class follows roughly the following steps:
- Discover that a new (unimplemented) measure/algorithm/method exists
- Locate the original source of the algorithm (a journal article, a
reference implementation, etc.). And save the reference to it in
docs/abydos.bib.
- If the original source cannot be located for reference, use an
adequate secondary source and add its reference info to
docs/abydos.bib.
- Implement the class based on its description/reference implementation.
- Create a test class and add all examples and test cases from the
original source. Add other reliable test cases from other sources, if
they are available.
- Ensure that the class passes all test cases.
- Add test cases, as necessary, until test coverage reaches 100%, or as
close to 100% as possible.
Are these really Frequently Asked Questions?
--------------------------------------------
No. Most of these questions have never been explicitly asked.
================================================
FILE: HISTORY.rst
================================================
Release History
---------------
0.6.0 (2020-00-00) *frija*
++++++++++++++++++++++++++
doi:10.5281/zenodo.
Changes:
- The deprecated function-based API wrappers were removed.
- Added type hints
- Made all phonetic algorithms' encode & encode_alpha methods and all string
fingerprinters' fingerprint methods return values of type str.
0.5.0 (2020-01-10) *ecgtheow*
+++++++++++++++++++++++++++++
doi:10.5281/zenodo.3603514
Changes:
- Support for Python 2.7 was removed.
0.4.1 (2020-01-07) *distant dietrich*
+++++++++++++++++++++++++++++++++++++
doi:10.5281/zenodo.3600548
Changes:
- Support for Python 3.4 was removed. (3.4 reached end-of-life on March 18,
2019)
- Fuzzy intersections were corrected to avoid over-counting partial
intersection instances.
- Levenshtein can now return an optimal alignment
- Added the following distance measures:
- Indice de Similitude-Guth (ISG)
- INClusion Programme
- Guth
- Victorian Panel Study (VPS) score
- LIG3 similarity
- Discounted Levenshtein
- Relaxed Hamming
- String subsequence kernel (SSK) similarity
- Phonetic edit distance
- Henderson-Heron dissimilarity
- Raup-Crick similarity
- Millar's binomial deviance dissimilarity
- Morisita similarity
- Horn-Morisita similarity
- Clark's coefficient of divergence
- Chao's Jaccard similarity
- Chao's Dice similarity
- Cao's CY similarity (CYs) and dissimilarity (CYd)
- Added the following fingerprint classes:
- Taft's Consonant coding
- Taft's Extract - letter list
- Taft's Extract - position & frequency
- L.A. County Sheriff's System
- Library of Congress Cutter table encoding
- Added the following phonetic algorithms:
- Ainsworth's grapheme-to-phoneme
- PHONIC
0.4.0 (2019-05-30) *dietrich*
+++++++++++++++++++++++++++++
doi:10.5281/zenodo.3235034
Version 0.4.0 focuses on distance measures, adding 211 new measures. Attempts
were made to provide normalized versions for measures that did not inherently
range from 0 to 1. The other major focus was the addition of 12 tokenizers, in
service of expanding distance measure options.
Changes:
- Support for Python 3.3 was dropped.
- Deprecated functions that merely wrap class methods to maintain API
compatibility, for removal in 0.6.0
- Added methods to ConfusionTable to return:
- its internal representation
- false negative rate
- false omission rate
- positive & negative likelihood ratios
- diagnostic odds ratio
- error rate
- prevalence
- Jaccard index
- D-measure
- Phi coefficient
- joint, actual, & predicted entropies
- mutual information
- proficiency (uncertainty coefficient)
- information gain ratio
- dependency
- lift
- Deprecated f-measure & g-measure from ConfusionTable for removal in
0.6.0
- Added notes to indicate when functions, classes, & methods were added
- Added the following 12 tokenizers:
- QSkipgrams
- CharacterTokenizer
- RegexpTokenizer, WhitespaceTokenizer, & WordpunctTokenizer
- COrVClusterTokenizer, CVClusterTokenizer, & VCClusterTokenizer
- SonoriPyTokenizer & LegaliPyTokenizer
- NLTKTokenizer
- SAPSTokenizer
- Added the UnigramCorpus class & a facility for downloading data, such as
pre-processed/trained data, from storage on GitHub
- Added the Wåhlin phonetic encoding
- Added the following 211 similarity/distance/correlation measures:
- ALINE
- AMPLE
- Anderberg
- Andres & Marzo's Delta
- Average Linkage
- AZZOO
- Baroni-Urbani & Buser I & II
- Batagelj & Bren
- Baulieu I-XV
- Benini I & II
- Bennet
- Bhattacharyya
- BI-SIM
- BLEU
- Block Levenshtein
- Brainerd-Robinson
- Braun-Blanquet
- Canberra
- Chord
- Clement
- Cohen's Kappa
- Cole
- Complete Linkage
- Consonni & Todeschini I-V
- Cormode's LZ
- Covington
- Dennis
- Dice Asymmetric I & II
- Digby
- Dispersion
- Doolittle
- Dunning
- Eyraud
- Fager & McGowan
- Faith
- Fellegi-Sunter
- Fidelity
- Fleiss
- Fleiss-Levin-Paik
- FlexMetric
- Forbes I & II
- Fossum
- FuzzyWuzzy Partial String
- FuzzyWuzzy Token Set
- FuzzyWuzzy Token Sort
- Generalized Fleiss
- Gilbert
- Gilbert & Wells
- Gini I & II
- Goodall
- Goodman & Kruskal's Lambda
- Goodman & Kruskal's Lambda-r
- Goodman & Kruskal's Tau A & B
- Gower & Legendre
- Guttman's Lambda A & B
- Gwet's AC
- Hamann
- Harris & Lahey
- Hassanat
- Hawkins & Dotson
- Hellinger
- Higuera & Mico
- Hurlbert
- Iterative SubString
- Jaccard-NM
- Jensen-Shannon
- Johnson
- Kendall's Tau
- Kent & Foster I & II
- Koppen I & II
- Kuder & Richardson
- Kuhns I-XII
- Kulczynski I & II
- Longest Common Prefix
- Longest Common Suffix
- Lorentzian
- Maarel
- Marking
- Marking Metric
- MASI
- Matusita
- Maxwell & Pilliner
- McConnaughey
- McEwen & Michael
- MetaLevenshtein
- Michelet
- MinHash
- Mountford
- Mean Squared Contingency
- Mutual Information
- NCD with LZSS
- NCD with PAQ9a
- Ozbay
- Pattern
- Pearson's Chi-Squared
- Pearson & Heron II
- Pearson II & III
- Pearson's Phi
- Peirce
- Positional Q-Gram Dice, Jaccard, & Overlap
- Q-Gram
- Quantitative Cosine, Dice, & Jaccard
- Rees-Levenshtein
- Roberts
- Rogers & Tanimoto
- Rogot & Goldberg
- Rouge-L, -S, -SU, & -W
- Russell & Rao
- SAPS
- Scott's Pi
- Shape
- Shapira & Storer I
- Sift4 Extended
- Single Linkage
- Size
- Soft Cosine
- SoftTF-IDF
- Sokal & Michener
- Sokal & Sneath I-V
- Sorgenfrei
- Steffensen
- Stiles
- Stuart's Tau
- Tarantula
- Tarwid
- Tetrachoric
- TF-IDF
- Tichy
- Tulloss's R, S, T, & U
- Unigram Subtuple
- Unknown A-M
- Upholt
- Warrens I-V
- Weighted Jaccard
- Whittaker
- Yates' Chi-Squared
- YJHHR
- Yujian & Bo
- Yule's Q, Q II, & Y
- Four intersection types are now supported for all distance measures that are
based on _TokenDistance. In addition to basic crisp intersections, soft,
fuzzy, and group linkage intersections have been provided.
0.3.6 (2018-11-17) *classy carl*
++++++++++++++++++++++++++++++++
doi:10.5281/zenodo.1490537
Changes:
- Most functions were encapsulated into classes.
- Each class is broken out into its own file, with test files paralleling
library files.
- Documentation was converted from Sphinx markup to Numpy style.
- A tutorial was written for each subpackage.
- Documentation was cleaned up, with math markup corrections and many
additional links.
0.3.5 (2018-10-31) *cantankerous carl*
++++++++++++++++++++++++++++++++++++++
doi:10.5281/zenodo.1463204
Version 0.3.5 focuses on refactoring the whole project. The API itself remains
largely the same as in previous versions, but underlyingly modules have been
split up. Essentially no new features are added (bugfixes aside) in this
version.
Changes:
- Refactored library and tests into smaller modules
- Broke compression distances (NCD) out into separate functions
- Adopted Black code style
- Added pyproject.toml to use Poetry for packaging (but will continue using
setuptools and setup.py for the present)
- Minor bug fixes
0.3.0 (2018-10-15) *carl*
+++++++++++++++++++++++++
doi:10.5281/zenodo.1462443
Version 0.3.0 focuses on additional phonetic algorithms, but does add numerous
distance measures, fingerprints, and even a few stemmers. Another focus was
getting everything to build again (including docs) and to move to more
standard modern tools (flake8, tox, etc.).
Changes:
- Fixed implementation of Bag distance
- Updated BMPM to version 3.10
- Fixed Sphinx documentation on readthedocs.org
- Split string fingerprints out of clustering into their own module
- Added support for q-grams to skip-n characters
- New phonetic algorithms:
- Statistics Canada
- Lein
- Roger Root
- Oxford Name Compression Algorithm (ONCA)
- Eudex phonetic hash
- Haase Phonetik
- Reth-Schek Phonetik
- FONEM
- Parmar-Kumbharana
- Davidson's Consonant Code
- SoundD
- PSHP Soundex/Viewex Coding
- an early version of Henry Code
- Norphone
- Dolby Code
- Phonetic Spanish
- Spanish Metaphone
- MetaSoundex
- SoundexBR
- NRL English-to-phoneme
- New string fingerprints:
- Cisłak & Grabowski's occurrence fingerprint
- Cisłak & Grabowski's occurrence halved fingerprint
- Cisłak & Grabowski's count fingerprint
- Cisłak & Grabowski's position fingerprint
- Synoname Toolcode
- New distance measures:
- Minkowski distance & similarity
- Manhattan distance & similarity
- Euclidean distance & similarity
- Chebyshev distance & similarity
- Eudex distances
- Sift4 distance
- Baystat distance & similarity
- Typo distance
- Indel distance
- Synoname
- New stemmers:
- UEA-Lite Stemmer
- Paice-Husk Stemmer
- Schinke Latin stemmer
- S stemmer
- Eliminated ._compat submodule in favor of six
- Transitioned from PEP8 to flake8, etc.
- Phonetic algorithms now consistently use max_length=-1 to indicate that
there should be no length limit
- Added example notebooks in binder directory
0.2.0 (2015-05-27) *berthold*
+++++++++++++++++++++++++++++
- Added Caumanns' German stemmer
- Added Lovins' English stemmer
- Updated Beider-Morse Phonetic Matching to 3.04
- Added Sphinx documentation
0.1.1 (2015-05-12) *albrecht*
+++++++++++++++++++++++++++++
- First Beta release to PyPI
================================================
FILE: LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc.
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
{one line to give the program's name and a brief idea of what it does.}
Copyright (C) {year} {name of author}
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
{project} Copyright (C) {year} {fullname}
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<https://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
.
================================================
FILE: MANIFEST.in
================================================
include *.rst LICENSE abydos.png
================================================
FILE: Pipfile
================================================
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[dev-packages]
tox = "*"
nose = "*"
coverage = "*"
nltk = "*"
syllabipy = "*"
[packages]
numpy = "*"
deprecation = "*"
================================================
FILE: README.rst
================================================
Abydos
======
+------------------+------------------------------------------------------+
| CI & Test Status | |travis| |circle| |azure| |semaphore| |coveralls| |
+------------------+------------------------------------------------------+
| Code Quality | |codeclimate| |scrutinizer| |codacy| |codefactor| |
+------------------+------------------------------------------------------+
| Dependencies | |requires| |snyk| |pyup| |cii| |black| |
+------------------+------------------------------------------------------+
| Local Analysis | |pylint| |flake8| |pydocstyle| |sloccount| |mypy| |
+------------------+------------------------------------------------------+
| Usage | |docs| |mybinder| |license| |sourcerank| |zenodo| |
+------------------+------------------------------------------------------+
| Contribution | |openhub| |gh-commits| |gh-issues| |gh-stars| |
+------------------+------------------------------------------------------+
| PyPI | |pypi| |pypi-dl| |pypi-ver| |
+------------------+------------------------------------------------------+
| conda-forge | |conda| |conda-dl| |conda-platforms| |
+------------------+------------------------------------------------------+
.. |travis| image:: https://travis-ci.org/chrislit/abydos.svg?branch=master
:target: https://travis-ci.org/chrislit/abydos
:alt: Travis-CI Build Status
.. |circle| image:: https://circleci.com/gh/chrislit/abydos/tree/master.svg?style=shield
:target: https://circleci.com/gh/chrislit/abydos/tree/master
:alt: Circle-CI Build Status
.. |azure| image:: https://dev.azure.com/chrislit/abydos/_apis/build/status/chrislit.abydos?branchName=master
:target: https://dev.azure.com/chrislit/abydos/_build/latest?definitionId=1
:alt: Azure Pipelines Build Status
.. |semaphore| image:: https://semaphoreci.com/api/v1/chrislit/abydos/branches/master/shields_badge.svg
:target: https://semaphoreci.com/chrislit/abydos
:alt: Semaphore Build Status
.. |coveralls| image:: https://coveralls.io/repos/github/chrislit/abydos/badge.svg?branch=master
:target: https://coveralls.io/github/chrislit/abydos?branch=master
:alt: Coverage Status
.. |codeclimate| image:: https://codeclimate.com/github/chrislit/abydos/badges/gpa.svg
:target: https://codeclimate.com/github/chrislit/abydos
:alt: Code Climate
.. |scrutinizer| image:: https://scrutinizer-ci.com/g/chrislit/abydos/badges/quality-score.png?b=master
:target: https://scrutinizer-ci.com/g/chrislit/abydos/?branch=master
:alt: Scrutinizer
.. |codacy| image:: https://api.codacy.com/project/badge/Grade/db79f2c31ea142fb9b5938abe87b0854
:target: https://www.codacy.com/app/chrislit/abydos?utm_source=github.com&utm_medium=referral&utm_content=chrislit/abydos&utm_campaign=Badge_Grade
:alt: Codacy
.. |codefactor| image:: https://www.codefactor.io/repository/github/chrislit/abydos/badge
:target: https://www.codefactor.io/repository/github/chrislit/abydos
:alt: CodeFactor
.. |requires| image:: https://requires.io/github/chrislit/abydos/requirements.svg?branch=master
:target: https://requires.io/github/chrislit/abydos/requirements/?branch=master
:alt: Requirements Status
.. |snyk| image:: https://snyk.io/test/github/chrislit/abydos/badge.svg?targetFile=requirements.txt
:target: https://snyk.io/test/github/chrislit/abydos?targetFile=requirements.txt
:alt: Known Vulnerabilities
.. |pyup| image:: https://pyup.io/repos/github/chrislit/abydos/shield.svg
:target: https://pyup.io/repos/github/chrislit/abydos/
:alt: Updates
.. |cii| image:: https://bestpractices.coreinfrastructure.org/projects/1598/badge
:target: https://bestpractices.coreinfrastructure.org/projects/1598
:alt: CII Best Practices
.. |black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
:target: https://github.com/ambv/black
:alt: black
.. |pylint| image:: https://img.shields.io/badge/Pylint-9.13/10-yellowgreen.svg
:target: #
:alt: Pylint Score
.. |flake8| image:: https://img.shields.io/badge/flake8-0-brightgreen.svg
:target: #
:alt: flake8 Errors
.. |pydocstyle| image:: https://img.shields.io/badge/pydocstyle-0-brightgreen.svg
:target: #
:alt: pydocstyle Errors
.. |sloccount| image:: https://img.shields.io/badge/SLOCCount-40,079-blue.svg
:target: #
:alt: SLOCCount
.. |mypy| image:: https://img.shields.io/badge/mypy-1.87%25%20imprecise-1F5082.svg
:target: #
:alt: mypy Imprecision
.. |docs| image:: https://readthedocs.org/projects/abydos/badge/?version=latest
:target: https://abydos.readthedocs.org/en/latest/
:alt: Documentation Status
.. |mybinder| image:: https://img.shields.io/badge/launch-binder-579aca.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFkAAABZCAMAAABi1XidAAAB8lBMVEX///9XmsrmZYH1olJXmsr1olJXmsrmZYH1olJXmsr1olJXmsrmZYH1olL1olJXmsr1olJXmsrmZYH1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olJXmsrmZYH1olL1olL0nFf1olJXmsrmZYH1olJXmsq8dZb1olJXmsrmZYH1olJXmspXmspXmsr1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olLeaIVXmsrmZYH1olL1olL1olJXmsrmZYH1olLna31Xmsr1olJXmsr1olJXmsrmZYH1olLqoVr1olJXmsr1olJXmsrmZYH1olL1olKkfaPobXvviGabgadXmsqThKuofKHmZ4Dobnr1olJXmsr1olJXmspXmsr1olJXmsrfZ4TuhWn1olL1olJXmsqBi7X1olJXmspZmslbmMhbmsdemsVfl8ZgmsNim8Jpk8F0m7R4m7F5nLB6jbh7jbiDirOEibOGnKaMhq+PnaCVg6qWg6qegKaff6WhnpKofKGtnomxeZy3noG6dZi+n3vCcpPDcpPGn3bLb4/Mb47UbIrVa4rYoGjdaIbeaIXhoWHmZYHobXvpcHjqdHXreHLroVrsfG/uhGnuh2bwj2Hxk17yl1vzmljzm1j0nlX1olL3AJXWAAAAbXRSTlMAEBAQHx8gICAuLjAwMDw9PUBAQEpQUFBXV1hgYGBkcHBwcXl8gICAgoiIkJCQlJicnJ2goKCmqK+wsLC4usDAwMjP0NDQ1NbW3Nzg4ODi5+3v8PDw8/T09PX29vb39/f5+fr7+/z8/Pz9/v7+zczCxgAABC5JREFUeAHN1ul3k0UUBvCb1CTVpmpaitAGSLSpSuKCLWpbTKNJFGlcSMAFF63iUmRccNG6gLbuxkXU66JAUef/9LSpmXnyLr3T5AO/rzl5zj137p136BISy44fKJXuGN/d19PUfYeO67Znqtf2KH33Id1psXoFdW30sPZ1sMvs2D060AHqws4FHeJojLZqnw53cmfvg+XR8mC0OEjuxrXEkX5ydeVJLVIlV0e10PXk5k7dYeHu7Cj1j+49uKg7uLU61tGLw1lq27ugQYlclHC4bgv7VQ+TAyj5Zc/UjsPvs1sd5cWryWObtvWT2EPa4rtnWW3JkpjggEpbOsPr7F7EyNewtpBIslA7p43HCsnwooXTEc3UmPmCNn5lrqTJxy6nRmcavGZVt/3Da2pD5NHvsOHJCrdc1G2r3DITpU7yic7w/7Rxnjc0kt5GC4djiv2Sz3Fb2iEZg41/ddsFDoyuYrIkmFehz0HR2thPgQqMyQYb2OtB0WxsZ3BeG3+wpRb1vzl2UYBog8FfGhttFKjtAclnZYrRo9ryG9uG/FZQU4AEg8ZE9LjGMzTmqKXPLnlWVnIlQQTvxJf8ip7VgjZjyVPrjw1te5otM7RmP7xm+sK2Gv9I8Gi++BRbEkR9EBw8zRUcKxwp73xkaLiqQb+kGduJTNHG72zcW9LoJgqQxpP3/Tj//c3yB0tqzaml05/+orHLksVO+95kX7/7qgJvnjlrfr2Ggsyx0eoy9uPzN5SPd86aXggOsEKW2Prz7du3VID3/tzs/sSRs2w7ovVHKtjrX2pd7ZMlTxAYfBAL9jiDwfLkq55Tm7ifhMlTGPyCAs7RFRhn47JnlcB9RM5T97ASuZXIcVNuUDIndpDbdsfrqsOppeXl5Y+XVKdjFCTh+zGaVuj0d9zy05PPK3QzBamxdwtTCrzyg/2Rvf2EstUjordGwa/kx9mSJLr8mLLtCW8HHGJc2R5hS219IiF6PnTusOqcMl57gm0Z8kanKMAQg
0qSyuZfn7zItsbGyO9QlnxY0eCuD1XL2ys/MsrQhltE7Ug0uFOzufJFE2PxBo/YAx8XPPdDwWN0MrDRYIZF0mSMKCNHgaIVFoBbNoLJ7tEQDKxGF0kcLQimojCZopv0OkNOyWCCg9XMVAi7ARJzQdM2QUh0gmBozjc3Skg6dSBRqDGYSUOu66Zg+I2fNZs/M3/f/Grl/XnyF1Gw3VKCez0PN5IUfFLqvgUN4C0qNqYs5YhPL+aVZYDE4IpUk57oSFnJm4FyCqqOE0jhY2SMyLFoo56zyo6becOS5UVDdj7Vih0zp+tcMhwRpBeLyqtIjlJKAIZSbI8SGSF3k0pA3mR5tHuwPFoa7N7reoq2bqCsAk1HqCu5uvI1n6JuRXI+S1Mco54YmYTwcn6Aeic+kssXi8XpXC4V3t7/ADuTNKaQJdScAAAAAElFTkSuQmCC
:target: https://mybinder.org/v2/gh/chrislit/abydos/master?filepath=binder
:alt: Binder
.. |license| image:: https://img.shields.io/badge/License-GPL%20v3+-blue.svg?logo=gnu
:target: https://www.gnu.org/licenses/gpl-3.0
:alt: License: GPL v3.0+
.. |sourcerank| image:: https://img.shields.io/librariesio/sourcerank/pypi/abydos.svg
:target: https://libraries.io/pypi/abydos
:alt: Libraries.io SourceRank
.. |zenodo| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.3603514.svg
:target: https://doi.org/10.5281/zenodo.3603514
:alt: Zenodo
.. |openhub| image:: https://www.openhub.net/p/abydosnlp/widgets/project_thin_badge.gif
:target: https://www.openhub.net/p/abydosnlp
:alt: OpenHUB
.. |gh-commits| image:: https://img.shields.io/github/commit-activity/y/chrislit/abydos.svg?logo=github
:target: https://github.com/chrislit/abydos/graphs/commit-activity
:alt: GitHub Commits
.. |gh-issues| image:: https://img.shields.io/github/issues-closed/chrislit/abydos.svg?logo=github
:target: https://github.com/chrislit/abydos/issues?q=
:alt: GitHub Issues Closed
.. |gh-stars| image:: https://img.shields.io/github/stars/chrislit/abydos.svg?logo=github
:target: https://github.com/chrislit/abydos/stargazers
:alt: GitHub Stars
.. |pypi| image:: https://img.shields.io/pypi/v/abydos.svg?logo=python&logoColor=white
:target: https://pypi.python.org/pypi/abydos
:alt: PyPI
.. |pypi-dl| image:: https://img.shields.io/pypi/dm/abydos.svg?logo=python&logoColor=white
:target: https://pypi.python.org/pypi/abydos
:alt: PyPI downloads/month
.. |pypi-ver| image:: https://img.shields.io/pypi/pyversions/abydos.svg?logo=python&logoColor=white
:target: https://pypi.python.org/pypi/abydos
:alt: PyPI versions
.. |conda| image:: https://img.shields.io/conda/vn/conda-forge/abydos.svg?logo=conda-forge
:target: https://anaconda.org/conda-forge/abydos
:alt: conda-forge
.. |conda-dl| image:: https://img.shields.io/conda/dn/conda-forge/abydos.svg?logo=conda-forge
:target: https://anaconda.org/conda-forge/abydos
:alt: conda-forge downloads
.. |conda-platforms| image:: https://img.shields.io/conda/pn/conda-forge/abydos.svg?logo=conda-forge
:target: https://anaconda.org/conda-forge/abydos
:alt: conda-forge platforms
|
.. image:: https://raw.githubusercontent.com/chrislit/abydos/master/abydos-small.png
:target: https://github.com/chrislit/abydos
:alt: abydos
:align: right
|
| `Abydos NLP/IR library <https://github.com/chrislit/abydos>`_
| Copyright 2014-2020 by Christopher C. Little
Abydos is a library of phonetic algorithms, string distance measures & metrics,
stemmers, and string fingerprinters including:
- Phonetic algorithms
- Robert C. Russell's Index
- American Soundex
- Refined Soundex
- Daitch-Mokotoff Soundex
- Kölner Phonetik
- NYSIIS
- Match Rating Algorithm
- Metaphone
- Double Metaphone
- Caverphone
- Alpha Search Inquiry System
- Fuzzy Soundex
- Phonex
- Phonem
- Phonix
- SfinxBis
- phonet
- Standardized Phonetic Frequency Code
- Statistics Canada
- Lein
- Roger Root
- Oxford Name Compression Algorithm (ONCA)
- Eudex phonetic hash
- Haase Phonetik
- Reth-Schek Phonetik
- FONEM
- Parmar-Kumbharana
- Davidson's Consonant Code
- SoundD
- PSHP Soundex/Viewex Coding
- an early version of Henry Code
- Norphone
- Dolby Code
- Phonetic Spanish
- Spanish Metaphone
- MetaSoundex
- SoundexBR
- NRL English-to-phoneme
- Beider-Morse Phonetic Matching
- String distance metrics
- Levenshtein distance
- Optimal String Alignment distance
- Levenshtein-Damerau distance
- Hamming distance
- Tversky index
- Sørensen–Dice coefficient & distance
- Jaccard similarity coefficient & distance
- overlap similarity & distance
- Tanimoto coefficient & distance
- Minkowski distance & similarity
- Manhattan distance & similarity
- Euclidean distance & similarity
- Chebyshev distance
- cosine similarity & distance
- Jaro distance
- Jaro-Winkler distance (incl. the strcmp95 algorithm variant)
- Longest common substring
- Ratcliff-Obershelp similarity & distance
- Match Rating Algorithm similarity
- Normalized Compression Distance (NCD) & similarity
- Monge-Elkan similarity & distance
- Matrix similarity
- Needleman-Wunsch score
- Smith-Waterman score
- Gotoh score
- Length similarity
- Prefix, Suffix, and Identity similarity & distance
- Modified Language-Independent Product Name Search (MLIPNS) similarity &
distance
- Bag distance
- Editex distance
- Eudex distances
- Sift4 distance
- Baystat distance & similarity
- Typo distance
- Indel distance
- Synoname
- Stemmers
- the Lovins stemmer
- the Porter and Porter2 (Snowball English) stemmers
- Snowball stemmers for German, Dutch, Norwegian, Swedish, and Danish
- CLEF German, German plus, and Swedish stemmers
- Caumann's German stemmer
- UEA-Lite Stemmer
- Paice-Husk Stemmer
- Schinke Latin stemmer
- S stemmer
- String Fingerprints
- string fingerprint
- q-gram fingerprint
- phonetic fingerprint
- Pollock & Zamora's skeleton key
- Pollock & Zamora's omission key
- Cisłak & Grabowski's occurrence fingerprint
- Cisłak & Grabowski's occurrence halved fingerprint
- Cisłak & Grabowski's count fingerprint
- Cisłak & Grabowski's position fingerprint
- Synoname Toolcode
-----
Installation
============
Required libraries:
- NumPy
- deprecation
Optional libraries (all available on PyPI, some available on conda or
conda-forge):
- `SyllabiPy <https://pypi.org/project/syllabipy/>`_
- `NLTK <https://www.nltk.org/>`_
- `PyLZSS <https://github.com/rumbah/pylzss>`_
- `paq <https://github.com/observerss/paq>`_
To install Abydos (master) from Github source::
git clone https://github.com/chrislit/abydos.git --recursive
cd abydos
python setup.py install
If your default python command calls Python 2.7 but you want to install for
Python 3, you may instead need to call::
python3 setup.py install
To install Abydos (latest release) from PyPI using pip::
pip install abydos
To install from `conda-forge <https://anaconda.org/conda-forge/abydos>`_::
conda install abydos
It should run on Python 3.5-3.8.
Testing & Contributing
======================
To run the whole test-suite just call tox::
tox
The tox setup has the following environments: black, py37, doctest,
regression, fuzz, pylint, pydocstyle, flake8, doc8, docs, sloccount, badges, &
build. So if you only want to generate documentation (in HTML, EPUB, & PDF
formats), just call::
tox -e docs
In order to only run & generate Flake8 reports, call::
tox -e flake8
Contributions such as bug reports, PRs, suggestions, desired new features, etc.
are welcome through Github
`Issues <https://github.com/chrislit/abydos/issues>`_ &
`Pull requests <https://github.com/chrislit/abydos/pulls>`_.
================================================
FILE: VERSION.rst
================================================
0.6.0
================================================
FILE: abydos/__init__.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.
Abydos NLP/IR library by Christopher C. Little
There are nine major packages that make up Abydos:
- :py:mod:`.compression` for string compression classes
- :py:mod:`.corpus` for document corpus classes
- :py:mod:`.distance` for string distance measure & metric classes
- :py:mod:`.fingerprint` for string fingerprint classes
- :py:mod:`.phones` for functions relating to phones and phonemes
- :py:mod:`.phonetic` for phonetic algorithm classes
- :py:mod:`.stats` for statistical functions and a confusion table class
- :py:mod:`.stemmer` for stemming classes
- :py:mod:`.tokenizer` for tokenizer classes
Classes with each package have consistent method names, as discussed below.
A tenth package, :py:mod:`.util`, contains functions not intended for end-user
use.
----
"""
__version__ = '0.6.0'
__all__ = [
'__version__',
'compression',
'corpus',
'distance',
'fingerprint',
'phones',
'phonetic',
'stats',
'stemmer',
'tokenizer',
'util',
]
================================================
FILE: abydos/compression/__init__.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
r"""abydos.compression.
The compression package defines compression and compression-related functions
for use within Abydos, including implementations of the following:
- :py:class:`.Arithmetic` for arithmetic coding
- :py:class:`.BWT` for Burrows-Wheeler Transform
- :py:class:`.RLE` for Run-Length Encoding
Each class exposes ``encode`` and ``decode`` methods for performing and
reversing its encoding. For example, the Burrows-Wheeler Transform can be
performed by creating a :py:class:`.BWT` object and then calling
:py:meth:`.BWT.encode` on a string:
>>> bwt = BWT()
>>> bwt.encode('^BANANA')
'ANNB^AA\x00'
----
"""
from ._arithmetic import Arithmetic
from ._bwt import BWT
from ._rle import RLE
__all__ = [
'Arithmetic',
'BWT',
'RLE',
]
if __name__ == '__main__':
import doctest
doctest.testmod()
================================================
FILE: abydos/compression/_arithmetic.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.compression._arithmetic.
Arithmetic coder/decoder
"""
from collections import Counter
from fractions import Fraction
from typing import Dict, Tuple, Union
__all__ = ['Arithmetic']
class Arithmetic:
"""Arithmetic Coder.
This is based on Andrew Dalke's public domain implementation
:cite:`Dalke:2005`. It has been ported to use the fractions.Fraction class.
.. versionadded:: 0.3.6
"""
_probs = {} # type: Dict[str, Tuple[Fraction, Fraction]]
def __init__(self, text: Union[str, None] = None) -> None:
"""Initialize arithmetic coder object.
Parameters
----------
text : str or None
The training text
.. versionadded:: 0.3.6
"""
if text is not None:
self.train(text)
def get_probs(self) -> Dict[str, Tuple[Fraction, Fraction]]:
"""Return the probs dictionary.
Returns
-------
dict
The dictionary of probabilities
.. versionadded:: 0.3.6
"""
return self._probs
def set_probs(self, probs: Dict[str, Tuple[Fraction, Fraction]]) -> None:
"""Set the probs dictionary.
Parameters
----------
probs : dict
The dictionary of probabilities
.. versionadded:: 0.3.6
"""
self._probs = probs
def train(self, text: str) -> None:
r"""Generate a probability dict from the provided text.
Text to 0-order probability statistics as a dict
Parameters
----------
text : str
The text data over which to calculate probability statistics. This
must not contain the NUL (0x00) character because that is used to
indicate the end of data.
Example
-------
>>> ac = Arithmetic()
>>> ac.train('the quick brown fox jumped over the lazy dog')
>>> ac.get_probs()
{' ': (Fraction(0, 1), Fraction(8, 45)),
'o': (Fraction(8, 45), Fraction(4, 15)),
'e': (Fraction(4, 15), Fraction(16, 45)),
'u': (Fraction(16, 45), Fraction(2, 5)),
't': (Fraction(2, 5), Fraction(4, 9)),
'r': (Fraction(4, 9), Fraction(22, 45)),
'h': (Fraction(22, 45), Fraction(8, 15)),
'd': (Fraction(8, 15), Fraction(26, 45)),
'z': (Fraction(26, 45), Fraction(3, 5)),
'y': (Fraction(3, 5), Fraction(28, 45)),
'x': (Fraction(28, 45), Fraction(29, 45)),
'w': (Fraction(29, 45), Fraction(2, 3)),
'v': (Fraction(2, 3), Fraction(31, 45)),
'q': (Fraction(31, 45), Fraction(32, 45)),
'p': (Fraction(32, 45), Fraction(11, 15)),
'n': (Fraction(11, 15), Fraction(34, 45)),
'm': (Fraction(34, 45), Fraction(7, 9)),
'l': (Fraction(7, 9), Fraction(4, 5)),
'k': (Fraction(4, 5), Fraction(37, 45)),
'j': (Fraction(37, 45), Fraction(38, 45)),
'i': (Fraction(38, 45), Fraction(13, 15)),
'g': (Fraction(13, 15), Fraction(8, 9)),
'f': (Fraction(8, 9), Fraction(41, 45)),
'c': (Fraction(41, 45), Fraction(14, 15)),
'b': (Fraction(14, 15), Fraction(43, 45)),
'a': (Fraction(43, 45), Fraction(44, 45)),
'\x00': (Fraction(44, 45), Fraction(1, 1))}
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if '\x00' in text:
text = text.replace('\x00', ' ')
counts = Counter(text)
counts['\x00'] = 1
tot_letters = sum(counts.values())
tot = 0
self._probs = {}
prev = Fraction(0)
for char, count in sorted(
counts.items(), key=lambda x: (x[1], x[0]), reverse=True
):
follow = Fraction(tot + count, tot_letters)
self._probs[char] = (prev, follow)
prev = follow
tot = tot + count
def encode(self, text: str) -> Tuple[int, int]:
"""Encode a text using arithmetic coding.
Text and the 0-order probability statistics -> longval, nbits
The encoded number is Fraction(longval, 2**nbits)
Parameters
----------
text : str
A string to encode
Returns
-------
tuple
The arithmetically coded text
Example
-------
>>> ac = Arithmetic('the quick brown fox jumped over the lazy dog')
>>> ac.encode('align')
(16720586181, 34)
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if '\x00' in text:
text = text.replace('\x00', ' ')
minval = Fraction(0)
maxval = Fraction(1)
for char in text + '\x00':
prob_range = self._probs[char]
delta = maxval - minval
maxval = minval + prob_range[1] * delta
minval = minval + prob_range[0] * delta
# I tried without the /2 just to check. Doesn't work.
# Keep scaling up until the error range is >= 1. That
# gives me the minimum number of bits needed to resolve
# down to the end-of-data character.
delta = (maxval - minval) / 2
nbits = int(0)
while delta < 1:
nbits += 1
delta *= 2
# The below condition shouldn't ever be false
if nbits == 0: # pragma: no cover
return 0, 0
# using -1 instead of /2
avg = (maxval + minval) * 2 ** (nbits - 1)
# Could return a rational instead ...
# the division truncation is deliberate
return avg.numerator // avg.denominator, nbits
def decode(self, longval: int, nbits: int) -> str:
"""Decode the number to a string using the given statistics.
Parameters
----------
longval : int
The first part of an encoded tuple from encode
nbits : int
The second part of an encoded tuple from encode
Returns
-------
str
The arithmetically decoded text
Example
-------
>>> ac = Arithmetic('the quick brown fox jumped over the lazy dog')
>>> ac.decode(16720586181, 34)
'align'
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
val = Fraction(longval, int(1) << nbits)
letters = []
probs_items = [
(char, minval, maxval)
for (char, (minval, maxval)) in self._probs.items()
]
char = '\x00'
minval = maxval = Fraction(0)
while True:
for (char, minval, maxval) in probs_items: # noqa: B007
if minval <= val < maxval:
break
if char == '\x00':
break
letters.append(char)
delta = maxval - minval
val = (val - minval) / delta
return ''.join(letters)
if __name__ == '__main__':
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
================================================
FILE: abydos/compression/_bwt.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.compression._bwt.
Burrows-Wheeler Transform encoder/decoder
"""
__all__ = ['BWT']
class BWT:
"""Burrows-Wheeler Transform.
The Burrows-Wheeler transform is an attempt at placing similar characters
together to improve compression.
Cf. :cite:`Burrows:1994`.
.. versionadded:: 0.3.6
"""
def __init__(self, terminator: str = '\0') -> None:
"""Initialize BWT instance.
Parameters
----------
terminator : str
A character added to signal the end of the string
.. versionadded:: 0.4.0
"""
self._terminator = terminator
def encode(self, word: str) -> str:
r"""Return the Burrows-Wheeler transformed form of a word.
Parameters
----------
word : str
The word to transform using BWT
Returns
-------
str
Word encoded by BWT
Raises
------
ValueError
Specified terminator absent from code.
Examples
--------
>>> bwt = BWT()
>>> bwt.encode('align')
'n\x00ilag'
>>> bwt.encode('banana')
'annb\x00aa'
>>> bwt = BWT('@')
>>> bwt.encode('banana')
'annb@aa'
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if word:
if self._terminator in word:
raise ValueError(
'Specified terminator, {}, already in word.'.format(
self._terminator if self._terminator != '\0' else '\\0'
)
)
else:
word += self._terminator
wordlist = sorted(
word[i:] + word[:i] for i in range(len(word))
)
return ''.join([w[-1] for w in wordlist])
else:
return self._terminator
def decode(self, code: str) -> str:
r"""Return a word decoded from BWT form.
Parameters
----------
code : str
The word to transform from BWT form
Returns
-------
str
Word decoded by BWT
Raises
------
ValueError
Specified terminator absent from code.
Examples
--------
>>> bwt = BWT()
>>> bwt.decode('n\x00ilag')
'align'
>>> bwt.decode('annb\x00aa')
'banana'
>>> bwt = BWT('@')
>>> bwt.decode('annb@aa')
'banana'
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if code:
if self._terminator not in code:
raise ValueError(
'Specified terminator, {}, absent from code.'.format(
self._terminator if self._terminator != '\0' else '\\0'
)
)
else:
wordlist = [''] * len(code)
for i in range(len(code)):
wordlist = sorted(
code[i] + wordlist[i] for i in range(len(code))
)
rows = [w for w in wordlist if w[-1] == self._terminator][0]
return rows.rstrip(self._terminator)
else:
return ''
if __name__ == '__main__':
import doctest
doctest.testmod()
================================================
FILE: abydos/compression/_rle.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.compression._rle.
Run-Length Encoding encoder/decoder
"""
from itertools import groupby
__all__ = ['RLE']
class RLE:
"""Run-Length Encoding.
Cf. :cite:`Robinson:1967`.
Based on http://rosettacode.org/wiki/Run-length_encoding#Python
:cite:`rosettacode:2018`. This is licensed GFDL 1.2.
Digits 0-9 cannot be in text.
.. versionadded:: 0.3.6
"""
def encode(self, text: str) -> str:
r"""Perform encoding of run-length-encoding (RLE).
Parameters
----------
text : str
A text string to encode
Returns
-------
str
Word decoded by RLE
Examples
--------
>>> from abydos.compression import BWT
>>> rle = RLE()
>>> bwt = BWT()
>>> rle.encode(bwt.encode('align'))
'n\x00ilag'
>>> rle.encode('align')
'align'
>>> rle.encode(bwt.encode('banana'))
'annb\x00aa'
>>> rle.encode('banana')
'banana'
>>> rle.encode(bwt.encode('aaabaabababa'))
'ab\x00abbab5a'
>>> rle.encode('aaabaabababa')
'3abaabababa'
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if text:
text = ''.join(
(str(n) + k if n > 2 else (k if n == 1 else 2 * k))
for n, k in ((len(list(g)), k) for k, g in groupby(text))
)
return text
def decode(self, text: str) -> str:
r"""Perform decoding of run-length-encoding (RLE).
Parameters
----------
text : str
A text string to decode
Returns
-------
str
Word decoded by RLE
Examples
--------
>>> from abydos.compression import BWT
>>> rle = RLE()
>>> bwt = BWT()
>>> bwt.decode(rle.decode('n\x00ilag'))
'align'
>>> rle.decode('align')
'align'
>>> bwt.decode(rle.decode('annb\x00aa'))
'banana'
>>> rle.decode('banana')
'banana'
>>> bwt.decode(rle.decode('ab\x00abbab5a'))
'aaabaabababa'
>>> rle.decode('3abaabababa')
'aaabaabababa'
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
mult = ''
decoded = []
for letter in list(text):
if not letter.isdigit():
if mult:
decoded.append(int(mult) * letter)
mult = ''
else:
decoded.append(letter)
else:
mult += letter
text = ''.join(decoded)
return text
if __name__ == '__main__':
import doctest
doctest.testmod()
================================================
FILE: abydos/corpus/__init__.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
r"""abydos.corpus.
The corpus package includes basic and n-gram corpus classes:
- :py:class:`Corpus`
- :py:class:`NGramCorpus`
- :py:class:`UnigramCorpus`
As a quick example of :py:class:`.Corpus`:
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n\n'
>>> tqbf += 'And then it slept.\n\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> corp.docs()
[[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog.']],
[['And', 'then', 'it', 'slept.']], [['And', 'the', 'dog', 'ran', 'off.']]]
>>> round(corp.idf('dog'), 10)
1.0986122887
>>> round(corp.idf('the'), 10)
0.4054651081
Here, each sentence is a separate "document". We can retrieve IDF values from
the :py:class:`.Corpus`. The same :py:class:`.Corpus` can be used to initialize
an :py:class:`.NGramCorpus` and calculate TF values:
>>> ngcorp = NGramCorpus(corp)
>>> ngcorp.get_count('the')
2
>>> ngcorp.get_count('fox')
1
----
"""
from ._corpus import Corpus
from ._ngram_corpus import NGramCorpus
from ._unigram_corpus import UnigramCorpus
__all__ = ['Corpus', 'NGramCorpus', 'UnigramCorpus']
if __name__ == '__main__':
import doctest
doctest.testmod()
================================================
FILE: abydos/corpus/_corpus.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.corpus._corpus.
The Corpus class is a container for linguistic corpora and includes various
functions for corpus statistics, language modeling, etc.
"""
from math import log
from typing import Callable, List, Optional, Set, Tuple, Union
from ..tokenizer import _Tokenizer
__all__ = ['Corpus']
class Corpus:
"""Corpus class.
Internally, this is a list of lists or lists. The corpus itself is a list
of documents. Each document is an ordered list of sentences in those
documents. And each sentence is an ordered list of words that make up that
sentence.
.. versionadded:: 0.1.0
"""
def __init__(
self,
corpus_text: str = '',
doc_split: str = '\n\n',
sent_split: str = '\n',
filter_chars: Union[str, List[str], Set[str], Tuple[str]] = '',
stop_words: Optional[Union[List[str], Set[str], Tuple[str]]] = None,
word_tokenizer: Optional[_Tokenizer] = None,
) -> None:
r"""Initialize Corpus.
By default, when importing a corpus:
- two consecutive newlines divide documents
- single newlines divide sentences
- other whitespace divides words
Parameters
----------
corpus_text : str
The corpus text as a single string
doc_split : str
A character or string used to split corpus_text into documents
sent_split : str
A character or string used to split documents into sentences
filter_chars : list or set or tuple or str
A list of characters (as a string, tuple, set, or list) to filter
out of the corpus text
stop_words : list or set or tuple
A list of words (as a tuple, set, or list) to filter out of the
corpus text
word_tokenizer : _Tokenizer
A tokenizer to apply to each sentence in order to retrieve the
individual "word" tokens. If set to none, str.split() will be used.
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> corp = Corpus(tqbf)
.. versionadded:: 0.1.0
"""
self.corpus = [] # type: List[List[List[str]]]
self.doc_split = doc_split
self.sent_split = sent_split
for document in corpus_text.split(doc_split):
doc = [] # type: List[List[str]]
for sentence in document.split(sent_split):
if word_tokenizer:
word_tokenizer.tokenize(sentence)
sentence_words = word_tokenizer.get_list()
else:
sentence_words = sentence.split()
if stop_words:
for word in set(stop_words):
while word in sentence_words:
sentence_words.remove(word)
for char in set(filter_chars):
sentence_words = [
word.replace(char, '') for word in sentence_words
]
if sentence_words:
doc.append(sentence_words)
if doc:
self.corpus.append(doc)
def docs(self) -> List[List[List[str]]]:
r"""Return the docs in the corpus.
Each list within a doc represents the sentences in that doc, each of
which is in turn a list of words within that sentence.
Returns
-------
[[[str]]]
The docs in the corpus as a list of lists of lists of strs
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> corp.docs()
[[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
'ran', 'off.']]]
>>> len(corp.docs())
1
.. versionadded:: 0.1.0
"""
return self.corpus
def paras(self) -> List[List[List[str]]]:
r"""Return the paragraphs in the corpus.
Each list within a paragraph represents the sentences in that doc, each
of which is in turn a list of words within that sentence.
This is identical to the docs() member function and exists only to
mirror part of NLTK's API for corpora.
Returns
-------
[[[str]]]
The paragraphs in the corpus as a list of lists of lists of strs
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> corp.paras()
[[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
'ran', 'off.']]]
>>> len(corp.paras())
1
.. versionadded:: 0.1.0
"""
return self.docs()
def sents(self) -> List[List[str]]:
r"""Return the sentences in the corpus.
Each list within a sentence represents the words within that sentence.
Returns
-------
[[str]]
The sentences in the corpus as a list of lists of strs
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> corp.sents()
[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
'dog.'], ['And', 'then', 'it', 'slept.'], ['And', 'the', 'dog',
'ran', 'off.']]
>>> len(corp.sents())
3
"""
return [words for sents in self.corpus for words in sents]
def words(self) -> List[str]:
r"""Return the words in the corpus as a single list.
Returns
-------
[str]
The words in the corpus as a list of strs
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> corp.words()
['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
'off.']
>>> len(corp.words())
18
.. versionadded:: 0.1.0
"""
return [words for sents in self.sents() for words in sents]
def docs_of_words(self) -> List[List[str]]:
r"""Return the docs in the corpus, with sentences flattened.
Each list within the corpus represents all the words of that document.
Thus the sentence level of lists has been flattened.
Returns
-------
[[str]]
The docs in the corpus as a list of list of strs
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> corp.docs_of_words()
[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
'dog.', 'And', 'then', 'it', 'slept.', 'And', 'the', 'dog', 'ran',
'off.']]
>>> len(corp.docs_of_words())
1
.. versionadded:: 0.1.0
"""
return [
[words for sents in doc for words in sents] for doc in self.corpus
]
def raw(self) -> str:
r"""Return the raw corpus.
This is reconstructed by joining sub-components with the corpus' split
characters
Returns
-------
str
The raw corpus
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> print(corp.raw())
The quick brown fox jumped over the lazy dog.
And then it slept.
And the dog ran off.
>>> len(corp.raw())
85
.. versionadded:: 0.1.0
"""
doc_list = []
for doc in self.corpus:
sent_list = []
for sent in doc:
sent_list.append(' '.join(sent))
doc_list.append(self.sent_split.join(sent_list))
del sent_list
return self.doc_split.join(doc_list)
def idf(
self, term: str, transform: Optional[Callable[[str], str]] = None
) -> float:
r"""Calculate the Inverse Document Frequency of a term in the corpus.
Parameters
----------
term : str
The term to calculate the IDF of
transform : function
A function to apply to each document term before checking for the
presence of term
Returns
-------
float
The IDF
Examples
--------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n\n'
>>> tqbf += 'And then it slept.\n\n And the dog ran off.'
>>> corp = Corpus(tqbf)
>>> print(corp.docs())
[[['The', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy',
'dog.']],
[['And', 'then', 'it', 'slept.']],
[['And', 'the', 'dog', 'ran', 'off.']]]
>>> round(corp.idf('dog'), 10)
1.0986122887
>>> round(corp.idf('the'), 10)
0.4054651081
.. versionadded:: 0.1.0
"""
docs_with_term = 0
docs = self.docs_of_words()
for doc in docs:
doc_set = set(doc)
if transform:
transformed_doc = []
for word in doc_set:
transformed_doc.append(transform(word))
doc_set = set(transformed_doc)
if term in doc_set:
docs_with_term += 1
if docs_with_term == 0:
return float('inf')
return log(len(docs) / docs_with_term)
if __name__ == '__main__':
import doctest
doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
================================================
FILE: abydos/corpus/_ngram_corpus.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.corpus._ngram_corpus.
The NGram class is a container for an n-gram corpus
"""
from codecs import open as c_open
from collections import Counter
from typing import Any, Counter as TCounter, List, Optional, Union, cast
from ._corpus import Corpus
__all__ = ['NGramCorpus']
class NGramCorpus:
"""The NGramCorpus class.
Internally, this is a set of recursively embedded dicts, with n layers for
a corpus of n-grams. E.g. for a trigram corpus, this will be a dict of
dicts of dicts. More precisely, ``collections.Counter`` is used in place of
dict, making multiset operations valid and allowing unattested n-grams to
be queried.
The key at each level is a word. The value at the most deeply embedded
level is a numeric value representing the frequency of the trigram. E.g.
the trigram frequency of 'colorless green ideas' would be the value stored
in ``self.ngcorpus['colorless']['green']['ideas'][None]``.
.. versionadded:: 0.3.0
"""
def __init__(self, corpus: Optional[Corpus] = None) -> None:
r"""Initialize Corpus.
Parameters
----------
corpus : Corpus
The :py:class:`Corpus` from which to initialize the n-gram corpus.
By default, this is None, which initializes an empty NGramCorpus.
This can then be populated using NGramCorpus methods.
Raises
------
TypeError
Corpus argument must be None or of type abydos.Corpus
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> ngcorp = NGramCorpus(Corpus(tqbf))
.. versionadded:: 0.3.0
"""
self.ngcorpus = Counter() # type: TCounter[Optional[str]]
if corpus is None:
return
elif isinstance(corpus, Corpus):
self.corpus_importer(corpus)
else:
raise TypeError(
'Corpus argument must be None or of type abydos.corpus.Corpus. '
+ str(type(corpus))
+ ' found.'
)
def corpus_importer(
self,
corpus: Corpus,
n_val: int = 1,
bos: str = '_START_',
eos: str = '_END_',
) -> None:
r"""Fill in self.ngcorpus from a Corpus argument.
Parameters
----------
corpus : Corpus
The Corpus from which to initialize the n-gram corpus
n_val : int
Maximum n value for n-grams
bos : str
String to insert as an indicator of beginning of sentence
eos : str
String to insert as an indicator of end of sentence
Raises
------
TypeError
Corpus argument of the Corpus class required.
Example
-------
>>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
>>> tqbf += 'And then it slept.\n And the dog ran off.'
>>> ngcorp = NGramCorpus()
>>> ngcorp.corpus_importer(Corpus(tqbf))
.. versionadded:: 0.3.0
"""
if not corpus or not isinstance(corpus, Corpus):
raise TypeError('Corpus argument of the Corpus class required.')
sentences = corpus.sents()
for sent in sentences:
ngs = Counter(sent)
for key in ngs.keys():
self._add_to_ngcorpus(self.ngcorpus, [key], ngs[key])
if n_val > 1:
if bos and bos != '':
sent = [bos] + sent
if eos and eos != '':
sent += [eos]
for i in range(2, n_val + 1):
for j in range(len(sent) - i + 1):
self._add_to_ngcorpus(
self.ngcorpus, sent[j : j + i], 1
)
def get_count(
    self,
    ngram: Union[str, List[str]],
    corpus: Optional[TCounter[Optional[str]]] = None,
) -> int:
    r"""Get the count of an n-gram in the corpus.

    Parameters
    ----------
    ngram : str or List[str]
        The n-gram to retrieve the count of from the n-gram corpus
    corpus : Counter[str] or None
        The corpus

    Returns
    -------
    int
        The n-gram count

    Examples
    --------
    >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
    >>> tqbf += 'And then it slept.\n And the dog ran off.'
    >>> ngcorp = NGramCorpus(Corpus(tqbf))
    >>> ngcorp.get_count('the')
    2
    >>> ngcorp.get_count('fox')
    1


    .. versionadded:: 0.3.0

    """
    # No corpus supplied (or a falsy one): search the full n-gram corpus.
    if not corpus:
        corpus = self.ngcorpus

    # An exhausted n-gram means we are at the leaf node; its count is
    # stored under the key None.
    if not ngram:
        return corpus[None]

    # A string n-gram is treated as a whitespace-delimited token list.
    tokens = ngram.split() if isinstance(ngram, str) else ngram

    # Descend one trie level per leading token; a missing branch means
    # the n-gram never occurred.
    head = tokens[0]
    if head not in corpus:
        return 0
    return self.get_count(
        tokens[1:],
        cast(Optional[TCounter[Optional[str]]], corpus[head]),
    )
def _add_to_ngcorpus(
    self, corpus: Any, words: List[str], count: int
) -> None:
    """Build up a corpus entry recursively.

    Descends the nested-Counter trie one word at a time, creating
    levels as needed, and adds ``count`` at the leaf node (under the
    key None).

    Parameters
    ----------
    corpus : Corpus or counter
        The corpus
    words : [str]
        Words to add to the corpus
    count : int
        Count of words


    .. versionadded:: 0.3.0

    """
    head = words[0]
    if head not in corpus:
        corpus[head] = Counter()

    remainder = words[1:]
    if remainder:
        # More words to place: recurse into this word's subtree.
        self._add_to_ngcorpus(corpus[head], remainder, count)
    else:
        # Last word: record the n-gram's count at the leaf.
        corpus[head][None] += count
def gng_importer(self, corpus_file: str) -> None:
    """Fill in self.ngcorpus from a Google NGram corpus file.

    Each line is tab-separated; only the first field (the n-gram,
    split on whitespace) and the third field (the match count) are
    used. The remaining fields — presumably year and volume count in
    the Google NGram format — are ignored.

    Parameters
    ----------
    corpus_file : str
        The filename of the Google NGram file from which to initialize the
        n-gram corpus


    .. versionadded:: 0.3.0

    """
    with c_open(corpus_file, 'r', encoding='utf-8') as gng:
        for line in gng:
            fields = line.rstrip().split('\t')
            self._add_to_ngcorpus(
                self.ngcorpus, fields[0].split(), int(fields[2])
            )
if __name__ == '__main__':
    # Run this module's embedded doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/corpus/_unigram_corpus.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.corpus._unigram_corpus.
Unigram Corpus
"""
import pickle # noqa: S403
from codecs import open as c_open
from collections import Counter, defaultdict
from math import log1p
from typing import Any, Callable, DefaultDict, Optional, Tuple
from ..tokenizer import _Tokenizer
__all__ = ['UnigramCorpus']
def _dd_default(*args: Any) -> Tuple[int, int]:
    """Return the zero-valued (term count, document count) pair.

    A module-level function (rather than a lambda) so that the
    ``defaultdict`` built on it stays picklable, as required by
    ``UnigramCorpus.save_corpus``. Any positional arguments are
    accepted and ignored.
    """
    return (0, 0)
class UnigramCorpus:
    """Unigram corpus class.

    Largely intended for calculating inverse document frequency (IDF) from a
    large corpus of unigram (or smaller) tokens, this class encapsulates a
    dict object. Each key is a unigram token whose value is a tuple consisting
    of the number of times a term appeared and the number of distinct
    documents in which it appeared.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        corpus_text: str = '',
        documents: int = 0,
        word_transform: Optional[Callable[[str], str]] = None,
        word_tokenizer: Optional['_Tokenizer'] = None,
    ) -> None:
        r"""Initialize UnigramCorpus.

        Parameters
        ----------
        corpus_text : str
            The corpus text as a single string
        documents : int
            The number of documents in the corpus. If equal to 0 (the default)
            then the maximum from the internal dictionary's distinct
            documents count.
        word_transform : function
            A function to apply to each term before term tokenization and
            addition to the corpus. One might use this, for example, to apply
            Soundex encoding to each term.
        word_tokenizer : _Tokenizer
            A tokenizer to apply to each sentence in order to retrieve the
            individual "word" tokens. If set to none, str.split() will be
            used.

        Example
        -------
        >>> tqbf = 'The quick brown fox jumped over the lazy dog.\n'
        >>> tqbf += 'And then it slept.\n And the dog ran off.'
        >>> corp = UnigramCorpus(tqbf)


        .. versionadded:: 0.4.0

        """
        self.corpus = defaultdict(
            _dd_default
        )  # type: DefaultDict[str, Tuple[int, int]]
        self.transform = word_transform
        self.tokenizer = word_tokenizer
        self.doc_count = documents
        # NOTE: add_document unconditionally increments doc_count, so even
        # the default empty corpus_text registers as one document. This
        # preserves the class's long-standing behavior.
        self.add_document(corpus_text)

    def add_document(self, doc: str) -> None:
        """Add a new document to the corpus.

        The document is split on whitespace; each distinct term is added
        with its within-document count and a distinct-document count of 1.

        Parameters
        ----------
        doc : str
            A string, representing the document to be added.


        .. versionadded:: 0.4.0

        """
        for word, count in Counter(doc.split()).items():
            self._add_word(word, count, 1)
        self.doc_count += 1

    def save_corpus(self, filename: str) -> None:
        """Save the corpus to a file.

        This employs pickle to save the corpus (a defaultdict). Other
        parameters of the corpus, such as its word_tokenizer, will not be
        affected and should be set during initialization.

        Parameters
        ----------
        filename : str
            The filename to save the corpus to.


        .. versionadded:: 0.4.0

        """
        with open(filename, mode='wb') as pkl:
            pickle.dump(self.corpus, pkl)

    def load_corpus(self, filename: str) -> None:
        """Load the corpus from a file.

        This employs pickle to load the corpus (a defaultdict). Only load
        corpora from trusted sources, since unpickling can execute arbitrary
        code. Other parameters of the corpus, such as its word_tokenizer,
        will not be affected and should be set during initialization.

        Parameters
        ----------
        filename : str
            The filename to load the corpus from.


        .. versionadded:: 0.4.0

        """
        with open(filename, mode='rb') as pkl:
            self.corpus = pickle.load(pkl)  # noqa: S301
        self._update_doc_count()

    def _update_doc_count(self) -> None:
        """Update document count, if necessary.

        Raises doc_count to at least the largest per-term distinct-document
        count present in the corpus. An empty corpus leaves doc_count
        unchanged (previously ``max()`` raised ValueError when, e.g., an
        empty corpus file was loaded).

        .. versionadded:: 0.4.0
        """
        if self.corpus:
            max_docs = max(self.corpus.values(), key=lambda _: _[1])[1]
            self.doc_count = max(max_docs, self.doc_count)

    def _add_word(self, word: str, count: int, doc_count: int) -> None:
        """Add a term to the corpus, possibly after tokenization.

        Parameters
        ----------
        word : str
            Word to add to the corpus
        count : int
            Count of word appearances
        doc_count : int
            Count of distinct documents in which word appears

        .. versionadded:: 0.4.0
        """
        if self.transform is not None:
            word = self.transform(word)

        if self.tokenizer is not None:
            # Tokenize the word and add each resulting token, scaling by
            # the token's multiplicity within the word.
            self.tokenizer.tokenize(word)
            tokens = self.tokenizer.get_counter()
            for tok in tokens:
                n = tokens[tok] * count
                prior_count, prior_doc_count = self.corpus[tok]
                self.corpus[tok] = (
                    prior_count + n,
                    prior_doc_count + doc_count,
                )
        else:
            prior_count, prior_doc_count = self.corpus[word]
            self.corpus[word] = (
                prior_count + count,
                prior_doc_count + doc_count,
            )

    def gng_importer(self, corpus_file: str) -> None:
        """Fill in self.corpus from a Google NGram corpus file.

        Parameters
        ----------
        corpus_file : file
            The Google NGram file from which to initialize the n-gram corpus

        .. versionadded:: 0.4.0
        """
        with c_open(corpus_file, 'r', encoding='utf-8') as gng:
            for line in gng:
                word, _, count, doc_count = line.rstrip().split('\t')
                # Strip everything from the first underscore on -- Google
                # NGram data appends part-of-speech tags as e.g. run_VERB.
                if '_' in word:
                    word = word[: word.find('_')]
                self._add_word(word, int(count), int(doc_count))
        self._update_doc_count()

    def idf(self, term: str) -> float:
        r"""Calculate the Inverse Document Frequency of a term in the corpus.

        Returns ``float('inf')`` for a term absent from the corpus.

        Parameters
        ----------
        term : str
            The term to calculate the IDF of

        Returns
        -------
        float
            The IDF

        Examples
        --------
        >>> tqbf = 'the quick brown fox jumped over the lazy dog\n\n'
        >>> tqbf += 'and then it slept\n\n and the dog ran off'
        >>> corp = UnigramCorpus(tqbf)
        >>> round(corp.idf('dog'), 10)
        0.6931471806
        >>> round(corp.idf('the'), 10)
        0.6931471806


        .. versionadded:: 0.4.0

        """
        if term in self.corpus:
            _, term_doc_count = self.corpus[term]
            return log1p(self.doc_count / term_doc_count)
        else:
            return float('inf')
if __name__ == '__main__':
    # Run this module's embedded doctests when executed as a script;
    # NORMALIZE_WHITESPACE tolerates whitespace differences in expected
    # doctest output.
    import doctest

    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
================================================
FILE: abydos/distance/__init__.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
r"""abydos.distance.
The distance package implements string distance measure and metric classes:
These include traditional Levenshtein edit distance and related algorithms:
- Levenshtein distance (:py:class:`.Levenshtein`)
- Optimal String Alignment distance (:py:class:`.Levenshtein` with
``mode='osa'``)
- Damerau-Levenshtein distance (:py:class:`.DamerauLevenshtein`)
- Yujian-Bo normalized edit distance (:py:class:`.YujianBo`)
- Higuera-Micó contextual normalized edit distance
(:py:class:`.HigueraMico`)
- Indel distance (:py:class:`.Indel`)
- Syllable Alignment Pattern Searching similarity
(:py:class:`.distance.SAPS`)
- Meta-Levenshtein distance (:py:class:`.MetaLevenshtein`)
- Covington distance (:py:class:`.Covington`)
- ALINE distance (:py:class:`.ALINE`)
- FlexMetric distance (:py:class:`.FlexMetric`)
- BI-SIM similarity (:py:class:`.BISIM`)
- Discounted Levenshtein distance (:py:class:`.DiscountedLevenshtein`)
- Phonetic edit distance (:py:class:`.PhoneticEditDistance`)
Hamming distance (:py:class:`.Hamming`), Relaxed Hamming distance
(:py:class:`.RelaxedHamming`), and the closely related Modified
Language-Independent Product Name Search distance (:py:class:`.MLIPNS`) are
provided.
Block edit distances:
- Tichy edit distance (:py:class:`.Tichy`)
- Levenshtein distance with block operations
(:py:class:`.BlockLevenshtein`)
- Rees-Levenshtein distance (:py:class:`.ReesLevenshtein`)
- Cormode's LZ distance (:py:class:`.CormodeLZ`)
- Shapira-Storer I edit distance with block moves, greedy algorithm
(:py:class:`.ShapiraStorerI`)
Distance metrics developed for the US Census or derived from them are included:
- Jaro distance (:py:class:`.JaroWinkler` with ``mode='Jaro'``)
- Jaro-Winkler distance (:py:class:`.JaroWinkler`)
- Strcmp95 distance (:py:class:`.Strcmp95`)
- Iterative-SubString (I-Sub) correlation
(:py:class:`.IterativeSubString`)
A large set of multi-set token-based distance metrics are provided, including:
- AMPLE similarity (:py:class:`.AMPLE`)
- AZZOO similarity (:py:class:`.AZZOO`)
- Anderberg's D similarity (:py:class:`.Anderberg`)
- Andres & Marzo's Delta correlation (:py:class:`.AndresMarzoDelta`)
- Baroni-Urbani & Buser I similarity (:py:class:`.BaroniUrbaniBuserI`)
- Baroni-Urbani & Buser II correlation (:py:class:`.BaroniUrbaniBuserII`)
- Batagelj & Bren similarity (:py:class:`.BatageljBren`)
- Baulieu I distance (:py:class:`.BaulieuI`)
- Baulieu II distance (:py:class:`.BaulieuII`)
- Baulieu III distance (:py:class:`.BaulieuIII`)
- Baulieu IV distance (:py:class:`.BaulieuIV`)
- Baulieu V distance (:py:class:`.BaulieuV`)
- Baulieu VI distance (:py:class:`.BaulieuVI`)
- Baulieu VII distance (:py:class:`.BaulieuVII`)
- Baulieu VIII distance (:py:class:`.BaulieuVIII`)
- Baulieu IX distance (:py:class:`.BaulieuIX`)
- Baulieu X distance (:py:class:`.BaulieuX`)
- Baulieu XI distance (:py:class:`.BaulieuXI`)
- Baulieu XII distance (:py:class:`.BaulieuXII`)
- Baulieu XIII distance (:py:class:`.BaulieuXIII`)
- Baulieu XIV distance (:py:class:`.BaulieuXIV`)
- Baulieu XV distance (:py:class:`.BaulieuXV`)
- Benini I correlation (:py:class:`.BeniniI`)
- Benini II correlation (:py:class:`.BeniniII`)
- Bennet's S correlation (:py:class:`.Bennet`)
- Braun-Blanquet similarity (:py:class:`.BraunBlanquet`)
- Canberra distance (:py:class:`.Canberra`)
- Cao similarity (:py:class:`.Cao`)
- Chao's Dice similarity (:py:class:`.ChaoDice`)
- Chao's Jaccard similarity (:py:class:`.ChaoJaccard`)
- Chebyshev distance (:py:class:`.Chebyshev`)
- Chord distance (:py:class:`.Chord`)
- Clark distance (:py:class:`.Clark`)
- Clement similarity (:py:class:`.Clement`)
- Cohen's Kappa similarity (:py:class:`.CohenKappa`)
- Cole correlation (:py:class:`.Cole`)
- Consonni & Todeschini I similarity (:py:class:`.ConsonniTodeschiniI`)
- Consonni & Todeschini II similarity (:py:class:`.ConsonniTodeschiniII`)
- Consonni & Todeschini III similarity (:py:class:`.ConsonniTodeschiniIII`)
- Consonni & Todeschini IV similarity (:py:class:`.ConsonniTodeschiniIV`)
- Consonni & Todeschini V correlation (:py:class:`.ConsonniTodeschiniV`)
- Cosine similarity (:py:class:`.Cosine`)
- Dennis similarity (:py:class:`.Dennis`)
- Dice's Asymmetric I similarity (:py:class:`.DiceAsymmetricI`)
- Dice's Asymmetric II similarity (:py:class:`.DiceAsymmetricII`)
- Digby correlation (:py:class:`.Digby`)
- Dispersion correlation (:py:class:`.Dispersion`)
- Doolittle similarity (:py:class:`.Doolittle`)
- Dunning similarity (:py:class:`.Dunning`)
- Euclidean distance (:py:class:`.Euclidean`)
- Eyraud similarity (:py:class:`.Eyraud`)
- Fager & McGowan similarity (:py:class:`.FagerMcGowan`)
- Faith similarity (:py:class:`.Faith`)
- Fidelity similarity (:py:class:`.Fidelity`)
- Fleiss correlation (:py:class:`.Fleiss`)
- Fleiss-Levin-Paik similarity (:py:class:`.FleissLevinPaik`)
- Forbes I similarity (:py:class:`.ForbesI`)
- Forbes II correlation (:py:class:`.ForbesII`)
- Fossum similarity (:py:class:`.Fossum`)
- Generalized Fleiss correlation (:py:class:`.GeneralizedFleiss`)
- Gilbert correlation (:py:class:`.Gilbert`)
- Gilbert & Wells similarity (:py:class:`.GilbertWells`)
- Gini I correlation (:py:class:`.GiniI`)
- Gini II correlation (:py:class:`.GiniII`)
- Goodall similarity (:py:class:`.Goodall`)
- Goodman & Kruskal's Lambda similarity (:py:class:`.GoodmanKruskalLambda`)
- Goodman & Kruskal's Lambda-r correlation
(:py:class:`.GoodmanKruskalLambdaR`)
- Goodman & Kruskal's Tau A similarity (:py:class:`.GoodmanKruskalTauA`)
- Goodman & Kruskal's Tau B similarity (:py:class:`.GoodmanKruskalTauB`)
- Gower & Legendre similarity (:py:class:`.GowerLegendre`)
- Guttman Lambda A similarity (:py:class:`.GuttmanLambdaA`)
- Guttman Lambda B similarity (:py:class:`.GuttmanLambdaB`)
- Gwet's AC correlation (:py:class:`.GwetAC`)
- Hamann correlation (:py:class:`.Hamann`)
- Harris & Lahey similarity (:py:class:`.HarrisLahey`)
- Hassanat distance (:py:class:`.Hassanat`)
- Hawkins & Dotson similarity (:py:class:`.HawkinsDotson`)
- Hellinger distance (:py:class:`.Hellinger`)
- Henderson-Heron similarity (:py:class:`.HendersonHeron`)
- Horn-Morisita similarity (:py:class:`.HornMorisita`)
- Hurlbert correlation (:py:class:`.Hurlbert`)
- Jaccard similarity (:py:class:`.Jaccard`) &
Tanimoto coefficient (:py:meth:`.Jaccard.tanimoto_coeff`)
- Jaccard-NM similarity (:py:class:`.JaccardNM`)
- Johnson similarity (:py:class:`.Johnson`)
- Kendall's Tau correlation (:py:class:`.KendallTau`)
- Kent & Foster I similarity (:py:class:`.KentFosterI`)
- Kent & Foster II similarity (:py:class:`.KentFosterII`)
- Köppen I correlation (:py:class:`.KoppenI`)
- Köppen II similarity (:py:class:`.KoppenII`)
- Kuder & Richardson correlation (:py:class:`.KuderRichardson`)
- Kuhns I correlation (:py:class:`.KuhnsI`)
- Kuhns II correlation (:py:class:`.KuhnsII`)
- Kuhns III correlation (:py:class:`.KuhnsIII`)
- Kuhns IV correlation (:py:class:`.KuhnsIV`)
- Kuhns V correlation (:py:class:`.KuhnsV`)
- Kuhns VI correlation (:py:class:`.KuhnsVI`)
- Kuhns VII correlation (:py:class:`.KuhnsVII`)
- Kuhns VIII correlation (:py:class:`.KuhnsVIII`)
- Kuhns IX correlation (:py:class:`.KuhnsIX`)
- Kuhns X correlation (:py:class:`.KuhnsX`)
- Kuhns XI correlation (:py:class:`.KuhnsXI`)
- Kuhns XII similarity (:py:class:`.KuhnsXII`)
- Kulczynski I similarity (:py:class:`.KulczynskiI`)
- Kulczynski II similarity (:py:class:`.KulczynskiII`)
- Lorentzian distance (:py:class:`.Lorentzian`)
- Maarel correlation (:py:class:`.Maarel`)
- Manhattan distance (:py:class:`.Manhattan`)
- Morisita similarity (:py:class:`.Morisita`)
- marking distance (:py:class:`.Marking`)
- marking metric (:py:class:`.MarkingMetric`)
- MASI similarity (:py:class:`.MASI`)
- Matusita distance (:py:class:`.Matusita`)
- Maxwell & Pilliner correlation (:py:class:`.MaxwellPilliner`)
- McConnaughey correlation (:py:class:`.McConnaughey`)
- McEwen & Michael correlation (:py:class:`.McEwenMichael`)
- mean squared contingency correlation (:py:class:`.MSContingency`)
- Michael similarity (:py:class:`.Michael`)
- Michelet similarity (:py:class:`.Michelet`)
- Millar distance (:py:class:`.Millar`)
- Minkowski distance (:py:class:`.Minkowski`)
- Mountford similarity (:py:class:`.Mountford`)
- Mutual Information similarity (:py:class:`.MutualInformation`)
- Overlap distance (:py:class:`.Overlap`)
- Pattern difference (:py:class:`.Pattern`)
- Pearson & Heron II correlation (:py:class:`.PearsonHeronII`)
- Pearson II similarity (:py:class:`.PearsonII`)
- Pearson III correlation (:py:class:`.PearsonIII`)
- Pearson's Chi-Squared similarity (:py:class:`.PearsonChiSquared`)
- Pearson's Phi correlation (:py:class:`.PearsonPhi`)
- Peirce correlation (:py:class:`.Peirce`)
- q-gram distance (:py:class:`.QGram`)
- Raup-Crick similarity (:py:class:`.RaupCrick`)
- Rogers & Tanimoto similarity (:py:class:`.RogersTanimoto`)
- Rogot & Goldberg similarity (:py:class:`.RogotGoldberg`)
- Russell & Rao similarity (:py:class:`.RussellRao`)
- Scott's Pi correlation (:py:class:`.ScottPi`)
- Shape difference (:py:class:`.Shape`)
- Size difference (:py:class:`.Size`)
- Sokal & Michener similarity (:py:class:`.SokalMichener`)
- Sokal & Sneath I similarity (:py:class:`.SokalSneathI`)
- Sokal & Sneath II similarity (:py:class:`.SokalSneathII`)
- Sokal & Sneath III similarity (:py:class:`.SokalSneathIII`)
- Sokal & Sneath IV similarity (:py:class:`.SokalSneathIV`)
- Sokal & Sneath V similarity (:py:class:`.SokalSneathV`)
- Sørensen–Dice coefficient (:py:class:`.Dice`)
- Sorgenfrei similarity (:py:class:`.Sorgenfrei`)
- Steffensen similarity (:py:class:`.Steffensen`)
- Stiles similarity (:py:class:`.Stiles`)
- Stuart's Tau correlation (:py:class:`.StuartTau`)
- Tarantula similarity (:py:class:`.Tarantula`)
- Tarwid correlation (:py:class:`.Tarwid`)
- Tetrachoric correlation coefficient (:py:class:`.Tetrachoric`)
- Tulloss' R similarity (:py:class:`.TullossR`)
- Tulloss' S similarity (:py:class:`.TullossS`)
- Tulloss' T similarity (:py:class:`.TullossT`)
- Tulloss' U similarity (:py:class:`.TullossU`)
- Tversky distance (:py:class:`.Tversky`)
- Weighted Jaccard similarity (:py:class:`.WeightedJaccard`)
- Unigram subtuple similarity (:py:class:`.UnigramSubtuple`)
- Unknown A correlation (:py:class:`.UnknownA`)
- Unknown B similarity (:py:class:`.UnknownB`)
- Unknown C similarity (:py:class:`.UnknownC`)
- Unknown D similarity (:py:class:`.UnknownD`)
- Unknown E correlation (:py:class:`.UnknownE`)
- Unknown F similarity (:py:class:`.UnknownF`)
- Unknown G similarity (:py:class:`.UnknownG`)
- Unknown H similarity (:py:class:`.UnknownH`)
- Unknown I similarity (:py:class:`.UnknownI`)
- Unknown J similarity (:py:class:`.UnknownJ`)
- Unknown K distance (:py:class:`.UnknownK`)
- Unknown L similarity (:py:class:`.UnknownL`)
- Unknown M similarity (:py:class:`.UnknownM`)
- Upholt similarity (:py:class:`.Upholt`)
- Warrens I correlation (:py:class:`.WarrensI`)
- Warrens II similarity (:py:class:`.WarrensII`)
- Warrens III correlation (:py:class:`.WarrensIII`)
- Warrens IV similarity (:py:class:`.WarrensIV`)
- Warrens V similarity (:py:class:`.WarrensV`)
- Whittaker distance (:py:class:`.Whittaker`)
- Yates' Chi-Squared similarity (:py:class:`.YatesChiSquared`)
- Yule's Q correlation (:py:class:`.YuleQ`)
- Yule's Q II distance (:py:class:`.YuleQII`)
- Yule's Y correlation (:py:class:`.YuleY`)
- YJHHR distance (:py:class:`.YJHHR`)
- Bhattacharyya distance (:py:class:`.Bhattacharyya`)
- Brainerd-Robinson similarity (:py:class:`.BrainerdRobinson`)
- Quantitative Cosine similarity (:py:class:`.QuantitativeCosine`)
- Quantitative Dice similarity (:py:class:`.QuantitativeDice`)
- Quantitative Jaccard similarity (:py:class:`.QuantitativeJaccard`)
- Roberts similarity (:py:class:`.Roberts`)
- Average linkage distance (:py:class:`.AverageLinkage`)
- Single linkage distance (:py:class:`.SingleLinkage`)
- Complete linkage distance (:py:class:`.CompleteLinkage`)
- Bag distance (:py:class:`.Bag`)
- Soft cosine similarity (:py:class:`.SoftCosine`)
- Monge-Elkan distance (:py:class:`.MongeElkan`)
- TF-IDF similarity (:py:class:`.TFIDF`)
- SoftTF-IDF similarity (:py:class:`.SoftTFIDF`)
- Jensen-Shannon divergence (:py:class:`.JensenShannon`)
- Simplified Fellegi-Sunter distance (:py:class:`.FellegiSunter`)
- MinHash similarity (:py:class:`.MinHash`)
- BLEU similarity (:py:class:`.BLEU`)
- Rouge-L similarity (:py:class:`.RougeL`)
- Rouge-W similarity (:py:class:`.RougeW`)
- Rouge-S similarity (:py:class:`.RougeS`)
- Rouge-SU similarity (:py:class:`.RougeSU`)
- Positional Q-Gram Dice distance (:py:class:`.PositionalQGramDice`)
- Positional Q-Gram Jaccard distance (:py:class:`.PositionalQGramJaccard`)
- Positional Q-Gram Overlap distance (:py:class:`.PositionalQGramOverlap`)
Three popular sequence alignment algorithms are provided:
- Needleman-Wunsch score (:py:class:`.NeedlemanWunsch`)
- Smith-Waterman score (:py:class:`.SmithWaterman`)
- Gotoh score (:py:class:`.Gotoh`)
Classes relating to substring and subsequence distances include:
- Longest common subsequence (:py:class:`.LCSseq`)
- Longest common substring (:py:class:`.LCSstr`)
- Ratcliff-Obershelp distance (:py:class:`.RatcliffObershelp`)
A number of simple distance classes provided in the package include:
- Identity distance (:py:class:`.Ident`)
- Length distance (:py:class:`.Length`)
- Prefix distance (:py:class:`.Prefix`)
- Suffix distance (:py:class:`.Suffix`)
Normalized compression distance classes for a variety of compression algorithms
are provided:
- zlib (:py:class:`.NCDzlib`)
- bzip2 (:py:class:`.NCDbz2`)
- lzma (:py:class:`.NCDlzma`)
- LZSS (:py:class:`.NCDlzss`)
- arithmetic coding (:py:class:`.NCDarith`)
- PAQ9A (:py:class:`.NCDpaq9a`)
- BWT plus RLE (:py:class:`.NCDbwtrle`)
- RLE (:py:class:`.NCDrle`)
Three similarity measures from SeatGeek's FuzzyWuzzy:
- FuzzyWuzzy Partial String similarity
(:py:class:`.FuzzyWuzzyPartialString`)
- FuzzyWuzzy Token Sort similarity (:py:class:`.FuzzyWuzzyTokenSort`)
- FuzzyWuzzy Token Set similarity (:py:class:`.FuzzyWuzzyTokenSet`)
A convenience class, allowing one to pass a list of string transforms (phonetic
algorithms, string transforms, and/or stemmers) and, optionally, a string
distance measure to compute the similarity/distance of two strings that have
undergone each transform, is provided in:
- Phonetic distance (:py:class:`.PhoneticDistance`)
The remaining distance measures & metrics include:
- Western Airlines' Match Rating Algorithm comparison
(:py:class:`.distance.MRA`)
- Editex (:py:class:`.Editex`)
- Bavarian Landesamt für Statistik distance (:py:class:`.Baystat`)
- Eudex distance (:py:class:`.distance.Eudex`)
- Sift4 distance (:py:class:`.Sift4`, :py:class:`.Sift4Simplest`,
:py:class:`.Sift4Extended`)
- Typo distance (:py:class:`.Typo`)
- Synoname (:py:class:`.Synoname`)
- Ozbay metric (:py:class:`.Ozbay`)
- Indice de Similitude-Guth (:py:class:`.ISG`)
- INClusion Programme (:py:class:`.Inclusion`)
- Guth (:py:class:`.Guth`)
- Victorian Panel Study (:py:class:`.VPS`)
- LIG3 (:py:class:`.LIG3`)
- String subsequence kernel (SSK) (:py:class:`.SSK`)
Most of the distance and similarity measures have ``sim`` and ``dist`` methods,
which return a measure that is normalized to the range :math:`[0, 1]`. The
normalized distance and similarity are always complements, so the normalized
distance will always equal 1 - the similarity for a particular measure supplied
with the same input. Some measures have an absolute distance method
``dist_abs`` and/or a similarity score ``sim_score``, which are not limited to
any range.
The first three methods can be demonstrated using the
:py:class:`.DamerauLevenshtein` class, while :py:class:`.SmithWaterman` offers
the fourth:
>>> dl = DamerauLevenshtein()
>>> dl.dist_abs('orange', 'strange')
2
>>> dl.dist('orange', 'strange')
0.2857142857142857
>>> dl.sim('orange', 'strange')
0.7142857142857143
>>> sw = SmithWaterman()
>>> sw.sim_score('TGTTACGG', 'GGTTGACTA')
4.0
----
"""
from ._aline import ALINE
from ._ample import AMPLE
from ._anderberg import Anderberg
from ._andres_marzo_delta import AndresMarzoDelta
from ._average_linkage import AverageLinkage
from ._azzoo import AZZOO
from ._bag import Bag
from ._baroni_urbani_buser_i import BaroniUrbaniBuserI
from ._baroni_urbani_buser_ii import BaroniUrbaniBuserII
from ._batagelj_bren import BatageljBren
from ._baulieu_i import BaulieuI
from ._baulieu_ii import BaulieuII
from ._baulieu_iii import BaulieuIII
from ._baulieu_iv import BaulieuIV
from ._baulieu_ix import BaulieuIX
from ._baulieu_v import BaulieuV
from ._baulieu_vi import BaulieuVI
from ._baulieu_vii import BaulieuVII
from ._baulieu_viii import BaulieuVIII
from ._baulieu_x import BaulieuX
from ._baulieu_xi import BaulieuXI
from ._baulieu_xii import BaulieuXII
from ._baulieu_xiii import BaulieuXIII
from ._baulieu_xiv import BaulieuXIV
from ._baulieu_xv import BaulieuXV
from ._baystat import Baystat
from ._benini_i import BeniniI
from ._benini_ii import BeniniII
from ._bennet import Bennet
from ._bhattacharyya import Bhattacharyya
from ._bisim import BISIM
from ._bleu import BLEU
from ._block_levenshtein import BlockLevenshtein
from ._brainerd_robinson import BrainerdRobinson
from ._braun_blanquet import BraunBlanquet
from ._canberra import Canberra
from ._cao import Cao
from ._chao_dice import ChaoDice
from ._chao_jaccard import ChaoJaccard
from ._chebyshev import Chebyshev
from ._chord import Chord
from ._clark import Clark
from ._clement import Clement
from ._cohen_kappa import CohenKappa
from ._cole import Cole
from ._complete_linkage import CompleteLinkage
from ._consonni_todeschini_i import ConsonniTodeschiniI
from ._consonni_todeschini_ii import ConsonniTodeschiniII
from ._consonni_todeschini_iii import ConsonniTodeschiniIII
from ._consonni_todeschini_iv import ConsonniTodeschiniIV
from ._consonni_todeschini_v import ConsonniTodeschiniV
from ._cormode_lz import CormodeLZ
from ._cosine import Cosine
from ._covington import Covington
from ._damerau_levenshtein import DamerauLevenshtein
from ._dennis import Dennis
from ._dice import Dice
from ._dice_asymmetric_i import DiceAsymmetricI
from ._dice_asymmetric_ii import DiceAsymmetricII
from ._digby import Digby
from ._discounted_levenshtein import DiscountedLevenshtein
from ._dispersion import Dispersion
from ._distance import _Distance
from ._doolittle import Doolittle
from ._dunning import Dunning
from ._editex import Editex
from ._euclidean import Euclidean
from ._eudex import Eudex
from ._eyraud import Eyraud
from ._fager_mcgowan import FagerMcGowan
from ._faith import Faith
from ._fellegi_sunter import FellegiSunter
from ._fidelity import Fidelity
from ._fleiss import Fleiss
from ._fleiss_levin_paik import FleissLevinPaik
from ._flexmetric import FlexMetric
from ._forbes_i import ForbesI
from ._forbes_ii import ForbesII
from ._fossum import Fossum
from ._fuzzywuzzy_partial_string import FuzzyWuzzyPartialString
from ._fuzzywuzzy_token_set import FuzzyWuzzyTokenSet
from ._fuzzywuzzy_token_sort import FuzzyWuzzyTokenSort
from ._generalized_fleiss import GeneralizedFleiss
from ._gilbert import Gilbert
from ._gilbert_wells import GilbertWells
from ._gini_i import GiniI
from ._gini_ii import GiniII
from ._goodall import Goodall
from ._goodman_kruskal_lambda import GoodmanKruskalLambda
from ._goodman_kruskal_lambda_r import GoodmanKruskalLambdaR
from ._goodman_kruskal_tau_a import GoodmanKruskalTauA
from ._goodman_kruskal_tau_b import GoodmanKruskalTauB
from ._gotoh import Gotoh
from ._gower_legendre import GowerLegendre
from ._guth import Guth
from ._guttman_lambda_a import GuttmanLambdaA
from ._guttman_lambda_b import GuttmanLambdaB
from ._gwet_ac import GwetAC
from ._hamann import Hamann
from ._hamming import Hamming
from ._harris_lahey import HarrisLahey
from ._hassanat import Hassanat
from ._hawkins_dotson import HawkinsDotson
from ._hellinger import Hellinger
from ._henderson_heron import HendersonHeron
from ._higuera_mico import HigueraMico
from ._horn_morisita import HornMorisita
from ._hurlbert import Hurlbert
from ._ident import Ident
from ._inclusion import Inclusion
from ._indel import Indel
from ._isg import ISG
from ._iterative_substring import IterativeSubString
from ._jaccard import Jaccard
from ._jaccard_nm import JaccardNM
from ._jaro_winkler import JaroWinkler
from ._jensen_shannon import JensenShannon
from ._johnson import Johnson
from ._kendall_tau import KendallTau
from ._kent_foster_i import KentFosterI
from ._kent_foster_ii import KentFosterII
from ._koppen_i import KoppenI
from ._koppen_ii import KoppenII
from ._kuder_richardson import KuderRichardson
from ._kuhns_i import KuhnsI
from ._kuhns_ii import KuhnsII
from ._kuhns_iii import KuhnsIII
from ._kuhns_iv import KuhnsIV
from ._kuhns_ix import KuhnsIX
from ._kuhns_v import KuhnsV
from ._kuhns_vi import KuhnsVI
from ._kuhns_vii import KuhnsVII
from ._kuhns_viii import KuhnsVIII
from ._kuhns_x import KuhnsX
from ._kuhns_xi import KuhnsXI
from ._kuhns_xii import KuhnsXII
from ._kulczynski_i import KulczynskiI
from ._kulczynski_ii import KulczynskiII
from ._lcprefix import LCPrefix
from ._lcsseq import LCSseq
from ._lcsstr import LCSstr
from ._lcsuffix import LCSuffix
from ._length import Length
from ._levenshtein import Levenshtein
from ._lig3 import LIG3
from ._lorentzian import Lorentzian
from ._maarel import Maarel
from ._manhattan import Manhattan
from ._marking import Marking
from ._marking_metric import MarkingMetric
from ._masi import MASI
from ._matusita import Matusita
from ._maxwell_pilliner import MaxwellPilliner
from ._mcconnaughey import McConnaughey
from ._mcewen_michael import McEwenMichael
from ._meta_levenshtein import MetaLevenshtein
from ._michelet import Michelet
from ._millar import Millar
from ._minhash import MinHash
from ._minkowski import Minkowski
from ._mlipns import MLIPNS
from ._monge_elkan import MongeElkan
from ._morisita import Morisita
from ._mountford import Mountford
from ._mra import MRA
from ._ms_contingency import MSContingency
from ._mutual_information import MutualInformation
from ._ncd_arith import NCDarith
from ._ncd_bwtrle import NCDbwtrle
from ._ncd_bz2 import NCDbz2
from ._ncd_lzma import NCDlzma
from ._ncd_lzss import NCDlzss
from ._ncd_paq9a import NCDpaq9a
from ._ncd_rle import NCDrle
from ._ncd_zlib import NCDzlib
from ._needleman_wunsch import NeedlemanWunsch
from ._overlap import Overlap
from ._ozbay import Ozbay
from ._pattern import Pattern
from ._pearson_chi_squared import PearsonChiSquared
from ._pearson_heron_ii import PearsonHeronII
from ._pearson_ii import PearsonII
from ._pearson_iii import PearsonIII
from ._pearson_phi import PearsonPhi
from ._peirce import Peirce
from ._phonetic_distance import PhoneticDistance
from ._phonetic_edit_distance import PhoneticEditDistance
from ._positional_q_gram_dice import PositionalQGramDice
from ._positional_q_gram_jaccard import PositionalQGramJaccard
from ._positional_q_gram_overlap import PositionalQGramOverlap
from ._prefix import Prefix
from ._q_gram import QGram
from ._quantitative_cosine import QuantitativeCosine
from ._quantitative_dice import QuantitativeDice
from ._quantitative_jaccard import QuantitativeJaccard
from ._ratcliff_obershelp import RatcliffObershelp
from ._raup_crick import RaupCrick
from ._rees_levenshtein import ReesLevenshtein
from ._relaxed_hamming import RelaxedHamming
from ._roberts import Roberts
from ._rogers_tanimoto import RogersTanimoto
from ._rogot_goldberg import RogotGoldberg
from ._rouge_l import RougeL
from ._rouge_s import RougeS
from ._rouge_su import RougeSU
from ._rouge_w import RougeW
from ._russell_rao import RussellRao
from ._saps import SAPS
from ._scott_pi import ScottPi
from ._shape import Shape
from ._shapira_storer_i import ShapiraStorerI
from ._sift4 import Sift4
from ._sift4_extended import Sift4Extended
from ._sift4_simplest import Sift4Simplest
from ._single_linkage import SingleLinkage
from ._size import Size
from ._smith_waterman import SmithWaterman
from ._soft_cosine import SoftCosine
from ._softtf_idf import SoftTFIDF
from ._sokal_michener import SokalMichener
from ._sokal_sneath_i import SokalSneathI
from ._sokal_sneath_ii import SokalSneathII
from ._sokal_sneath_iii import SokalSneathIII
from ._sokal_sneath_iv import SokalSneathIV
from ._sokal_sneath_v import SokalSneathV
from ._sorgenfrei import Sorgenfrei
from ._ssk import SSK
from ._steffensen import Steffensen
from ._stiles import Stiles
from ._strcmp95 import Strcmp95
from ._stuart_tau import StuartTau
from ._suffix import Suffix
from ._synoname import Synoname
from ._tarantula import Tarantula
from ._tarwid import Tarwid
from ._tetrachoric import Tetrachoric
from ._tf_idf import TFIDF
from ._tichy import Tichy
from ._token_distance import _TokenDistance
from ._tulloss_r import TullossR
from ._tulloss_s import TullossS
from ._tulloss_t import TullossT
from ._tulloss_u import TullossU
from ._tversky import Tversky
from ._typo import Typo
from ._unigram_subtuple import UnigramSubtuple
from ._unknown_a import UnknownA
from ._unknown_b import UnknownB
from ._unknown_c import UnknownC
from ._unknown_d import UnknownD
from ._unknown_e import UnknownE
from ._unknown_f import UnknownF
from ._unknown_g import UnknownG
from ._unknown_h import UnknownH
from ._unknown_i import UnknownI
from ._unknown_j import UnknownJ
from ._unknown_k import UnknownK
from ._unknown_l import UnknownL
from ._unknown_m import UnknownM
from ._upholt import Upholt
from ._vps import VPS
from ._warrens_i import WarrensI
from ._warrens_ii import WarrensII
from ._warrens_iii import WarrensIII
from ._warrens_iv import WarrensIV
from ._warrens_v import WarrensV
from ._weighted_jaccard import WeightedJaccard
from ._whittaker import Whittaker
from ._yates_chi_squared import YatesChiSquared
from ._yjhhr import YJHHR
from ._yujian_bo import YujianBo
from ._yule_q import YuleQ
from ._yule_q_ii import YuleQII
from ._yule_y import YuleY
__all__ = [
'_Distance',
'_TokenDistance',
'Levenshtein',
'DamerauLevenshtein',
'ShapiraStorerI',
'Marking',
'MarkingMetric',
'YujianBo',
'HigueraMico',
'Indel',
'SAPS',
'MetaLevenshtein',
'Covington',
'ALINE',
'FlexMetric',
'BISIM',
'DiscountedLevenshtein',
'PhoneticEditDistance',
'Hamming',
'MLIPNS',
'RelaxedHamming',
'Tichy',
'BlockLevenshtein',
'CormodeLZ',
'JaroWinkler',
'Strcmp95',
'IterativeSubString',
'AMPLE',
'AZZOO',
'Anderberg',
'AndresMarzoDelta',
'BaroniUrbaniBuserI',
'BaroniUrbaniBuserII',
'BatageljBren',
'BaulieuI',
'BaulieuII',
'BaulieuIII',
'BaulieuIV',
'BaulieuV',
'BaulieuVI',
'BaulieuVII',
'BaulieuVIII',
'BaulieuIX',
'BaulieuX',
'BaulieuXI',
'BaulieuXII',
'BaulieuXIII',
'BaulieuXIV',
'BaulieuXV',
'BeniniI',
'BeniniII',
'Bennet',
'BraunBlanquet',
'Canberra',
'Cao',
'ChaoDice',
'ChaoJaccard',
'Chebyshev',
'Chord',
'Clark',
'Clement',
'CohenKappa',
'Cole',
'ConsonniTodeschiniI',
'ConsonniTodeschiniII',
'ConsonniTodeschiniIII',
'ConsonniTodeschiniIV',
'ConsonniTodeschiniV',
'Cosine',
'Dennis',
'Dice',
'DiceAsymmetricI',
'DiceAsymmetricII',
'Digby',
'Dispersion',
'Doolittle',
'Dunning',
'Euclidean',
'Eyraud',
'FagerMcGowan',
'Faith',
'Fidelity',
'Fleiss',
'FleissLevinPaik',
'ForbesI',
'ForbesII',
'Fossum',
'GeneralizedFleiss',
'Gilbert',
'GilbertWells',
'GiniI',
'GiniII',
'Goodall',
'GoodmanKruskalLambda',
'GoodmanKruskalLambdaR',
'GoodmanKruskalTauA',
'GoodmanKruskalTauB',
'GowerLegendre',
'GuttmanLambdaA',
'GuttmanLambdaB',
'GwetAC',
'Hamann',
'HarrisLahey',
'Hassanat',
'HawkinsDotson',
'Hellinger',
'HendersonHeron',
'HornMorisita',
'Hurlbert',
'Jaccard',
'JaccardNM',
'Johnson',
'KendallTau',
'KentFosterI',
'KentFosterII',
'KoppenI',
'KoppenII',
'KuderRichardson',
'KuhnsI',
'KuhnsII',
'KuhnsIII',
'KuhnsIV',
'KuhnsV',
'KuhnsVI',
'KuhnsVII',
'KuhnsVIII',
'KuhnsIX',
'KuhnsX',
'KuhnsXI',
'KuhnsXII',
'KulczynskiI',
'KulczynskiII',
'Lorentzian',
'Maarel',
'Morisita',
'Manhattan',
'Michelet',
'Millar',
'Minkowski',
'MASI',
'Matusita',
'MaxwellPilliner',
'McConnaughey',
'McEwenMichael',
'Mountford',
'MutualInformation',
'MSContingency',
'Overlap',
'Pattern',
'PearsonHeronII',
'PearsonII',
'PearsonIII',
'PearsonChiSquared',
'PearsonPhi',
'Peirce',
'QGram',
'RaupCrick',
'ReesLevenshtein',
'RogersTanimoto',
'RogotGoldberg',
'RussellRao',
'ScottPi',
'Shape',
'Size',
'SokalMichener',
'SokalSneathI',
'SokalSneathII',
'SokalSneathIII',
'SokalSneathIV',
'SokalSneathV',
'Sorgenfrei',
'Steffensen',
'Stiles',
'StuartTau',
'Tarantula',
'Tarwid',
'Tetrachoric',
'TullossR',
'TullossS',
'TullossT',
'TullossU',
'Tversky',
'UnigramSubtuple',
'UnknownA',
'UnknownB',
'UnknownC',
'UnknownD',
'UnknownE',
'UnknownF',
'UnknownG',
'UnknownH',
'UnknownI',
'UnknownJ',
'UnknownK',
'UnknownL',
'UnknownM',
'Upholt',
'WarrensI',
'WarrensII',
'WarrensIII',
'WarrensIV',
'WarrensV',
'WeightedJaccard',
'Whittaker',
'YatesChiSquared',
'YuleQ',
'YuleQII',
'YuleY',
'YJHHR',
'Bhattacharyya',
'BrainerdRobinson',
'QuantitativeCosine',
'QuantitativeDice',
'QuantitativeJaccard',
'Roberts',
'AverageLinkage',
'SingleLinkage',
'CompleteLinkage',
'Bag',
'SoftCosine',
'MongeElkan',
'TFIDF',
'SoftTFIDF',
'JensenShannon',
'FellegiSunter',
'MinHash',
'BLEU',
'RougeL',
'RougeW',
'RougeS',
'RougeSU',
'PositionalQGramDice',
'PositionalQGramJaccard',
'PositionalQGramOverlap',
'NeedlemanWunsch',
'SmithWaterman',
'Gotoh',
'LCSseq',
'LCSstr',
'LCPrefix',
'LCSuffix',
'RatcliffObershelp',
'Ident',
'Length',
'Prefix',
'Suffix',
'NCDzlib',
'NCDbz2',
'NCDlzma',
'NCDarith',
'NCDbwtrle',
'NCDrle',
'NCDpaq9a',
'NCDlzss',
'FuzzyWuzzyPartialString',
'FuzzyWuzzyTokenSort',
'FuzzyWuzzyTokenSet',
'PhoneticDistance',
'MRA',
'Editex',
'Baystat',
'Eudex',
'Sift4',
'Sift4Simplest',
'Sift4Extended',
'Typo',
'Synoname',
'Ozbay',
'ISG',
'Inclusion',
'Guth',
'VPS',
'LIG3',
'SSK',
]
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_aline.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._aline.
ALINE alignment, similarity, and distance
"""
from copy import deepcopy
from typing import Any, Callable, Dict, List, Tuple, Union, cast
from numpy import float_, inf, zeros
from ._distance import _Distance
__all__ = ['ALINE']
class ALINE(_Distance):
r"""ALINE alignment, similarity, and distance.
ALINE alignment was developed by
:cite:`Kondrak:2000,Kondrak:2002,Downey:2008`, and establishes an
alignment algorithm based on multivalued phonetic features and feature
salience weights. Along with the alignment itself, the algorithm produces a
term similarity score.
:cite:`Downey:2008` develops ALINE's similarity score into a similarity
measure & distance measure:
.. math::
sim_{ALINE} = \frac{2 \cdot score_{ALINE}(src, tar)}
{score_{ALINE}(src, src) + score_{ALINE}(tar, tar)}
However, because the average of the two self-similarity scores is not
guaranteed to be greater than or equal to the similarity score between
the two strings, by default, this formula is not used here in order to
guarantee that the similarity measure is bounded to [0, 1]. Instead,
Kondrak's similarity measure is employed:
.. math::
sim_{ALINE} = \frac{score_{ALINE}(src, tar)}
{max(score_{ALINE}(src, src), score_{ALINE}(tar, tar))}
.. versionadded:: 0.4.0
"""
# The three dicts below are mostly copied from NLTK's implementation
# https://www.nltk.org/_modules/nltk/metrics/aline.html
# But values have been restored, as much as possible, to the reference
# values supplied in Kondrak's paper.
# Numeric value (on [0.0, 1.0]) assigned to each setting of a
# multivalued phonetic feature.  The absolute difference between two
# segments' values for a feature, scaled by that feature's salience,
# contributes to the feature-difference term used in alignment scoring.
feature_weights = {
    # place
    'bilabial': 1.0,
    'labiodental': 0.95,
    'dental': 0.9,
    'alveolar': 0.85,
    'retroflex': 0.8,
    'palato-alveolar': 0.75,
    'palatal': 0.7,
    'velar': 0.6,
    'uvular': 0.5,
    'pharyngeal': 0.3,
    'glottal': 0.1,
    # manner
    'stop': 1.0,
    'affricate': 0.9,
    'fricative': 0.8,
    'approximant': 0.6,
    'trill': 0.55,  # not in original
    'tap': 0.5,  # not in original
    'high vowel': 0.4,
    'mid vowel': 0.2,
    'low vowel': 0.0,
    # high
    'high': 1.0,
    'mid': 0.5,
    'low': 0.0,
    # back
    'front': 1.0,
    'central': 0.5,
    'back': 0.0,
    # binary features
    'plus': 1.0,
    'minus': 0.0,
}
# Features compared when both segments are vowels, i.e. when neither
# segment's 'manner' weight exceeds that of 'high vowel'.
v_features = {
    'syllabic',
    'nasal',
    'retroflex',
    'high',
    'back',
    'round',
    'long',
}
# Features compared when at least one of the two segments is a
# consonant (its 'manner' weight exceeds that of 'high vowel').
c_features = {
    'syllabic',
    'manner',
    'voice',
    'nasal',
    'retroflex',
    'lateral',
    'aspirated',
    'place',
}
# Salience (importance) multiplier for each feature.  Per-feature
# weight differences are scaled by these values when summing the
# dissimilarity between two segments.
salience = {
    'syllabic': 5,
    'voice': 10,
    'lateral': 10,
    'high': 5,
    'manner': 50,
    'long': 1,
    'place': 40,
    'nasal': 10,
    'aspirated': 5,
    'back': 5,
    'retroflex': 10,
    'round': 5,
}
phones_ipa = {
'p': {
'place': 'bilabial',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'b': {
'place': 'bilabial',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
't': {
'place': 'alveolar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'd': {
'place': 'alveolar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʈ': {
'place': 'retroflex',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɖ': {
'place': 'retroflex',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'c': {
'place': 'palatal',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɟ': {
'place': 'palatal',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'k': {
'place': 'velar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'g': {
'place': 'velar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'q': {
'place': 'uvular',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɢ': {
'place': 'uvular',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʔ': {
'place': 'glottal',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'm': {
'place': 'bilabial',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɱ': {
'place': 'labiodental',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'n': {
'place': 'alveolar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɳ': {
'place': 'retroflex',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɲ': {
'place': 'palatal',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ŋ': {
'place': 'velar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɴ': {
'place': 'uvular',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʙ': {
'place': 'bilabial',
'manner': 'trill',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'r': {
'place': 'alveolar',
'manner': 'trill',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʀ': {
'place': 'uvular',
'manner': 'trill',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɾ': {
'place': 'alveolar',
'manner': 'tap',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɽ': {
'place': 'retroflex',
'manner': 'tap',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɸ': {
'place': 'bilabial',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'β': {
'place': 'bilabial',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'f': {
'place': 'labiodental',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'v': {
'place': 'labiodental',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'θ': {
'place': 'dental',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ð': {
'place': 'dental',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
's': {
'place': 'alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'z': {
'place': 'alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʃ': {
'place': 'palato-alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʒ': {
'place': 'palato-alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʂ': {
'place': 'retroflex',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʐ': {
'place': 'retroflex',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ç': {
'place': 'palatal',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʝ': {
'place': 'palatal',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'x': {
'place': 'velar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɣ': {
'place': 'velar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'χ': {
'place': 'uvular',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʁ': {
'place': 'uvular',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ħ': {
'place': 'pharyngeal',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ʕ': {
'place': 'pharyngeal',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'h': {
'place': 'glottal',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɦ': {
'place': 'glottal',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɬ': {
'place': 'alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'plus',
'aspirated': 'minus',
},
'ɮ': {
'place': 'alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'plus',
'aspirated': 'minus',
},
'ʋ': {
'place': 'labiodental',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɹ': {
'place': 'alveolar',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɻ': {
'place': 'retroflex',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
'aspirated': 'minus',
},
'j': {
'place': 'palatal',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'ɰ': {
'place': 'velar',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
},
'l': {
'place': 'alveolar',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'plus',
'aspirated': 'minus',
},
'w': {
'place': 'velar',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'aspirated': 'minus',
'double': 'bilabial',
},
'i': {
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'front',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'y': {
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'front',
'round': 'plus',
'long': 'minus',
'aspirated': 'minus',
},
'e': {
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'front',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'ø': {
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'front',
'round': 'plus',
'long': 'minus',
'aspirated': 'minus',
},
'ɛ': {
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'front',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'œ': {
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'front',
'round': 'plus',
'long': 'minus',
'aspirated': 'minus',
},
'æ': {
'manner': 'low vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'low',
'back': 'front',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'a': {
'manner': 'low vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'low',
'back': 'front',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'ɨ': {
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'central',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'ʉ': {
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'central',
'round': 'plus',
'long': 'minus',
'aspirated': 'minus',
},
'ə': {
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'central',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'u': {
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'back',
'round': 'plus',
'long': 'minus',
'aspirated': 'minus',
},
'o': {
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'back',
'round': 'plus',
'long': 'minus',
'aspirated': 'minus',
},
'ɔ': {
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'back',
'round': 'plus',
'long': 'minus',
'aspirated': 'minus',
},
'ɒ': {
'manner': 'low vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'low',
'back': 'back',
'round': 'minus',
'long': 'minus',
'aspirated': 'minus',
},
'ː': {'long': 'plus', 'supplemental': 'True'},
'ʰ': {'aspirated': 'plus', 'supplemental': 'True'},
} # type: Dict[str, Dict[str, str]]
phones_kondrak = {
'a': {
'place': 'velar',
'manner': 'low vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'low',
'back': 'central',
'round': 'minus',
},
'b': {
'place': 'bilabial',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'c': {
'place': 'alveolar',
'manner': 'affricate',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'd': {
'place': 'alveolar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'e': {
'place': 'palatal',
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'front',
'round': 'minus',
},
'f': {
'place': 'labiodental',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'g': {
'place': 'velar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'h': {
'place': 'glottal',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'i': {
'place': 'palatal',
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'front',
'round': 'plus',
},
'j': {
'place': 'alveolar',
'manner': 'affricate',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'k': {
'place': 'velar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'l': {
'place': 'alveolar',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'plus',
},
'm': {
'place': 'bilabial',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
},
'n': {
'place': 'alveolar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'plus',
'retroflex': 'minus',
'lateral': 'minus',
},
'o': {
'place': 'velar',
'manner': 'mid vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'mid',
'back': 'back',
'round': 'plus',
},
'p': {
'place': 'bilabial',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'q': {
'place': 'glottal',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'r': {
'place': 'retroflex',
'manner': 'approximant',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'plus',
'lateral': 'minus',
},
's': {
'place': 'alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
't': {
'place': 'alveolar',
'manner': 'stop',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'u': {
'place': 'velar',
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'back',
'round': 'plus',
},
'v': {
'place': 'labiodental',
'manner': 'fricative',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'w': {
'place': 'velar',
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'back',
'round': 'plus',
'double': 'bilabial',
},
'x': {
'place': 'velar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'minus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'y': {
'place': 'velar',
'manner': 'high vowel',
'syllabic': 'plus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
'high': 'high',
'back': 'front',
'round': 'minus',
},
'z': {
'place': 'alveolar',
'manner': 'fricative',
'syllabic': 'minus',
'voice': 'plus',
'nasal': 'minus',
'retroflex': 'minus',
'lateral': 'minus',
},
'A': {'aspirated': 'plus', 'supplemental': 'True'},
'B': {'back': 'back', 'supplemental': 'True'},
'C': {'back': 'central', 'supplemental': 'True'},
'D': {'place': 'dental', 'supplemental': 'True'},
'F': {'back': 'front', 'supplemental': 'True'},
'H': {'long': 'plus', 'supplemental': 'True'},
'N': {'nasal': 'plus', 'supplemental': 'True'},
'P': {'place': 'palatal', 'supplemental': 'True'},
'R': {'round': 'plus', 'supplemental': 'True'},
'S': {'manner': 'fricative', 'supplemental': 'True'},
'V': {'place': 'palato-alveolar', 'supplemental': 'True'},
} # type: Dict[str, Dict[str, str]]
def __init__(
    self,
    epsilon: float = 0.0,
    c_skip: float = -10,
    c_sub: float = 35,
    c_exp: float = 45,
    c_vwl: float = 10,
    mode: str = 'local',
    phones: str = 'aline',
    normalizer: Callable[[List[float]], float] = max,
    **kwargs: Any
) -> None:
    """Initialize ALINE instance.

    Parameters
    ----------
    epsilon : float
        The fraction (out of 1.0) of the maximum ALINE score above which
        alignments are kept.  At 0, only alignments achieving the
        maximum score are returned; at 1, every alignment scoring 0 or
        higher is returned.
    c_skip : float
        Cost of an insertion or deletion
    c_sub : float
        Cost of a substitution
    c_exp : float
        Cost of an expansion or contraction
    c_vwl : float
        Extra cost applied to vowel substitution, expansion, or
        contraction
    mode : str
        Alignment mode: ``local`` (default), ``global``, ``half-local``,
        or ``semi-global``.  Anything else silently falls back to
        ``local``.
    phones : str
        Phonetic symbol set:

            - ``aline`` selects Kondrak's original symbol set
            - ``ipa`` selects IPA symbols

    normalizer : function
        Function mapping a list of scores to the normalization term by
        which the edit distance is divided (``max`` by default).  For
        the normalization of Downey, et al. (2008), pass
        ``lambda x: sum(x)/len(x)``.
    **kwargs
        Arbitrary keyword arguments


    .. versionadded:: 0.4.0

    """
    super(ALINE, self).__init__(**kwargs)

    # Scoring parameters (Kondrak's defaults).
    self._epsilon = epsilon
    self._c_skip = c_skip
    self._c_sub = c_sub
    self._c_exp = c_exp
    self._c_vwl = c_vwl

    # Unrecognized modes fall back to 'local', the documented default.
    self._mode = (
        mode
        if mode in {'local', 'global', 'half-local', 'semi-global'}
        else 'local'
    )

    # Pick the phone feature table matching the requested symbol set.
    self._phones = (
        self.phones_ipa if phones == 'ipa' else self.phones_kondrak
    )

    self._normalizer = normalizer
def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
    """Return the top ALINE alignment of two strings.

    The `top` alignment is simply the first alignment carrying the best
    score, so this method provides a single-tuple return value rather
    than the list produced by :meth:`alignments`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    tuple(float, str, str)
        ALINE alignment and its score

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignment('cat', 'hat')
    (50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')
    >>> cmp.alignment('niall', 'neil')
    (90.0, '‖ n i a ll ‖', '‖ n e i l ‖')
    >>> cmp.alignment('aluminum', 'catalan')
    (81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')
    >>> cmp.alignment('atcg', 'tagc')
    (65.0, '‖ a t c ‖ g', 't ‖ a g c ‖')


    .. versionadded:: 0.4.1

    """
    # alignments() returns its results best-first; hand back the first.
    results = cast(
        List[Tuple[float, str, str]], self.alignments(src, tar)
    )
    return results[0]
def alignments(
    self, src: str, tar: str, score_only: bool = False
) -> Union[float, List[Tuple[float, str, str]]]:
    """Return the ALINE alignments of two strings.

    Implements Kondrak's ALINE: a dynamic-programming alignment over
    phonetic feature weights, followed by a recursive traceback that
    collects every alignment scoring within ``(1 - epsilon)`` of the
    best dynamic-programming score.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison
    score_only : bool
        Return the score only, not the alignments

    Returns
    -------
    list(tuple(float, str, str)) or float
        ALINE alignments and their scores or the top score

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.alignments('cat', 'hat')
    [(50.0, 'c ‖ a t ‖', 'h ‖ a t ‖')]
    >>> cmp.alignments('niall', 'neil')
    [(90.0, '‖ n i a ll ‖', '‖ n e i l ‖')]
    >>> cmp.alignments('aluminum', 'catalan')
    [(81.5, '‖ a l u m ‖ inum', 'cat ‖ a l a n ‖')]
    >>> cmp.alignments('atcg', 'tagc')
    [(65.0, '‖ a t c ‖ g', 't ‖ a g c ‖'), (65.0, 'a ‖ tc - g ‖',
    '‖ t a g ‖ c')]


    .. versionadded:: 0.4.0
    .. versionchanged:: 0.4.1
        Renamed from .alignment to .alignments

    """

    # --- ALINE "signature" (local scoring) functions ------------------

    def _sig_skip(*args: Any) -> float:
        # Constant skip (indel) cost; the arguments are ignored, so
        # callers may pass either a token or a feature-weight dict.
        return self._c_skip

    def _sig_sub(seg1: Dict[str, float], seg2: Dict[str, float]) -> float:
        # Substitution score: base reward minus feature distance and
        # vowel penalties of both segments.
        return (
            self._c_sub
            - _delta(seg1, seg2)
            - _sig_vwl(seg1)
            - _sig_vwl(seg2)
        )

    def _sig_exp(
        seg1: Dict[str, float],
        seg2a: Dict[str, float],
        seg2b: Dict[str, float],
    ) -> float:
        # Expansion/compression score: one segment aligned against two.
        return (
            self._c_exp
            - _delta(seg1, seg2a)
            - _delta(seg1, seg2b)
            - _sig_vwl(seg1)
            - max(_sig_vwl(seg2a), _sig_vwl(seg2b))
        )

    def _sig_vwl(seg1: Dict[str, float]) -> float:
        # Vowel penalty: zero for consonants (manner weight above that
        # of 'high vowel'), otherwise the configured vowel cost.
        return (
            0.0
            if seg1['manner'] > self.feature_weights['high vowel']
            else self._c_vwl
        )

    def _delta(seg1: Dict[str, float], seg2: Dict[str, float]) -> float:
        # Salience-weighted feature distance between two segments.  The
        # consonant feature set is used when either segment is a
        # consonant; otherwise the vowel feature set.
        features = (
            self.c_features
            if max(seg1['manner'], seg2['manner'])
            > self.feature_weights['high vowel']
            else self.v_features
        )
        diff = 0.0
        for f in features:
            diff += (
                abs(seg1.get(f, 0.0) - seg2.get(f, 0.0)) * self.salience[f]
            )
        return diff

    def _retrieve(
        i: int, j: int, score: float, out: List[Tuple[str, str]]
    ) -> None:
        # Recursive traceback from cell (i, j): follows every DP move
        # whose completed path can still reach ``threshold``, collecting
        # aligned token pairs in ``out``.

        def _record(score: float, out: List[Tuple[str, str]]) -> None:
            # Finalize one alignment: close the aligned region, add the
            # unaligned prefix tokens, then format both strings with the
            # aligned region fenced by '‖'.
            out.append(('‖', '‖'))
            for i1 in range(i - 1, -1, -1):
                out.append((src_tok[i1], ''))
            for j1 in range(j - 1, -1, -1):
                out.append(('', tar_tok[j1]))
            if self._mode == 'global':
                # Global mode charges skips for the unaligned prefix.
                score += (i + j) * _sig_skip('')
            out = out[::-1]
            src_alignment = []
            tar_alignment = []
            out.append(('‖', '‖'))
            part = 0
            # Inside the fenced (odd) part, tokens are gathered in lists
            # and space-joined; outside they are concatenated directly.
            s_segment = ''  # type: Union[str, List[str]]
            t_segment = ''  # type: Union[str, List[str]]
            for ss, ts in out:
                if ss == '‖':
                    if part % 2 == 0:
                        src_alignment.append(s_segment)
                        tar_alignment.append(t_segment)
                        s_segment = []
                        t_segment = []
                    else:
                        src_alignment.append(' '.join(s_segment))
                        tar_alignment.append(' '.join(t_segment))
                        s_segment = ''
                        t_segment = ''
                    part += 1
                else:
                    if part % 2 == 0:
                        s_segment = cast(str, s_segment) + ss
                        t_segment = cast(str, t_segment) + ts
                    else:
                        # Pad the shorter token so both lines stay in
                        # column-wise correspondence.
                        cast(List[str], s_segment).append(
                            ss + ' ' * (len(ts) - len(ss))
                        )
                        cast(List[str], t_segment).append(
                            ts + ' ' * (len(ss) - len(ts))
                        )
            src_alignment_str = ' ‖ '.join(
                cast(List[str], src_alignment)
            ).strip()
            tar_alignment_str = ' ‖ '.join(
                cast(List[str], tar_alignment)
            ).strip()
            alignments.append(
                (score, src_alignment_str, tar_alignment_str)
            )
            return

        if s_mat[i, j] == 0:
            # Start of the aligned region reached; emit the alignment.
            _record(score, out)
            return
        else:
            # Case 1: substitution of src[i-1] with tar[j-1].
            if (
                i > 0
                and j > 0
                and s_mat[i - 1, j - 1]
                + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1])
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append((src_tok[i - 1], tar_tok[j - 1]))
                _retrieve(
                    i - 1,
                    j - 1,
                    score
                    + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1]),
                    loc_out,
                )
                loc_out.pop()
            # Case 2: skip (deletion) of tar[j-1].
            if (
                j > 0
                and s_mat[i, j - 1] + _sig_skip(tar_tok[j - 1]) + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(('-', tar_tok[j - 1]))
                _retrieve(
                    i, j - 1, score + _sig_skip(tar_tok[j - 1]), loc_out
                )
                loc_out.pop()
            # Case 3: expansion of src[i-1] to tar[j-2:j].
            if (
                i > 0
                and j > 1
                and s_mat[i - 1, j - 2]
                + _sig_exp(
                    src_feat_wt[i - 1],
                    tar_feat_wt[j - 2],
                    tar_feat_wt[j - 1],
                )
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(
                    (src_tok[i - 1], tar_tok[j - 2] + tar_tok[j - 1],)
                )
                _retrieve(
                    i - 1,
                    j - 2,
                    score
                    + _sig_exp(
                        src_feat_wt[i - 1],
                        tar_feat_wt[j - 2],
                        tar_feat_wt[j - 1],
                    ),
                    loc_out,
                )
                loc_out.pop()
            # Case 4: skip (deletion) of src[i-1].
            if (
                i > 0
                and s_mat[i - 1, j] + _sig_skip(src_tok[i - 1]) + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append((src_tok[i - 1], '-'))
                _retrieve(
                    i - 1, j, score + _sig_skip(src_tok[i - 1]), loc_out
                )
                loc_out.pop()
            # Case 5: expansion of tar[j-1] to src[i-2:i].
            if (
                i > 1
                and j > 0
                and s_mat[i - 2, j - 1]
                + _sig_exp(
                    tar_feat_wt[j - 1],
                    src_feat_wt[i - 2],
                    src_feat_wt[i - 1],
                )
                + score
                >= threshold
            ):
                loc_out = deepcopy(out)
                loc_out.append(
                    (src_tok[i - 2] + src_tok[i - 1], tar_tok[j - 1],)
                )
                _retrieve(
                    i - 2,
                    j - 1,
                    score
                    + _sig_exp(
                        tar_feat_wt[j - 1],
                        src_feat_wt[i - 2],
                        src_feat_wt[i - 1],
                    ),
                    loc_out,
                )
                loc_out.pop()

    sg_max = 0.0
    # Tokenize the inputs into known phones and (mutable copies of)
    # their feature dicts; characters not in the phone table are
    # silently dropped.
    src_tok = []  # type: List[str]
    src_feat = []  # type: List[Dict[str, str]]
    tar_tok = []  # type: List[str]
    tar_feat = []  # type: List[Dict[str, str]]
    for ch in src:
        if ch in self._phones:
            src_tok.append(ch)
            src_feat.append(dict(self._phones[ch]))
    for ch in tar:
        if ch in self._phones:
            tar_tok.append(ch)
            tar_feat.append(dict(self._phones[ch]))
    # Fold each 'supplemental' segment (modifier characters) into the
    # nearest preceding base segment, merging its feature values.
    for i in range(1, len(src_feat)):
        if 'supplemental' in src_feat[i]:
            j = i - 1
            while j > -1:
                if 'supplemental' not in src_feat[j]:
                    src_tok[j] += src_tok[i]
                    for key, value in src_feat[i].items():
                        if key != 'supplemental':
                            src_feat[j][key] = value
                    # Setting j to 0 terminates the scan after the
                    # following decrement.
                    j = 0
                j -= 1
    # Drop the (now merged) supplemental segments.
    zipped = [
        fb for fb in zip(src_feat, src_tok) if 'supplemental' not in fb[0]
    ]
    if zipped:
        src_feat, src_tok = zip(*zipped)  # type: ignore
    else:
        src_feat, src_tok = [], []
    # Translate feature names into numeric weights.
    src_feat_wt = []  # type: List[Dict[str, float]]
    for f_dict in src_feat:
        src_feat_wt.append(
            {
                key: self.feature_weights[f_dict[key]]
                for key in f_dict.keys()
            }
        )
    src_len = len(src_tok)
    # Same supplemental-merging and weighting for the target string.
    for i in range(1, len(tar_feat)):
        if 'supplemental' in tar_feat[i]:
            j = i - 1
            while j > -1:
                if 'supplemental' not in tar_feat[j]:
                    tar_tok[j] += tar_tok[i]
                    for key, value in tar_feat[i].items():
                        if key != 'supplemental':
                            tar_feat[j][key] = value
                    j = 0
                j -= 1
    zipped = [
        fb for fb in zip(tar_feat, tar_tok) if 'supplemental' not in fb[0]
    ]
    if zipped:
        tar_feat, tar_tok = zip(*zipped)  # type: ignore
    else:
        tar_feat, tar_tok = [], []
    tar_feat_wt = []  # type: List[Dict[str, float]]
    for f_dict in tar_feat:
        tar_feat_wt.append(
            {
                key: self.feature_weights[f_dict[key]]
                for key in f_dict.keys()
            }
        )
    tar_len = len(tar_tok)

    # Dynamic-programming score matrix.
    s_mat = zeros((src_len + 1, tar_len + 1), dtype=float_)
    if self._mode == 'global':
        # Global alignment pays skip costs along the first row/column.
        for i in range(1, src_len + 1):
            s_mat[i, 0] = s_mat[i - 1, 0] + _sig_skip(src_tok[i - 1])
        for j in range(1, tar_len + 1):
            s_mat[0, j] = s_mat[0, j - 1] + _sig_skip(tar_tok[j - 1])
    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            s_mat[i, j] = max(
                s_mat[i - 1, j] + _sig_skip(src_feat_wt[i - 1]),
                s_mat[i, j - 1] + _sig_skip(tar_feat_wt[j - 1]),
                s_mat[i - 1, j - 1]
                + _sig_sub(src_feat_wt[i - 1], tar_feat_wt[j - 1]),
                s_mat[i - 1, j - 2]
                + _sig_exp(
                    src_feat_wt[i - 1],
                    tar_feat_wt[j - 2],
                    tar_feat_wt[j - 1],
                )
                if j > 1
                else -inf,
                s_mat[i - 2, j - 1]
                + _sig_exp(
                    tar_feat_wt[j - 1],
                    src_feat_wt[i - 2],
                    src_feat_wt[i - 1],
                )
                if i > 1
                else -inf,
                # Local variants may reset the running score to zero.
                0 if self._mode in {'local', 'half-local'} else -inf,
            )
            if s_mat[i, j] > sg_max:
                if self._mode == 'semi-global':
                    # Semi-global maxima count only on the last
                    # row/column.
                    if i == src_len or j == tar_len:
                        sg_max = s_mat[i, j]
                else:
                    sg_max = s_mat[i, j]
    # NOTE(review): sg_max is computed above but never read below;
    # for 'semi-global' dp_score comes from s_mat.max() instead —
    # confirm this is intended.
    if self._mode in {'global', 'half-local'}:
        dp_score = s_mat[src_len, tar_len]
    else:
        dp_score = s_mat.max()
    if score_only:
        return cast(float, dp_score)
    # Trace back every end cell within (1 - epsilon) of the best score.
    threshold = (1 - self._epsilon) * dp_score
    alignments = []  # type: List[Tuple[float, str, str]]
    for i in range(1, src_len + 1):
        for j in range(1, tar_len + 1):
            if self._mode in {'global', 'half-local'} and (
                i < src_len or j < tar_len
            ):
                continue
            if self._mode == 'semi-global' and (
                i < src_len and j < tar_len
            ):
                continue
            if s_mat[i, j] >= threshold:
                # Seed the traceback with the unaligned suffix tokens.
                out = []
                for j1 in range(tar_len - 1, j - 1, -1):
                    out.append(('', tar_tok[j1]))
                for i1 in range(src_len - 1, i - 1, -1):
                    out.append((src_tok[i1], ''))
                out.append(('‖', '‖'))
                _retrieve(i, j, 0, out)
    return sorted(alignments, key=lambda _: _[0], reverse=True)
def sim_score(self, src: str, tar: str) -> float:
    """Return the ALINE alignment score of two strings.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        ALINE alignment score (the best dynamic-programming score)

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.sim_score('cat', 'hat')
    50.0
    >>> cmp.sim_score('niall', 'neil')
    90.0
    >>> cmp.sim_score('aluminum', 'catalan')
    81.5
    >>> cmp.sim_score('atcg', 'tagc')
    65.0


    .. versionadded:: 0.4.0

    """
    # Two empty strings are defined as a perfect match.
    if not src and not tar:
        return 1.0
    score = self.alignments(src, tar, score_only=True)
    return cast(float, score)
def sim(self, src: str, tar: str) -> float:
    """Return the normalized ALINE similarity of two strings.

    The raw alignment score is divided by the self-alignment scores of
    ``src`` and ``tar``, combined by ``self._normalizer``.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Normalized ALINE similarity

    Examples
    --------
    >>> cmp = ALINE()
    >>> cmp.sim('cat', 'hat')
    0.5882352941176471
    >>> cmp.sim('niall', 'neil')
    0.6666666666666666
    >>> cmp.sim('aluminum', 'catalan')
    0.4075
    >>> cmp.sim('atcg', 'tagc')
    0.5416666666666666


    .. versionadded:: 0.4.0

    """
    score = self.sim_score(src, tar)
    if score:
        # Normalize the raw score by the self-similarity scores of the
        # two inputs.
        return score / self._normalizer(
            [self.sim_score(src, src), self.sim_score(tar, tar)]
        )
    # A zero raw score normalizes to zero (avoids 0/0 for non-phone
    # inputs whose self-scores are also zero).
    return 0.0
# Run this module's doctests when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
================================================
FILE: abydos/distance/_ample.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._ample.
AMPLE similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['AMPLE']
class AMPLE(_TokenDistance):
    r"""AMPLE similarity.

    The AMPLE similarity :cite:`Dallmeier:2005,Abreu:2007` is defined in
    getAverageSequenceWeight() in the AverageSequenceWeightEvaluator.java
    file of AMPLE's source code. For two sets X and Y and a population N,
    it is

        .. math::

            sim_{AMPLE}(X, Y) =
            \big|\frac{|X \cap Y|}{|X|} -
            \frac{|Y \setminus X|}{|N \setminus X|}\big|

    In :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, this is

        .. math::

            sim_{AMPLE} =
            \big|\frac{a}{a+b}-\frac{c}{c+d}\big|

    Notes
    -----
    This measure is asymmetric. The first ratio considers how similar the
    two strings are, while the second considers how dissimilar the second
    string is. As a result, both very similar and very dissimilar strings
    will score high on this measure, provided the unique aspects are
    present chiefly in the latter string.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize AMPLE instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(AMPLE, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the AMPLE similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            AMPLE similarity

        Examples
        --------
        >>> cmp = AMPLE()
        >>> cmp.sim('cat', 'hat')
        0.49743589743589745
        >>> cmp.sim('Niall', 'Neil')
        0.32947729220222793
        >>> cmp.sim('aluminum', 'Catalan')
        0.10209049255441008
        >>> cmp.sim('ATCG', 'TAGC')
        0.006418485237483954


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        both = self._intersection_card()
        only_src = self._src_only_card()
        only_tar = self._tar_only_card()
        neither = self._total_complement_card()

        # When a denominator would be 0 its numerator is also 0, so the
        # ratio contributes nothing.  (This deviates from the raw
        # formula to avoid division by zero while retaining the other
        # ratio's contribution.)
        left = both / (both + only_src) if both + only_src else 0.0
        right = only_tar / (only_tar + neither) if only_tar + neither else 0.0

        return abs(left - right)
# Run this module's doctests when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_anderberg.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._anderberg.
Anderberg's d
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Anderberg']
class Anderberg(_TokenDistance):
    r"""Anderberg's D.

    For two sets X and Y and a population N, Anderberg's D
    :cite:`Anderberg:1973` is

        .. math::

            t_1 = max(|X \cap Y|, |X \setminus Y|)+
            max(|Y \setminus X|, |(N \setminus X) \setminus Y|)+\\
            max(|X \cap Y|, |Y \setminus X|)+
            max(|X \setminus Y|, |(N \setminus X) \setminus Y|)\\
            \\
            t_2 = max(|Y|, |N \setminus Y|)+max(|X|, |N \setminus X|)\\
            \\
            sim_{Anderberg}(X, Y) =
            \frac{t_1-t_2}{2|N|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, this is (since :math:`|Y| = a+c`, :math:`|N \setminus Y|
    = b+d`, :math:`|X| = a+b`, and :math:`|N \setminus X| = c+d`)

        .. math::

            sim_{Anderberg} =
            \frac{(max(a,b)+max(c,d)+max(a,c)+max(b,d))-
            (max(a+c,b+d)+max(a+b,c+d))}{2n}

    Notes
    -----
    There are various references to another "Anderberg similarity",
    :math:`sim_{Anderberg} = \frac{8a}{8a+b+c}`, but I cannot
    substantiate the claim that this appears in :cite:`Anderberg:1973`.
    In any case, if you want to use this measure, you may instantiate
    :py:class:`WeightedJaccard` with `weight=8`.

    Anderberg states that "[t]his quantity is the actual reduction in
    the error probability (also the actual increase in the correct
    prediction) as a consequence of using predictor information"
    :cite:`Anderberg:1973`. It ranges [0, 0.5] so a ``sim`` method
    ranging [0, 1] is provided in addition to ``sim_score``, which gives
    the value D itself.

    It is difficult to term this measure a similarity score. Identical
    strings often fail to gain high scores. Also, strings that would
    otherwise be considered quite similar often earn lower scores than
    those that are less similar.

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Anderberg instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(Anderberg, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Anderberg's D similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Anderberg similarity (D itself, in the range [0, 0.5])

        Examples
        --------
        >>> cmp = Anderberg()
        >>> cmp.sim_score('cat', 'hat')
        0.0
        >>> cmp.sim_score('Niall', 'Neil')
        0.0
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # t_1 - t_2 in confusion-table terms; the early return avoids a
        # needless division (and 0/0 when the population is empty).
        num = (max(a, b) + max(c, d) + max(a, c) + max(b, d)) - (
            max(a + c, b + d) + max(a + b, c + d)
        )
        if num == 0.0:
            return 0.0
        return num / (2 * (a + b + c + d))

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Anderberg's D similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Anderberg similarity, in the range [0, 1]

        Examples
        --------
        >>> cmp = Anderberg()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.0
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        # D ranges [0, 0.5]; doubling maps it onto [0, 1].
        return 2 * self.sim_score(src, tar)
# Run this module's doctests when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_andres_marzo_delta.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._andres_marzo_delta.
Andres & Marzo's Delta correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['AndresMarzoDelta']
class AndresMarzoDelta(_TokenDistance):
    r"""Andres & Marzo's Delta correlation.

    For two sets X and Y and a population N, Andres & Marzo's
    :math:`\Delta` correlation :cite:`Andres:2004` is

        .. math::

            corr_{AndresMarzo_\Delta}(X, Y) = \Delta =
            \frac{|X \cap Y| + |(N \setminus X) \setminus Y| -
            2\sqrt{|X \setminus Y| \cdot |Y \setminus X|}}{|N|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, this is

        .. math::

            corr_{AndresMarzo_\Delta} = \Delta =
            \frac{a+d-2\sqrt{b \cdot c}}{n}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize AndresMarzoDelta instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(AndresMarzoDelta, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta correlation

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.corr('cat', 'hat')
        0.9897959183673469
        >>> cmp.corr('Niall', 'Neil')
        0.9822344346552608
        >>> cmp.corr('aluminum', 'Catalan')
        0.9618259496215341
        >>> cmp.corr('ATCG', 'TAGC')
        0.9744897959183674


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        agreements = self._intersection_card()
        src_surplus = self._src_only_card()
        tar_surplus = self._tar_only_card()
        absences = self._total_complement_card()
        population = self._population_unique_card()

        # a + d - 2*sqrt(b*c), from the confusion-table formula above.
        numerator = (
            agreements + absences - 2 * (src_surplus * tar_surplus) ** 0.5
        )
        # A zero numerator short-circuits the division.
        if numerator == 0.0:
            return 0.0
        return numerator / population

    def sim(self, src: str, tar: str) -> float:
        """Return the Andres & Marzo's Delta similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Andres & Marzo's Delta similarity

        Examples
        --------
        >>> cmp = AndresMarzoDelta()
        >>> cmp.sim('cat', 'hat')
        0.9948979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.9911172173276304
        >>> cmp.sim('aluminum', 'Catalan')
        0.980912974810767
        >>> cmp.sim('ATCG', 'TAGC')
        0.9872448979591837


        .. versionadded:: 0.4.0

        """
        # Rescale the correlation from [-1, 1] onto [0, 1].
        return (self.corr(src, tar) + 1) / 2
# Run this module's doctests when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_average_linkage.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._average_linkage.
Average linkage distance
"""
from typing import Any, Optional, cast
from ._distance import _Distance
from ._levenshtein import Levenshtein
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['AverageLinkage']
class AverageLinkage(_TokenDistance):
    r"""Average linkage distance.

    For two lists of tokens X and Y, average linkage distance
    :cite:`Deza:2016` is

        .. math::

            dist_{AverageLinkage}(X, Y) =
            \frac{\sum_{i \in X} \sum_{j \in Y} dist(X_i, Y_j)}
            {|X| \cdot |Y|}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        metric: Optional[_Distance] = None,
        **kwargs: Any
    ) -> None:
        """Initialize AverageLinkage instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants. (Defaults to Levenshtein distance)
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.


        .. versionadded:: 0.4.0

        """
        super(AverageLinkage, self).__init__(tokenizer=tokenizer, **kwargs)
        # Fall back to Levenshtein when no pairwise metric is supplied.
        if metric is None:
            self._metric = Levenshtein()  # type: _Distance
        else:
            self._metric = metric

    def dist(self, src: str, tar: str) -> float:
        """Return the average linkage distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            average linkage distance

        Examples
        --------
        >>> cmp = AverageLinkage()
        >>> cmp.dist('cat', 'hat')
        0.8125
        >>> cmp.dist('Niall', 'Neil')
        0.8333333333333334
        >>> cmp.dist('aluminum', 'Catalan')
        0.9166666666666666
        >>> cmp.dist('ATCG', 'TAGC')
        0.8


        .. versionadded:: 0.4.0

        """
        # Two empty strings are identical by definition.
        if not src and not tar:
            return 0.0

        src_tokens = self.params['tokenizer'].tokenize(src).get_list()
        tar_tokens = self.params['tokenizer'].tokenize(tar).get_list()

        # If only one side tokenizes to nothing, they are maximally
        # distant.
        if not src_tokens or not tar_tokens:
            return 1.0

        # Mean pairwise distance over the Cartesian product of tokens.
        total = sum(
            self._metric.dist(s_term, t_term)
            for s_term in src_tokens
            for t_term in tar_tokens
        )
        return total / (len(src_tokens) * len(tar_tokens))
# Run this module's doctests when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_azzoo.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._azzoo.
AZZOO similarity
"""
from typing import (
Any,
Counter as TCounter,
Optional,
Sequence,
Set,
Union,
cast,
)
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['AZZOO']
class AZZOO(_TokenDistance):
    r"""AZZOO similarity.

    For two sets X and Y, and alphabet N, and a parameter :math:`\sigma`,
    AZZOO similarity :cite:`Cha:2006` is

        .. math::

            sim_{AZZOO_{\sigma}}(X, Y) =
            \sum{s_i}

    where :math:`s_i = 1` if :math:`X_i = Y_i = 1`,
    :math:`s_i = \sigma` if :math:`X_i = Y_i = 0`,
    and :math:`s_i = 0` otherwise.

    In :ref:`2x2 confusion table terms <confusion_table>`, where
    a+b+c+d=n, this is

        .. math::

            sim_{AZZOO} = a + \sigma \cdot d

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        sigma: float = 0.5,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize AZZOO instance.

        Parameters
        ----------
        sigma : float
            Sigma designates the contribution to similarity given by the
            0-0 samples in the set.
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description
            in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(AZZOO, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )
        self.set_params(sigma=sigma)

    def sim_score(self, src: str, tar: str) -> float:
        """Return the AZZOO similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            AZZOO similarity

        Examples
        --------
        >>> cmp = AZZOO()
        >>> cmp.sim_score('cat', 'hat')
        391.0
        >>> cmp.sim_score('Niall', 'Neil')
        389.5
        >>> cmp.sim_score('aluminum', 'Catalan')
        385.5
        >>> cmp.sim_score('ATCG', 'TAGC')
        387.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # a + sigma * d, per the confusion-table formula above.
        matches = self._intersection_card()
        absences = self._total_complement_card()
        return cast(float, matches + self.params['sigma'] * absences)

    def sim(self, src: str, tar: str) -> float:
        """Return the AZZOO similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            AZZOO similarity

        Examples
        --------
        >>> cmp = AZZOO()
        >>> cmp.sim('cat', 'hat')
        0.9923857868020305
        >>> cmp.sim('Niall', 'Neil')
        0.9860759493670886
        >>> cmp.sim('aluminum', 'Catalan')
        0.9710327455919395
        >>> cmp.sim('ATCG', 'TAGC')
        0.9809885931558935


        .. versionadded:: 0.4.0

        """
        # Normalize by the larger of the two self-similarity scores.
        denom = max(self.sim_score(src, src), self.sim_score(tar, tar))
        if denom == 0.0:
            return 1.0
        return self.sim_score(src, tar) / denom
# Run this module's doctests when executed as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_bag.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._bag.
Bag similarity & distance
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import CharacterTokenizer, _Tokenizer
__all__ = ['Bag']
class Bag(_TokenDistance):
"""Bag distance.
Bag distance is proposed in :cite:`Bartolini:2002`. It is defined as
.. math::
dist_{bag}(src, tar) =
max(|multiset(src)-multiset(tar)|, |multiset(tar)-multiset(src)|)
.. versionadded:: 0.3.6
"""
def __init__(
self,
tokenizer: Optional[_Tokenizer] = None,
intersection_type: str = 'crisp',
**kwargs: Any
) -> None:
"""Initialize Bag instance.
Parameters
----------
tokenizer : _Tokenizer
A tokenizer instance from the :py:mod:`abydos.tokenizer` package
intersection_type : str
Specifies the intersection type, and set type as a result:
See :ref:`intersection_type ` description in
:py:class:`_TokenDistance` for details.
**kwargs
Arbitrary keyword arguments
Other Parameters
----------------
qval : int
The length of each q-gram. Using this parameter and tokenizer=None
will cause the instance to use the QGram tokenizer with this
q value.
metric : _Distance
A string distance measure class for use in the ``soft`` and
``fuzzy`` variants.
threshold : float
A threshold value, similarities above which are counted as
members of the intersection for the ``fuzzy`` variant.
.. versionadded:: 0.4.0
"""
if tokenizer is None:
tokenizer = CharacterTokenizer()
super(Bag, self).__init__(
tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
)
def dist_abs(self, src: str, tar: str, normalized: bool = False) -> float:
"""Return the bag distance between two strings.
Parameters
----------
src : str
Source string for comparison
tar : str
Target string for comparison
normalized : bool
Normalizes to [0, 1] if True
Returns
-------
int or float
Bag distance
Examples
--------
>>> cmp = Bag()
>>> cmp.dist_abs('cat', 'hat')
1
>>> cmp.dist_abs('Niall', 'Neil')
2
>>> cmp.dist_abs('aluminum', 'Catalan')
5
>>> cmp.dist_abs('ATCG', 'TAGC')
0
>>> cmp.dist_abs('abcdefg', 'hijklm')
7
>>> cmp.dist_abs('abcdefg', 'hijklmno')
8
.. versionadded:: 0.1.0
.. versionchanged:: 0.3.6
Encapsulated in class
"""
if tar == src:
return 0
elif not src:
return len(tar)
elif not tar:
return len(src)
self._tokenize(src, tar)
dist = max(self._src_only_card(), self._tar_only_card())
if normalized:
dist /= max(self._src_card(), self._tar_card())
return dist
def dist(self, src: str, tar: str) -> float:
    """Return the normalized bag distance between two strings.

    Bag distance is normalized by dividing by :math:`max( |src|, |tar| )`.

    Parameters
    ----------
    src : str
        Source string for comparison
    tar : str
        Target string for comparison

    Returns
    -------
    float
        Normalized bag distance

    Examples
    --------
    >>> cmp = Bag()
    >>> cmp.dist('cat', 'hat')
    0.3333333333333333
    >>> cmp.dist('Niall', 'Neil')
    0.4
    >>> cmp.dist('aluminum', 'Catalan')
    0.625
    >>> cmp.dist('ATCG', 'TAGC')
    0.0

    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class
    """
    # Identical strings are at distance 0; if exactly one side is empty
    # the distance is maximal.
    if src == tar:
        return 0.0
    if not (src and tar):
        return 1.0
    return self.dist_abs(src, tar, normalized=True)
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_baroni_urbani_buser_i.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baroni_urbani_buser_i.
Baroni-Urbani & Buser I similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaroniUrbaniBuserI']
class BaroniUrbaniBuserI(_TokenDistance):
    r"""Baroni-Urbani & Buser I similarity.

    Given two sets X and Y and a population N, the Baroni-Urbani & Buser I
    similarity :cite:`BaroniUrbani:1976` is defined as

        .. math::

            sim_{BaroniUrbaniBuserI}(X, Y) =
            \frac{\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} +
            |X \cap Y|}
            {\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} +
            |X \cap Y| + |X \setminus Y| + |Y \setminus X|}

    Of the two similarities proposed by Baroni-Urbani & Buser, this is the
    second, but it is the more commonly used and referenced one.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{BaroniUrbaniBuserI} =
            \frac{\sqrt{ad}+a}{\sqrt{ad}+a+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaroniUrbaniBuserI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Baroni-Urbani & Buser I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baroni-Urbani & Buser I similarity

        Examples
        --------
        >>> cmp = BaroniUrbaniBuserI()
        >>> cmp.sim('cat', 'hat')
        0.9119837740878104
        >>> cmp.sim('Niall', 'Neil')
        0.8552823175014205
        >>> cmp.sim('aluminum', 'Catalan')
        0.656992712054851
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # Identical strings are maximally similar.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        intersect = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        # sqrt(a*d) appears in numerator and denominator; compute it once.
        root = (intersect * complement) ** 0.5
        return (root + intersect) / (root + intersect + src_only + tar_only)
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_baroni_urbani_buser_ii.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baroni_urbani_buser_ii.
Baroni-Urbani & Buser II correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaroniUrbaniBuserII']
class BaroniUrbaniBuserII(_TokenDistance):
    r"""Baroni-Urbani & Buser II correlation.

    Given two sets X and Y and a population N, the Baroni-Urbani & Buser II
    correlation :cite:`BaroniUrbani:1976` is defined as

        .. math::

            corr_{BaroniUrbaniBuserII}(X, Y) =
            \frac{\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} +
            |X \cap Y| - |X \setminus Y| - |Y \setminus X|}
            {\sqrt{|X \cap Y| \cdot |(N \setminus X) \setminus Y|} +
            |X \cap Y| + |X \setminus Y| + |Y \setminus X|}

    Of the two similarities proposed by Baroni-Urbani & Buser, this is the
    first, but it is the less commonly used and referenced one.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            corr_{BaroniUrbaniBuserII} =
            \frac{\sqrt{ad}+a-b-c}{\sqrt{ad}+a+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaroniUrbaniBuserII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Baroni-Urbani & Buser II correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baroni-Urbani & Buser II correlation

        Examples
        --------
        >>> cmp = BaroniUrbaniBuserII()
        >>> cmp.corr('cat', 'hat')
        0.8239675481756209
        >>> cmp.corr('Niall', 'Neil')
        0.7105646350028408
        >>> cmp.corr('aluminum', 'Catalan')
        0.31398542410970204
        >>> cmp.corr('ATCG', 'TAGC')
        -1.0

        .. versionadded:: 0.4.0
        """
        # Identical strings are maximally correlated.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        intersect = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        # sqrt(a*d) appears in numerator and denominator; compute it once.
        root = (intersect * complement) ** 0.5
        return (root + intersect - src_only - tar_only) / (
            root + intersect + src_only + tar_only
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Baroni-Urbani & Buser II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baroni-Urbani & Buser II similarity

        Examples
        --------
        >>> cmp = BaroniUrbaniBuserII()
        >>> cmp.sim('cat', 'hat')
        0.9119837740878105
        >>> cmp.sim('Niall', 'Neil')
        0.8552823175014204
        >>> cmp.sim('aluminum', 'Catalan')
        0.656992712054851
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # Map the correlation's [-1, 1] range onto [0, 1].
        return (self.corr(src, tar) + 1) / 2
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_batagelj_bren.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._batagelj_bren.
Batagelj & Bren distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BatageljBren']
class BatageljBren(_TokenDistance):
    r"""Batagelj & Bren distance.

    Given two sets X and Y and a population N, the Batagelj & Bren
    distance :cite:`Batagelj:1995`, Batagelj & Bren's :math:`Q_0`, is

        .. math::

            dist_{BatageljBren}(X, Y) =
            \frac{|X \setminus Y| \cdot |Y \setminus X|}
            {|X \cap Y| \cdot |(N \setminus X) \setminus Y|}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            dist_{BatageljBren} =
            \frac{bc}{ad}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BatageljBren instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Batagelj & Bren distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Batagelj & Bren distance

        Examples
        --------
        >>> cmp = BatageljBren()
        >>> cmp.dist_abs('cat', 'hat')
        0.002570694087403599
        >>> cmp.dist_abs('Niall', 'Neil')
        0.007741935483870968
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.07282184655396619
        >>> cmp.dist_abs('ATCG', 'TAGC')
        inf

        .. versionadded:: 0.4.0
        """
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        intersect = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        # The measure is undefined (infinite) when the denominator a*d
        # vanishes, i.e. when a == 0 or d == 0.
        if intersect == 0 or complement == 0:
            return float('inf')
        return src_only * tar_only / (intersect * complement)

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Batagelj & Bren distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Batagelj & Bren distance

        Examples
        --------
        >>> cmp = BatageljBren()
        >>> cmp.dist('cat', 'hat')
        3.2789465400556106e-06
        >>> cmp.dist('Niall', 'Neil')
        9.874917709019092e-06
        >>> cmp.dist('aluminum', 'Catalan')
        9.276668350823718e-05
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.0
        """
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        intersect = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        complement = self._total_complement_card()

        # An infinite raw distance normalizes to the maximum, 1.0.
        if intersect == 0 or complement == 0:
            return 1.0
        # Scale bc/(ad) down by the population size a+b+c+d.
        return (src_only * tar_only / (intersect * complement)) / (
            intersect + src_only + tar_only + complement
        )
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_i.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baulieu_i.
Baulieu I distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuI']
class BaulieuI(_TokenDistance):
    r"""Baulieu I distance.

    Given two sets X and Y and a population N, Baulieu I distance
    :cite:`Baulieu:1989` is defined as

        .. math::

            sim_{BaulieuI}(X, Y) =
            \frac{|X| \cdot |Y| - |X \cap Y|^2}{|X| \cdot |Y|}

    This is Baulieu's 12th dissimilarity coefficient.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{BaulieuI} =
            \frac{(a+b)(a+c)-a^2}{(a+b)(a+c)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu I distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu I distance

        Examples
        --------
        >>> cmp = BaulieuI()
        >>> cmp.dist('cat', 'hat')
        0.75
        >>> cmp.dist('Niall', 'Neil')
        0.8666666666666667
        >>> cmp.dist('aluminum', 'Catalan')
        0.9861111111111112
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        shared = self._intersection_card()
        src_card = self._src_card()
        tar_card = self._tar_card()

        # (a+b)(a+c) - a^2 over (a+b)(a+c); a zero numerator also covers
        # the case of a zero denominator, since a <= min(|X|, |Y|).
        marginal_product = src_card * tar_card
        numerator = marginal_product - shared * shared
        if numerator == 0:
            return 0.0
        return numerator / marginal_product
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_ii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baulieu_ii.
Baulieu II similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuII']
class BaulieuII(_TokenDistance):
    r"""Baulieu II similarity.

    Given two sets X and Y and a population N, Baulieu II similarity
    :cite:`Baulieu:1989` is defined as

        .. math::

            sim_{BaulieuII}(X, Y) =
            \frac{|X \cap Y|^2 \cdot |(N \setminus X) \setminus Y|^2}
            {|X| \cdot |Y| \cdot |N \setminus X| \cdot |N \setminus Y|}

    This is based on Baulieu's 13th dissimilarity coefficient.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{BaulieuII} =
            \frac{a^2d^2}{(a+b)(a+c)(b+d)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Baulieu II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu II similarity

        Examples
        --------
        >>> cmp = BaulieuII()
        >>> cmp.sim('cat', 'hat')
        0.24871959237343852
        >>> cmp.sim('Niall', 'Neil')
        0.13213719608444902
        >>> cmp.sim('aluminum', 'Catalan')
        0.013621892326789235
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # (a*d)^2 == a^2 * d^2 exactly for integer cardinalities.  A zero
        # numerator (a == 0 or d == 0) also guards against a zero
        # denominator, so it short-circuits to 0.0.
        numerator = (a * d) ** 2
        if numerator == 0:
            return 0.0
        return numerator / ((a + b) * (a + c) * (b + d) * (c + d))
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_iii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baulieu_iii.
Baulieu III distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuIII']
class BaulieuIII(_TokenDistance):
    r"""Baulieu III distance.

    Given two sets X and Y and a population N, Baulieu III distance
    :cite:`Baulieu:1989` is defined as

        .. math::

            sim_{BaulieuIII}(X, Y) =
            \frac{|N|^2 - 4(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}{2 \cdot |N|^2}

    This is based on Baulieu's 20th dissimilarity coefficient.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{BaulieuIII} =
            \frac{n^2 - 4(ad-bc)}{2n^2}

    Notes
    -----
    It should be noted that this is *based on* Baulieu's 20th dissimilarity
    coefficient. This distance is exactly half Baulieu's 20th dissimilarity.
    According to :cite:`Baulieu:1989`, the 20th dissimilarity should be a
    value in the range [0.0, 1.0], meeting the article's (P1) property, but
    the formula given ranges [0.0, 2.0], so dividing by 2 corrects the
    formula to meet the article's expectations.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuIII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu III distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu III distance

        Examples
        --------
        >>> cmp = BaulieuIII()
        >>> cmp.dist('cat', 'hat')
        0.4949500208246564
        >>> cmp.dist('Niall', 'Neil')
        0.4949955747605165
        >>> cmp.dist('aluminum', 'Catalan')
        0.49768591017891195
        >>> cmp.dist('ATCG', 'TAGC')
        0.5000813463140358

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        # n^2 - 4(ad - bc), halved relative to Baulieu's published formula
        # (see the class Notes); a zero numerator short-circuits to 0.0.
        numerator = n * n - 4 * (a * d - b * c)
        return 0.0 if numerator == 0 else numerator / (2 * n * n)
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_iv.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baulieu_iv.
Baulieu IV distance
"""
from math import e
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuIV']
class BaulieuIV(_TokenDistance):
    r"""Baulieu IV distance.

    Given two sets X and Y, a population N, and a positive irrational
    number k, Baulieu IV distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuIV}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| -
            (|X \cap Y| + \frac{1}{2}) \cdot (|(N \setminus X) \setminus Y| +
            \frac{1}{2}) \cdot |(N \setminus X) \setminus Y| \cdot k}{|N|}

    This is Baulieu's 22nd dissimilarity coefficient.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuIV} = \frac{b+c-(a+\frac{1}{2})(d+\frac{1}{2})dk}{n}

    Notes
    -----
    The default value of k is Euler's number :math:`e`, but other
    irrationals such as :math:`\pi` or :math:`\sqrt{2}` could be
    substituted at initialization.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        positive_irrational: float = e,
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuIV instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        positive_irrational : float
            The positive irrational constant k in the formula (defaults to
            Euler's number :math:`e`).
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )
        self._positive_irrational = positive_irrational

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Baulieu IV distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu IV distance

        Examples
        --------
        >>> cmp = BaulieuIV()
        >>> cmp.dist_abs('cat', 'hat')
        -5249.96272285802
        >>> cmp.dist_abs('Niall', 'Neil')
        -5209.561726488335
        >>> cmp.dist_abs('aluminum', 'Catalan')
        -3073.6070822721244
        >>> cmp.dist_abs('ATCG', 'TAGC')
        -1039.2151656463932

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()
        k = self._positive_irrational

        # b + c - (a + 1/2)(d + 1/2)dk, divided by the population size.
        numerator = (b + c) - (a + 0.5) * (d + 0.5) * d * k
        return 0.0 if numerator == 0.0 else numerator / n

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Baulieu IV distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Baulieu IV distance

        Examples
        --------
        >>> cmp = BaulieuIV()
        >>> cmp.dist('cat', 'hat')
        0.49999799606535283
        >>> cmp.dist('Niall', 'Neil')
        0.49999801148659684
        >>> cmp.dist('aluminum', 'Catalan')
        0.49999883126809364
        >>> cmp.dist('ATCG', 'TAGC')
        0.4999996033268451

        .. versionadded:: 0.4.0
        """
        raw = self.dist_abs(src, tar)
        n_cubed = self._population_unique_card() ** 3
        k = self._positive_irrational

        # Shift by n^3 * k and rescale so the result lands in [0, 1].
        shifted = raw + n_cubed * k
        if shifted == 0.0:
            return 0.0
        return shifted / (2 * n_cubed * k)
# Run this module's doctest examples when invoked as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_ix.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baulieu_ix.
Baulieu IX distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuIX']
class BaulieuIX(_TokenDistance):
    r"""Baulieu IX distance.

    Given two sets X and Y drawn from a population N, the Baulieu IX
    distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuIX}(X, Y) = \frac{|X \setminus Y| + 2 \cdot
            |Y \setminus X|}{|N| + |Y \setminus X|}

    This is the 27th of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P7) property, that :math:`D(a,b,c,d) = D(a,c,b,d)`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuIX} = \frac{b+2c}{a+b+2c+d}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuIX instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu IX distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu IX distance

        Examples
        --------
        >>> cmp = BaulieuIX()
        >>> cmp.dist('cat', 'hat')
        0.007633587786259542
        >>> cmp.dist('Niall', 'Neil')
        0.012706480304955527
        >>> cmp.dist('aluminum', 'Catalan')
        0.027777777777777776
        >>> cmp.dist('ATCG', 'TAGC')
        0.019011406844106463


        .. versionadded:: 0.4.0

        """
        # Identical strings are trivially at distance 0.
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        population = self._population_unique_card()

        # (b + 2c) / (c + n) in confusion-table terms
        return (src_only + 2 * tar_only) / (population + tar_only)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_v.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_v.
Baulieu V distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuV']
class BaulieuV(_TokenDistance):
    r"""Baulieu V distance.

    Given two sets X and Y drawn from a population N, the Baulieu V
    distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuV}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| +
            1}{|X \cap Y| + |X \setminus Y| + |Y \setminus X| + 1}

    This is the 23rd of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P2) property, that :math:`D(a,0,0,0) = 0`. Rather,
    :math:`D(a,0,0,0) > 0`, but
    :math:`\lim_{a \to \infty} D(a,0,0,0) = 0`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuV} = \frac{b+c+1}{a+b+c+1}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuV instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu V distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu V distance

        Examples
        --------
        >>> cmp = BaulieuV()
        >>> cmp.dist('cat', 'hat')
        0.7142857142857143
        >>> cmp.dist('Niall', 'Neil')
        0.8
        >>> cmp.dist('aluminum', 'Catalan')
        0.9411764705882353
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        sym_diff = self._src_only_card() + self._tar_only_card()

        # (b + c + 1) / (a + b + c + 1) in confusion-table terms; the +1
        # terms keep the denominator nonzero for empty inputs.
        return (sym_diff + 1) / (self._intersection_card() + sym_diff + 1)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_vi.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_vi.
Baulieu VI distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuVI']
class BaulieuVI(_TokenDistance):
    r"""Baulieu VI distance.

    Given two sets X and Y drawn from a population N, the Baulieu VI
    distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuVI}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|}
            {|X \cap Y| + |X \setminus Y| + |Y \setminus X| + 1}

    This is the 24th of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P3) property, that :math:`D(a,b,c,d) = 1` for some (a,b,c,d).
    Rather, :math:`D(a,b,c,d) < 1`, but
    :math:`\lim_{b \to \infty, c \to \infty} D(a,b,c,d) = 0` for
    :math:`a = 0`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuVI} = \frac{b+c}{a+b+c+1}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuVI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu VI distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu VI distance

        Examples
        --------
        >>> cmp = BaulieuVI()
        >>> cmp.dist('cat', 'hat')
        0.5714285714285714
        >>> cmp.dist('Niall', 'Neil')
        0.7
        >>> cmp.dist('aluminum', 'Catalan')
        0.8823529411764706
        >>> cmp.dist('ATCG', 'TAGC')
        0.9090909090909091


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        sym_diff = self._src_only_card() + self._tar_only_card()

        # (b + c) / (a + b + c + 1) in confusion-table terms; the +1 keeps
        # the denominator nonzero for empty inputs.
        return sym_diff / (self._intersection_card() + sym_diff + 1)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_vii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_vii.
Baulieu VII distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuVII']
class BaulieuVII(_TokenDistance):
    r"""Baulieu VII distance.

    Given two sets X and Y drawn from a population N, the Baulieu VII
    distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuVII}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|}
            {|N| + |X \cap Y| \cdot (|X \cap Y| - 4)^2}

    This is the 25th of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P4) property, that :math:`D(a+1,b,c,d) \leq D(a,b,c,d) = 0`
    with equality holding iff :math:`D(a,b,c,d) = 0`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuVII} = \frac{b+c}{n + a \cdot (a-4)^2}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuVII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu VII distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu VII distance

        Examples
        --------
        >>> cmp = BaulieuVII()
        >>> cmp.dist('cat', 'hat')
        0.005050505050505051
        >>> cmp.dist('Niall', 'Neil')
        0.008838383838383838
        >>> cmp.dist('aluminum', 'Catalan')
        0.018891687657430732
        >>> cmp.dist('ATCG', 'TAGC')
        0.012755102040816327


        .. versionadded:: 0.4.0

        """
        # Identical strings are trivially at distance 0.
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        common = self._intersection_card()
        sym_diff = self._src_only_card() + self._tar_only_card()
        population = self._population_unique_card()

        # (b + c) / (n + a * (a - 4)^2) in confusion-table terms
        return sym_diff / (population + common * (common - 4) ** 2)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_viii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_viii.
Baulieu VIII distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuVIII']
class BaulieuVIII(_TokenDistance):
    r"""Baulieu VIII distance.

    Given two sets X and Y drawn from a population N, the Baulieu VIII
    distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuVIII}(X, Y) = \frac{(|X \setminus Y| -
            |Y \setminus X|)^2}{|N|^2}

    This is the 26th of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P5) property, that :math:`D(a,b+1,c,d) \geq D(a,b,c,d)`,
    with equality holding if :math:`D(a,b,c,d) = 1`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuVIII} = \frac{(b-c)^2}{n^2}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuVIII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu VIII distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu VIII distance

        Examples
        --------
        >>> cmp = BaulieuVIII()
        >>> cmp.dist('cat', 'hat')
        0.0
        >>> cmp.dist('Niall', 'Neil')
        1.6269262807163682e-06
        >>> cmp.dist('aluminum', 'Catalan')
        1.6227838857560144e-06
        >>> cmp.dist('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        delta = self._src_only_card() - self._tar_only_card()
        population = self._population_unique_card()

        # A zero difference short-circuits (and avoids 0/0 when the
        # population is empty).
        if delta == 0.0:
            return 0.0

        # (b - c)^2 / n^2 in confusion-table terms
        return delta ** 2 / population ** 2
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_x.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_x.
Baulieu X distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuX']
class BaulieuX(_TokenDistance):
    r"""Baulieu X distance.

    Given two sets X and Y drawn from a population N, the Baulieu X
    distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuX}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| +
            max(|X \setminus Y|, |Y \setminus X|)}{|N| +
            max(|X \setminus Y|, |Y \setminus X|)}

    This is the 28th of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P8) property, that :math:`D` is a rational function whose
    numerator and denominator are both (total) linear.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuX} = \frac{b+c+max(b,c)}{n+max(b,c)}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuX instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu X distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu X distance

        Examples
        --------
        >>> cmp = BaulieuX()
        >>> cmp.dist('cat', 'hat')
        0.007633587786259542
        >>> cmp.dist('Niall', 'Neil')
        0.013959390862944163
        >>> cmp.dist('aluminum', 'Catalan')
        0.029003783102143757
        >>> cmp.dist('ATCG', 'TAGC')
        0.019011406844106463


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        population = self._population_unique_card()

        larger = max(src_only, tar_only)
        numerator = src_only + tar_only + larger

        # A zero numerator short-circuits (and avoids 0/0 when the
        # population is empty).
        if numerator == 0.0:
            return 0.0
        return numerator / (population + larger)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_xi.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_xi.
Baulieu XI distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuXI']
class BaulieuXI(_TokenDistance):
    r"""Baulieu XI distance.

    Given two sets X and Y drawn from a population N, the Baulieu XI
    distance :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuXI}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|}
            {|X \setminus Y| + |Y \setminus X| +
            |(N \setminus X) \setminus Y|}

    This is the 29th of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P4) property, that :math:`D(a+1,b,c,d) \leq D(a,b,c,d) = 0`
    with equality holding iff :math:`D(a,b,c,d) = 0`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuXI} = \frac{b+c}{b+c+d}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuXI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu XI distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu XI distance

        Examples
        --------
        >>> cmp = BaulieuXI()
        >>> cmp.dist('cat', 'hat')
        0.005115089514066497
        >>> cmp.dist('Niall', 'Neil')
        0.008951406649616368
        >>> cmp.dist('aluminum', 'Catalan')
        0.01913265306122449
        >>> cmp.dist('ATCG', 'TAGC')
        0.012755102040816327


        .. versionadded:: 0.4.0

        """
        # Identical strings are trivially at distance 0.
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        sym_diff = self._src_only_card() + self._tar_only_card()
        complement = self._total_complement_card()

        # An empty symmetric difference short-circuits (and avoids 0/0).
        if not sym_diff:
            return 0.0
        # (b + c) / (b + c + d) in confusion-table terms
        return sym_diff / (sym_diff + complement)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_xii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_xii.
Baulieu XII distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuXII']
class BaulieuXII(_TokenDistance):
    r"""Baulieu XII distance.

    Given two sets X and Y and a population N, the Baulieu XII distance
    :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuXII}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|}
            {|X \cap Y| + |X \setminus Y| + |Y \setminus X| - 1}

    This is the 30th of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P5) property, that :math:`D(a,b+1,c,d) \geq D(a,b,c,d)`,
    with equality holding if :math:`D(a,b,c,d) = 1`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuXII} = \frac{b+c}{a+b+c-1}

    Notes
    -----
    In the special case of comparisons where the intersection (a) contains 0
    members, the size of the intersection is set to 1, resulting in a
    distance of 1.0. This prevents the distance from exceeding 1.0 and
    similarity from becoming negative.


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuXII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu XII distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu XII distance

        Examples
        --------
        >>> cmp = BaulieuXII()
        >>> cmp.dist('cat', 'hat')
        0.8
        >>> cmp.dist('Niall', 'Neil')
        0.875
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        sym_diff = self._src_only_card() + self._tar_only_card()

        # An empty symmetric difference short-circuits (and avoids 0/0).
        if sym_diff == 0.0:
            return 0.0

        # Clamp the intersection to at least 1 so the distance cannot
        # exceed 1.0 (see the class Notes).
        common = max(1.0, self._intersection_card())
        return sym_diff / (common + sym_diff - 1)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_xiii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._baulieu_xiii.
Baulieu XIII distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuXIII']
class BaulieuXIII(_TokenDistance):
    r"""Baulieu XIII distance.

    Given two sets X and Y and a population N, the Baulieu XIII distance
    :cite:`Baulieu:1997` is defined as

        .. math::

            dist_{BaulieuXIII}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X|}
            {|X \cap Y| + |X \setminus Y| + |Y \setminus X| + |X \cap Y|
            \cdot (|X \cap Y| - 4)^2}

    This is the 31st of Baulieu's dissimilarity coefficients. It fails
    Baulieu's (P4) property, that :math:`D(a+1,b,c,d) \leq D(a,b,c,d) = 0`
    with equality holding iff :math:`D(a,b,c,d) = 0`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuXIII} = \frac{b+c}{a+b+c+a \cdot (a-4)^2}


    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuXIII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the
            :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result; see
            the :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu XIII distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu XIII distance

        Examples
        --------
        >>> cmp = BaulieuXIII()
        >>> cmp.dist('cat', 'hat')
        0.2857142857142857
        >>> cmp.dist('Niall', 'Neil')
        0.4117647058823529
        >>> cmp.dist('aluminum', 'Catalan')
        0.6
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        common = self._intersection_card()
        sym_diff = self._src_only_card() + self._tar_only_card()

        # An empty symmetric difference short-circuits (and avoids 0/0).
        if sym_diff == 0.0:
            return 0.0
        # (b + c) / (a + b + c + a * (a - 4)^2) in confusion-table terms
        return sym_diff / (common + sym_diff + common * (common - 4) ** 2)
if __name__ == '__main__':
    # Run this module's doctests when invoked as a script.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_xiv.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baulieu_xiv.
Baulieu XIV distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuXIV']
class BaulieuXIV(_TokenDistance):
    r"""Baulieu XIV distance.

    Given two sets X and Y drawn from a population N, the Baulieu XIV
    distance :cite:`Baulieu:1997` is

        .. math::

            dist_{BaulieuXIV}(X, Y) = \frac{|X \setminus Y| + 2 \cdot
            |Y \setminus X|}{|X \cap Y| + |X \setminus Y| + 2 \cdot
            |Y \setminus X|}

    This is Baulieu's 32nd dissimilarity coefficient. Because the
    target-only term is doubled, the measure is asymmetric and fails
    Baulieu's (P7) property, that :math:`D(a,b,c,d) = D(a,c,b,d)`.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuXIV} = \frac{b+2c}{a+b+2c}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuXIV instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the ``alphabet``
            description in :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, consequently, set type); see the
            ``intersection_type`` description in :py:class:`_TokenDistance`
            for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The q-gram length. Supplying this while leaving tokenizer=None
            makes the instance use a QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class, used by the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            The similarity threshold above which tokens are counted as
            members of the intersection in the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu XIV distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu XIV distance

        Examples
        --------
        >>> cmp = BaulieuXIV()
        >>> cmp.dist('cat', 'hat')
        0.75
        >>> cmp.dist('Niall', 'Neil')
        0.8333333333333334
        >>> cmp.dist('aluminum', 'Catalan')
        0.9565217391304348
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.0

        """
        if src == tar:
            # Identical strings are trivially at distance 0; this also
            # sidesteps a 0/0 case when both token multisets are empty.
            return 0.0
        self._tokenize(src, tar)
        common = self._intersection_card()
        # b + 2c: source-only tokens plus doubly-weighted target-only tokens.
        weighted_diff = self._src_only_card() + 2 * self._tar_only_card()
        return weighted_diff / (common + weighted_diff)
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baulieu_xv.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baulieu_xv.
Baulieu XV distance
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BaulieuXV']
class BaulieuXV(_TokenDistance):
    r"""Baulieu XV distance.

    Given two sets X and Y drawn from a population N, the Baulieu XV
    distance :cite:`Baulieu:1997` is

        .. math::

            dist_{BaulieuXV}(X, Y) = \frac{|X \setminus Y| + |Y \setminus X| +
            max(|X \setminus Y|, |Y \setminus X|)}{|X \cap Y| + |X \setminus Y|
            + |Y \setminus X| + max(|X \setminus Y|, |Y \setminus X|)}

    This is Baulieu's 33rd dissimilarity coefficient. It fails Baulieu's
    (P8) property, that :math:`D` is a rational function whose numerator
    and denominator are both (total) linear, because of the max() term.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            dist_{BaulieuXV} = \frac{b+c+max(b, c)}{a+b+c+max(b, c)}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BaulieuXV instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the ``alphabet``
            description in :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, consequently, set type); see the
            ``intersection_type`` description in :py:class:`_TokenDistance`
            for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The q-gram length. Supplying this while leaving tokenizer=None
            makes the instance use a QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class, used by the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            The similarity threshold above which tokens are counted as
            members of the intersection in the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Baulieu XV distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Baulieu XV distance

        Examples
        --------
        >>> cmp = BaulieuXV()
        >>> cmp.dist('cat', 'hat')
        0.75
        >>> cmp.dist('Niall', 'Neil')
        0.8461538461538461
        >>> cmp.dist('aluminum', 'Catalan')
        0.9583333333333334
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.0

        """
        if src == tar:
            # Identical strings are trivially at distance 0; this also
            # sidesteps a 0/0 case when both token multisets are empty.
            return 0.0
        self._tokenize(src, tar)
        common = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        # The larger one-sided difference is counted twice.
        larger = max(src_only, tar_only)
        return (src_only + tar_only + larger) / (
            common + src_only + tar_only + larger
        )
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_baystat.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._baystat.
Baystat similarity.
"""
from typing import Any, Optional
from ._distance import _Distance
__all__ = ['Baystat']
class Baystat(_Distance):
    """Baystat similarity and distance.

    Good results for shorter words are reported when setting min_ss_len to 1
    and either left_ext OR right_ext to 1.

    The Baystat similarity is defined in :cite:`Furnohr:2002`.

    This is ostensibly a port of the R module PPRL's implementation:
    https://github.com/cran/PPRL/blob/master/src/MTB_Baystat.cpp
    :cite:`Rukasz:2018`. As such, this could be made more pythonic.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        min_ss_len: Optional[int] = None,
        left_ext: Optional[int] = None,
        right_ext: Optional[int] = None,
        **kwargs: Any
    ) -> None:
        """Initialize Baystat instance.

        Parameters
        ----------
        min_ss_len : int
            Minimum substring length to be considered
        left_ext : int
            Left-side extension length
        right_ext : int
            Right-side extension length
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0
        """
        super(Baystat, self).__init__(**kwargs)
        # NOTE: if any of these is None/0, sim() derives all three
        # automatically from the input lengths (see sim below).
        self._min_ss_len = min_ss_len
        self._left_ext = left_ext
        self._right_ext = right_ext

    def sim(self, src: str, tar: str) -> float:
        """Return the Baystat similarity.

        The similarity is the total length of matched substrings of ``src``
        found within a sliding search window of ``tar``, normalized by the
        longer string's length.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The Baystat similarity

        Examples
        --------
        >>> cmp = Baystat()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.666666666667
        >>> cmp.sim('Niall', 'Neil')
        0.4
        >>> round(cmp.sim('Colin', 'Cuilen'), 12)
        0.166666666667
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0
        max_len = max(len(src), len(tar))
        if not (self._min_ss_len and self._left_ext and self._right_ext):
            # These can be set via arguments to the function. Otherwise they
            # are set automatically based on values from the article.
            if max_len >= 7:
                min_ss_len = 2
                left_ext = 2
                right_ext = 2
            else:
                # The paper suggests that for short names, (exclusively) one or
                # the other of left_ext and right_ext can be 1, with good
                # results. I use 0 & 0 as the default in this case.
                min_ss_len = 1
                left_ext = 0
                right_ext = 0
        else:
            min_ss_len = self._min_ss_len
            left_ext = self._left_ext
            right_ext = self._right_ext
        # pos scans src; match_len accumulates the lengths of matched
        # substrings found in tar's search window.
        pos = 0
        match_len = 0
        while True:
            # Fewer than min_ss_len chars of src remain: done.
            if pos + min_ss_len > len(src):
                return match_len / max_len
            hit_len = 0
            ix = 1
            substring = src[pos : pos + min_ss_len]
            # The search window in tar spans left_ext chars to the left and
            # right_ext chars to the right of the current src position,
            # clamped to tar's bounds.
            search_begin = pos - left_ext
            if search_begin < 0:
                search_begin = 0
                left_ext_len = pos
            else:
                left_ext_len = left_ext
            if pos + min_ss_len + right_ext >= len(tar):
                right_ext_len = len(tar) - pos - min_ss_len
            else:
                right_ext_len = right_ext
            if (
                search_begin + left_ext_len + min_ss_len + right_ext_len
                > search_begin
            ):
                search_val = tar[
                    search_begin : (
                        search_begin
                        + left_ext_len
                        + min_ss_len
                        + right_ext_len
                    )
                ]
            else:
                search_val = ''
            flagged_tar = ''
            # Greedily extend the matched substring one char at a time while
            # it continues to appear in the (growing) search window.
            while substring in search_val and pos + ix <= len(src):
                hit_len = len(substring)
                # '#' padding marks matched chars so they can't match again.
                flagged_tar = tar.replace(substring, '#' * hit_len)
                if pos + min_ss_len + ix <= len(src):
                    substring = src[pos : pos + min_ss_len + ix]
                if pos + min_ss_len + right_ext_len + 1 <= len(tar):
                    right_ext_len += 1
                # The following is unnecessary, I think
                # if (search_begin + left_ext_len + min_ss_len + right_ext_len
                # <= len(tar)):
                search_val = tar[
                    search_begin : (
                        search_begin
                        + left_ext_len
                        + min_ss_len
                        + right_ext_len
                    )
                ]
                ix += 1
            if hit_len > 0:
                # Commit the flagged copy so matched chars are consumed.
                tar = flagged_tar
                match_len += hit_len
            pos += ix
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_benini_i.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._benini_i.
Benini I correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BeniniI']
class BeniniI(_TokenDistance):
    r"""BeniniI correlation.

    Given two sets X and Y drawn from a population N, Benini I correlation,
    Benini's Index of Attraction, :cite:`Benini:1901` is

        .. math::

            corr_{BeniniI}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}{|Y| \cdot |N \setminus X|}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            corr_{BeniniI} = \frac{ad-bc}{(a+c)(c+d)}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BeniniI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the ``alphabet``
            description in :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, consequently, set type); see the
            ``intersection_type`` description in :py:class:`_TokenDistance`
            for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The q-gram length. Supplying this while leaving tokenizer=None
            makes the instance use a QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class, used by the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            The similarity threshold above which tokens are counted as
            members of the intersection in the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Benini I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I correlation

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3953727506426735
        >>> cmp.corr('aluminum', 'Catalan')
        0.11485180412371133
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        self._tokenize(src, tar)
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        # ad - bc is the covariance-like numerator of the 2x2 table.
        covariance = a * d - b * c
        if covariance == 0.0:
            # Short-circuit: avoids dividing when the result would be 0
            # (and any degenerate 0/0 case along with it).
            return 0.0
        return covariance / ((a + c) * (c + d))

    def sim(self, src: str, tar: str) -> float:
        """Return the Benini I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini I similarity

        Examples
        --------
        >>> cmp = BeniniI()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6976863753213367
        >>> cmp.sim('aluminum', 'Catalan')
        0.5574259020618557
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258

        .. versionadded:: 0.4.0

        """
        # Linearly rescale the correlation from [-1, 1] onto [0, 1].
        return (1 + self.corr(src, tar)) / 2
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_benini_ii.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._benini_ii.
Benini II correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BeniniII']
class BeniniII(_TokenDistance):
    r"""BeniniII correlation.

    Given two sets X and Y drawn from a population N, Benini II correlation,
    Benini's Index of Repulsion, :cite:`Benini:1901` is

        .. math::

            corr_{BeniniII}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {min(|Y| \cdot |N \setminus X|, |X| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            corr_{BeniniII} = \frac{ad-bc}{min((a+c)(c+d), (a+b)(b+d))}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BeniniII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the ``alphabet``
            description in :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, consequently, set type); see the
            ``intersection_type`` description in :py:class:`_TokenDistance`
            for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The q-gram length. Supplying this while leaving tokenizer=None
            makes the instance use a QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class, used by the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            The similarity threshold above which tokens are counted as
            members of the intersection in the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Benini II correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini II correlation

        Examples
        --------
        >>> cmp = BeniniII()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3953727506426735
        >>> cmp.corr('aluminum', 'Catalan')
        0.11485180412371133
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        self._tokenize(src, tar)
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        covariance = a * d - b * c
        if covariance == 0.0:
            # Short-circuit: avoids dividing when the result would be 0
            # (and any degenerate 0/0 case along with it).
            return 0.0
        # f(x) = (a+x)(x+d) is increasing for x >= 0, so the minimum of
        # (a+c)(c+d) and (a+b)(b+d) equals (a+min(b,c))(min(b,c)+d).
        smaller = min(b, c)
        return covariance / ((a + smaller) * (smaller + d))

    def sim(self, src: str, tar: str) -> float:
        """Return the Benini II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Benini II similarity

        Examples
        --------
        >>> cmp = BeniniII()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6976863753213367
        >>> cmp.sim('aluminum', 'Catalan')
        0.5574259020618557
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258

        .. versionadded:: 0.4.0

        """
        # Linearly rescale the correlation from [-1, 1] onto [0, 1].
        return (1 + self.corr(src, tar)) / 2
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_bennet.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._bennet.
Bennet's S correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Bennet']
class Bennet(_TokenDistance):
    r"""Bennet's S correlation.

    Given two sets X and Y drawn from a population N, Bennet's :math:`S`
    correlation :cite:`Bennet:1954` is

        .. math::

            corr_{Bennet}(X, Y) = S =
            \frac{p_o - p_e^S}{1 - p_e^S}

    where

        .. math::

            p_o = \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}

            p_e^S = \frac{1}{2}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            p_o = \frac{a+d}{n}

            p_e^S = \frac{1}{2}

    .. versionadded:: 0.4.0

    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Bennet instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens; see the ``alphabet``
            description in :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            The intersection type (and, consequently, set type); see the
            ``intersection_type`` description in :py:class:`_TokenDistance`
            for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The q-gram length. Supplying this while leaving tokenizer=None
            makes the instance use a QGram tokenizer with this q value.
        metric : _Distance
            A string distance measure class, used by the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            The similarity threshold above which tokens are counted as
            members of the intersection in the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Bennet's S correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Bennet's S correlation

        Examples
        --------
        >>> cmp = Bennet()
        >>> cmp.corr('cat', 'hat')
        0.989795918367347
        >>> cmp.corr('Niall', 'Neil')
        0.9821428571428572
        >>> cmp.corr('aluminum', 'Catalan')
        0.9617834394904459
        >>> cmp.corr('ATCG', 'TAGC')
        0.9744897959183674

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        self._tokenize(src, tar)
        a = self._intersection_card()
        d = self._total_complement_card()
        n = self._population_unique_card()
        # With p_e = 1/2, S = (p_o - 1/2) / (1 - 1/2) = 2*p_o - 1,
        # where p_o = (a + d) / n.
        return 2 * (a + d) / n - 1

    def sim(self, src: str, tar: str) -> float:
        """Return the Bennet's S similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Bennet's S similarity

        Examples
        --------
        >>> cmp = Bennet()
        >>> cmp.sim('cat', 'hat')
        0.9948979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.9910714285714286
        >>> cmp.sim('aluminum', 'Catalan')
        0.9808917197452229
        >>> cmp.sim('ATCG', 'TAGC')
        0.9872448979591837

        .. versionadded:: 0.4.0

        """
        # Linearly rescale the correlation from [-1, 1] onto [0, 1].
        return (1 + self.corr(src, tar)) / 2
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_bhattacharyya.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._bhattacharyya.
Bhattacharyya distance
"""
from math import log
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Bhattacharyya']
class Bhattacharyya(_TokenDistance):
    r"""Bhattacharyya distance.

    For two multisets X and Y drawn from an alphabet S, Bhattacharyya
    distance :cite:`Bhattacharyya:1946` is

        .. math::

            dist_{Bhattacharyya}(X, Y) =
            -log(\sum_{i \in S} \sqrt{X_iY_i})

    .. versionadded:: 0.4.0

    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize Bhattacharyya instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The q-gram length. Supplying this while leaving tokenizer=None
            makes the instance use a QGram tokenizer with this q value.

        .. versionadded:: 0.4.0

        """
        super().__init__(tokenizer=tokenizer, **kwargs)

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Bhattacharyya distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Bhattacharyya distance

        Examples
        --------
        >>> cmp = Bhattacharyya()
        >>> cmp.dist_abs('cat', 'hat')
        0.6931471805599453
        >>> cmp.dist_abs('Niall', 'Neil')
        1.0074515102711326
        >>> cmp.dist_abs('aluminum', 'Catalan')
        2.1383330595080277
        >>> cmp.dist_abs('ATCG', 'TAGC')
        -inf

        .. versionadded:: 0.4.0

        """
        coefficient = self.dist(src, tar)
        if coefficient == 0:
            # NOTE(review): mathematically -log(x) -> +inf as x -> 0+, yet
            # -inf is returned here, as pinned by the doctest above --
            # presumably an intentional sentinel for "no overlap"; confirm
            # before changing.
            return float('-inf')
        if coefficient == 1:
            return 0.0
        return -log(coefficient)

    def dist(self, src: str, tar: str) -> float:
        """Return the Bhattacharyya coefficient of two strings.

        Note that, despite its name, this returns the Bhattacharyya
        coefficient (sum of sqrt of products of token frequencies over the
        shared alphabet), which dist_abs transforms into the distance.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Bhattacharyya distance

        Examples
        --------
        >>> cmp = Bhattacharyya()
        >>> cmp.dist('cat', 'hat')
        0.5
        >>> cmp.dist('Niall', 'Neil')
        0.3651483716701107
        >>> cmp.dist('aluminum', 'Catalan')
        0.11785113019775792
        >>> cmp.dist('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)
        # Only tokens common to both multisets contribute nonzero terms.
        shared = self._intersection().keys()
        src_total = sum(self._src_tokens.values())
        tar_total = sum(self._tar_tokens.values())
        coefficient = 0.0
        for tok in shared:
            coefficient += (
                self._src_tokens[tok]
                / src_total
                * self._tar_tokens[tok]
                / tar_total
            ) ** 0.5
        return coefficient
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_bisim.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._bisim.
BI-SIM similarity
"""
from typing import Any, cast
from numpy import float_ as np_float
from numpy import zeros as np_zeros
from ._distance import _Distance
__all__ = ['BISIM']
class BISIM(_Distance):
    r"""BI-SIM similarity.

    BI-SIM similarity :cite:`Kondrak:2003` is an n-gram based, edit-distance
    derived similarity measure.

    .. versionadded:: 0.4.0

    """

    def __init__(self, qval: int = 2, **kwargs: Any) -> None:
        """Initialize BISIM instance.

        Parameters
        ----------
        qval : int
            The number of characters to consider in each n-gram (q-gram). By
            default this is 2, hence BI-SIM. But TRI-SIM can be calculated by
            setting this to 3.
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0

        """
        super().__init__(**kwargs)
        self._qval = qval

    def sim(self, src: str, tar: str) -> float:
        """Return the BI-SIM similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            BI-SIM similarity

        Examples
        --------
        >>> cmp = BISIM()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.4
        >>> cmp.sim('aluminum', 'Catalan')
        0.3125
        >>> cmp.sim('ATCG', 'TAGC')
        0.375

        .. versionadded:: 0.4.0

        """
        src_len = len(src)
        tar_len = len(tar)
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        # Left-pad each string with qval-1 case-flipped copies of its own
        # first character, so padding can't spuriously match real characters
        # of the other string.
        pad = self._qval - 1
        src = src[0].swapcase() * pad + src
        tar = tar[0].swapcase() * pad + tar

        def _ngram_sim(src_pos: int, tar_pos: int) -> float:
            # Fraction of position-wise equal characters between the q-grams
            # beginning at src_pos and tar_pos in the padded strings.
            matches = 0
            for offset in range(self._qval):
                matches += int(src[src_pos + offset] == tar[tar_pos + offset])
            return matches / self._qval

        # Longest-common-subsequence-style DP where a "match" scores the
        # fractional q-gram similarity instead of 0/1.
        dp = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                dp[i, j] = max(
                    dp[i - 1, j - 1] + _ngram_sim(i - 1, j - 1),  # sub/==
                    dp[i - 1, j],  # ins
                    dp[i, j - 1],  # del
                )
        return cast(float, dp[src_len, tar_len]) / max(src_len, tar_len)
if __name__ == '__main__':
    # Self-test: run the doctest examples embedded in this module.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_bleu.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._bleu.
BLEU similarity
"""
from math import exp, log
from typing import Any, List, Optional
from ._distance import _Distance
from ..tokenizer import QGrams, _Tokenizer
__all__ = ['BLEU']
class BLEU(_Distance):
    r"""BLEU similarity.

    BLEU similarity :cite:`Papineni:2002` compares two strings for similarity
    using a set of tokenizers and a brevity penalty:

    .. math::

        BP =
        \left\{
        \begin{array}{lrl}
            1 & \textup{if} & c > r \\
            e^{(1-\frac{r}{c})} & \textup{if} & c \leq r
        \end{array}
        \right.

    The BLEU score is then:

    .. math::

        \textup{B\textsc{leu}} = BP \cdot e^{\sum_{n=1}^N w_n log p_n}

    For tokenizers 1 to N, by default q-gram tokenizers for q=1 to N in
    Abydos, weights :math:`w_n`, which are uniformly :math:`\frac{1}{N}`,
    and :math:`p_n`:

    .. math::

        p_n = \frac{\sum_{token \in tar} min(Count(token \in tar),
        Count(token \in src))}{|tar|}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        n_min: int = 1,
        n_max: int = 4,
        tokenizers: Optional[List[_Tokenizer]] = None,
        weights: Optional[List[float]] = None,
        **kwargs: Any
    ) -> None:
        """Initialize BLEU instance.

        Parameters
        ----------
        n_min : int
            The minimum q-gram value for BLEU score calculation (1 by default)
        n_max : int
            The maximum q-gram value for BLEU score calculation (4 by default)
        tokenizers : list(_Tokenizer)
            A list of initialized tokenizers
        weights : list(float)
            A list of floats representing the weights of the tokenizers. If
            tokenizers is set, this must have the same length. If n_min and
            n_max are used to set tokenizers, this must have length equal to
            n_max-n_min+1. Otherwise, uniform weights will be used.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(BLEU, self).__init__(**kwargs)
        # Default to q-gram tokenizers for q = n_min .. n_max (inclusive),
        # i.e. n_max - n_min + 1 tokenizers in total.
        self._tokenizers = (
            [QGrams(qval=n, start_stop='') for n in range(n_min, n_max + 1)]
            if tokenizers is None
            else tokenizers
        )  # type: List[_Tokenizer]
        # If weights are absent or their length does not match the tokenizer
        # list, fall back to uniform weights (1/N each).
        if not weights or len(weights) != len(self._tokenizers):
            self._weights = [
                1.0 / len(self._tokenizers)
                for _ in range(len(self._tokenizers))
            ]
        else:
            self._weights = weights

    def sim(self, src: str, tar: str) -> float:
        """Return the BLEU similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            BLEU similarity

        Examples
        --------
        >>> cmp = BLEU()
        >>> cmp.sim('cat', 'hat')
        0.7598356856515925
        >>> cmp.sim('Niall', 'Neil')
        0.7247557929987696
        >>> cmp.sim('aluminum', 'Catalan')
        0.44815260192961937
        >>> cmp.sim('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.0

        """
        if not src or not tar:
            return 0.0

        # Brevity penalty: 1 when the target is at least as long as the
        # source; otherwise an exponential penalty on the length ratio.
        brevity_penalty = (
            1.0 if len(tar) >= len(src) else exp(1 - len(src) / len(tar))
        )

        bleu_sum = 0.0
        bleu_null = True

        for i in range(len(self._tokenizers)):
            self._tokenizers[i].tokenize(tar)
            tar_tokens = self._tokenizers[i].get_counter()
            self._tokenizers[i].tokenize(src)
            # Clipped counts: per-token minimum of src & tar counts.
            tokens_int = self._tokenizers[i].get_counter() & tar_tokens
            tar_total = sum(tar_tokens.values())

            if tokens_int:
                bleu_null = False
                # Weighted log-precision for this tokenizer.
                bleu_sum += (
                    log(sum(tokens_int.values()) / tar_total)
                    * self._weights[i]
                )

        # No tokenizer produced any overlap: similarity is 0.
        if bleu_null:
            return 0.0

        return brevity_penalty * exp(bleu_sum)
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_block_levenshtein.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._block_levenshtein.
Levenshtein distance with block operations
"""
from typing import Any, Callable, List, Tuple
from ._lcsstr import LCSstr
from ._levenshtein import Levenshtein
__all__ = ['BlockLevenshtein']
class BlockLevenshtein(Levenshtein):
    """Levenshtein distance with block operations.

    In addition to character-level insert, delete, and replace operations,
    this version of the Levenshtein distance supports block-level insert,
    delete, and replace, provided that the block occurs in both input
    strings.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        cost: Tuple[float, float, float, float] = (1, 1, 1, 1),
        normalizer: Callable[[List[float]], float] = max,
        **kwargs: Any
    ):
        """Initialize BlockLevenshtein instance.

        Parameters
        ----------
        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and transpositions,
            respectively (by default: (1, 1, 1, 1))
        normalizer : function
            A function that takes a list and computes a normalization term
            by which the edit distance is divided (max by default)
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(BlockLevenshtein, self).__init__(
            cost=cost, normalizer=normalizer, **kwargs
        )
        # Longest-common-substring helper used to find shared blocks.
        self.lcs = LCSstr()

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the block Levenshtein edit distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            The block Levenshtein edit distance between src & tar

        Examples
        --------
        >>> cmp = BlockLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        3
        >>> cmp.dist_abs('aluminum', 'Catalan')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        3


        .. versionadded:: 0.4.0

        """
        alphabet = set(src) | set(tar)
        next_char = ord('A')
        # Repeatedly collapse each shared block (longest common substring of
        # length > 1) in both strings to a single fresh character, so that a
        # block edit costs the same as a single-character edit.
        lcs = self.lcs.lcsstr(src, tar)
        while len(lcs) > 1:
            # Find a character not yet present in either string.
            while chr(next_char) in alphabet:
                next_char += 1
            # Reuse the already-computed LCS rather than recomputing it.
            src = src.replace(lcs, chr(next_char))
            tar = tar.replace(lcs, chr(next_char))
            alphabet.add(chr(next_char))
            lcs = self.lcs.lcsstr(src, tar)
        # Plain Levenshtein on the block-collapsed strings.
        d = super(BlockLevenshtein, self).dist_abs(src, tar)
        return d

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized block Levenshtein distance between strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Levenshtein distance with blocks between src & tar

        Examples
        --------
        >>> cmp = BlockLevenshtein()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.6
        >>> cmp.dist('aluminum', 'Catalan')
        0.875
        >>> cmp.dist('ATCG', 'TAGC')
        0.75


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0
        ins_cost, del_cost = self._cost[:2]
        # Normalize by the chosen normalizer over the maximum possible cost
        # of deleting all of src or inserting all of tar.
        return self.dist_abs(src, tar) / (
            self._normalizer([len(src) * del_cost, len(tar) * ins_cost])
        )
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_brainerd_robinson.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._brainerd_robinson.
Brainerd-Robinson similarity
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BrainerdRobinson']
class BrainerdRobinson(_TokenDistance):
    r"""Brainerd-Robinson similarity.

    For two multisets X and Y drawn from an alphabet S, Brainerd-Robinson
    similarity :cite:`Robinson:1951,Brainerd:1951` is

        .. math::

            sim_{BrainerdRobinson}(X, Y) =
            200 - 100 \cdot \sum_{i \in S} |\frac{X_i}{\sum_{i \in S} |X_i|} -
            \frac{Y_i}{\sum_{i \in S} |Y_i|}|

    .. versionadded:: 0.4.0
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize BrainerdRobinson instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.


        .. versionadded:: 0.4.0

        """
        super(BrainerdRobinson, self).__init__(tokenizer=tokenizer, **kwargs)

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Brainerd-Robinson similarity of two strings.

        The raw score lies on the interval [0, 200], with 200 meaning the
        two strings' token frequency profiles are identical.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Brainerd-Robinson similarity

        Examples
        --------
        >>> cmp = BrainerdRobinson()
        >>> cmp.sim_score('cat', 'hat')
        100.0
        >>> cmp.sim_score('Niall', 'Neil')
        66.66666666666669
        >>> cmp.sim_score('aluminum', 'Catalan')
        22.2222222222222
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Guard against empty token multisets when normalizing.
        src_total = max(1, self._src_card())
        tar_total = max(1, self._tar_card())

        # Accumulate the total absolute difference of relative frequencies
        # across every token seen in either string.
        deviation = 0.0
        for tok in self._total().keys():
            deviation += abs(
                self._src_tokens[tok] / src_total
                - self._tar_tokens[tok] / tar_total
            )

        score = 200.0 - 100.0 * deviation
        # Snap floating-point residue at the bottom of the range to 0.
        if score < 1e-13:
            return 0.0
        return score

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Brainerd-Robinson similarity of two strings.

        This is the raw score rescaled from [0, 200] to [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Brainerd-Robinson similarity

        Examples
        --------
        >>> cmp = BrainerdRobinson()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.3333333333333334
        >>> cmp.sim('aluminum', 'Catalan')
        0.111111111111111
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        return self.sim_score(src, tar) / 200.0
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_braun_blanquet.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._braun_blanquet.
Braun-Blanquet similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['BraunBlanquet']
class BraunBlanquet(_TokenDistance):
    r"""Braun-Blanquet similarity.

    For two sets X and Y and a population N, the Braun-Blanquet
    similarity :cite:`BraunBlanquet:1932` is

        .. math::

            sim_{BraunBlanquet}(X, Y) = \frac{|X \cap Y|}{max(|X|, |Y|)}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{BraunBlanquet} =
            \frac{a}{max(a+b, a+c)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize BraunBlanquet instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(BraunBlanquet, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Braun-Blanquet similarity of two strings.

        This is the cardinality of the token intersection divided by the
        larger of the two token-set cardinalities.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Braun-Blanquet similarity

        Examples
        --------
        >>> cmp = BraunBlanquet()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.3333333333333333
        >>> cmp.sim('aluminum', 'Catalan')
        0.1111111111111111
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        # Identical strings are maximally similar by definition.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        larger_card = max(self._src_card(), self._tar_card())
        return self._intersection_card() / larger_card
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_canberra.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._canberra.
Canberra distance
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Canberra']
class Canberra(_TokenDistance):
    r"""Canberra distance.

    For two sets X and Y, the Canberra distance :cite:`Lance:1966,Lance:1967b`
    is

        .. math::

            sim_{Canberra}(X, Y) = \frac{|X \triangle Y|}{|X|+|Y|}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{Canberra} =
            \frac{b+c}{(a+b)+(a+c)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Canberra instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(Canberra, self).__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the Canberra distance of two strings.

        This is the cardinality of the symmetric difference of the two
        token multisets, divided by the total cardinality of both.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Canberra distance

        Examples
        --------
        >>> cmp = Canberra()
        >>> cmp.dist('cat', 'hat')
        0.5
        >>> cmp.dist('Niall', 'Neil')
        0.6363636363636364
        >>> cmp.dist('aluminum', 'Catalan')
        0.8823529411764706
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.0

        """
        # Identical strings are at distance zero by definition.
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        disagreement = self._symmetric_difference_card()
        return disagreement / self._total_card()
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_cao.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._cao.
Cao's CY dissimilarity.
"""
from math import log10
from typing import Any
from ._token_distance import _TokenDistance
__all__ = ['Cao']
class Cao(_TokenDistance):
    r"""Cao's CY dissimilarity.

    Given :math:`X_{ij}` (the number of individuals of species :math:`j` in
    sample :math:`i`), :math:`X_{kj}` (the number of individuals of species
    :math:`j` in sample :math:`k`), and :math:`N` (the total number of species
    present in both samples), Cao dissimilarity (CYd) :cite:`Cao:1997` is:

        .. math::

            dist_{Cao}(X, Y) =
            CYd = \frac{1}{N}\sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
            \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
            {X_{ij}+X_{kj}}\Bigg)

    In the above formula, whenever :math:`X_{ij} = 0` or :math:`X_{kj} = 0`,
    the value 0.1 is substituted.

    Since this measure ranges from 0 to :math:`\infty`, a similarity measure,
    CYs, ranging from 0 to 1 was also developed.

        .. math::

            sim_{Cao}(X, Y) = CYs = 1 - \frac{Observed~CYd}{Maximum~CYd}

    where

        .. math::

            Observed~CYd = \sum\Bigg(\frac{(X_{ij} + X_{kj})log_{10}\big(
            \frac{X_{ij}+X_{kj}}{2}\big)-X_{ij}log_{10}X_{kj}-X_{kj}log_{10}X_{ij}}
            {X_{ij}+X_{kj}}\Bigg)

    and with :math:`a` (the number of species present in both samples),
    :math:`b` (the number of species present in sample :math:`i` only), and
    :math:`c` (the number of species present in sample :math:`j` only),

        .. math::

            Maximum~CYd = D_1 + D_2 + D_3

    with

        .. math::

            D_1 = \sum_{j=1}^b \Bigg(\frac{(X_{ij} + 0.1) log_{10} \big(
            \frac{X_{ij}+0.1}{2}\big)-X_{ij}log_{10}0.1-0.1log_{10}X_{ij}}
            {X_{ij}+0.1}\Bigg)

            D_2 = \sum_{j=1}^c \Bigg(\frac{(X_{kj} + 0.1) log_{10} \big(
            \frac{X_{kj}+0.1}{2}\big)-X_{kj}log_{10}0.1-0.1log_{10}X_{kj}}
            {X_{kj}+0.1}\Bigg)

            D_3 = \sum_{j=1}^a \frac{a}{2} \Bigg(\frac{(D_i + 1) log_{10}
            \big(\frac{D_i+1}{2}\big)-log_{10}D_i}{D_i+1} + \frac{(D_k + 1) log_{10}
            \big(\frac{D_k+1}{2}\big)-log_{10}D_k}{D_k+1}\Bigg)

    with

        .. math::

            D_i = \frac{\sum X_{ij} - \frac{a}{2}}{\frac{a}{2}}

            D_k = \frac{\sum X_{kj} - \frac{a}{2}}{\frac{a}{2}}

    for

        .. math::

            X_{ij} \geq 1

            X_{kj} \geq 1

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize Cao instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(Cao, self).__init__(**kwargs)

    def sim(self, src: str, tar: str) -> float:
        """Return Cao's CY similarity (CYs) of two strings.

        Computed as 1 - (observed CYd / maximum CYd), clamped to [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cao's CY similarity

        Examples
        --------
        >>> cmp = Cao()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.0
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0
        self._tokenize(src, tar)
        # Every token occurring in either string.
        alphabet = self._total().keys()
        # a/2, where a is the number of token types shared by both strings.
        in_both_samples_half = len(self._intersection().keys()) / 2
        # No shared tokens: CYs is defined as 0 here.
        if not in_both_samples_half:
            return 0.0
        observed_cyd = 0.0
        maximum_cyd = 0.0
        for symbol in alphabet:
            # Substitute 0.1 for zero counts, per Cao's convention.
            src_tok = max(0.1, self._src_tokens[symbol])
            tar_tok = max(0.1, self._tar_tokens[symbol])
            tok_sum = src_tok + tar_tok
            # Per-token term of the observed CYd sum.
            observed_cyd += (
                tok_sum * log10(tok_sum / 2)
                - src_tok * log10(tar_tok)
                - tar_tok * log10(src_tok)
            ) / tok_sum
            # D_1 term: token present in src only.
            if self._tar_tokens[symbol] == 0:
                maximum_cyd += (
                    (self._src_tokens[symbol] + 0.1)
                    * log10((self._src_tokens[symbol] + 0.1) / 2)
                    - self._src_tokens[symbol] * log10(0.1)
                    - 0.1 * log10(self._src_tokens[symbol])
                ) / (self._src_tokens[symbol] + 0.1)
            # D_2 term: token present in tar only.
            elif self._src_tokens[symbol] == 0:
                maximum_cyd += (
                    (self._tar_tokens[symbol] + 0.1)
                    * log10((self._tar_tokens[symbol] + 0.1) / 2)
                    - self._tar_tokens[symbol] * log10(0.1)
                    - 0.1 * log10(self._tar_tokens[symbol])
                ) / (self._tar_tokens[symbol] + 0.1)
        # D_3 term: normalized mean counts (D_i, D_k) over shared tokens.
        d_i = 0.0
        d_k = 0.0
        for symbol in self._intersection().keys():
            d_i += self._src_tokens[symbol]
            d_k += self._tar_tokens[symbol]
        d_i = (d_i - in_both_samples_half) / in_both_samples_half
        d_k = (d_k - in_both_samples_half) / in_both_samples_half
        maximum_cyd += in_both_samples_half * (
            ((d_i + 1) * log10((d_i + 1) / 2) - log10(d_i)) / (d_i + 1)
            + ((d_k + 1) * log10((d_k + 1) / 2) - log10(d_k)) / (d_k + 1)
        )
        # Clamp to [0, 1] to absorb floating-point excursions.
        return max(0.0, min(1.0, 1 - (observed_cyd / maximum_cyd)))

    def dist_abs(self, src: str, tar: str) -> float:
        """Return Cao's CY dissimilarity (CYd) of two strings.

        This is the mean, over all tokens in either string, of the
        per-token divergence term, with 0.1 substituted for zero counts.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cao's CY dissimilarity

        Examples
        --------
        >>> cmp = Cao()
        >>> cmp.dist_abs('cat', 'hat')
        0.3247267992925765
        >>> cmp.dist_abs('Niall', 'Neil')
        0.4132886536450973
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.5530666041976232
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.6494535985851531


        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0.0
        self._tokenize(src, tar)
        alphabet = self._total().keys()
        score = 0.0
        for symbol in alphabet:
            # Substitute 0.1 for zero counts, per Cao's convention.
            src_tok = max(0.1, self._src_tokens[symbol])
            tar_tok = max(0.1, self._tar_tokens[symbol])
            tok_sum = src_tok + tar_tok
            score += (
                tok_sum * log10(tok_sum / 2)
                - src_tok * log10(tar_tok)
                - tar_tok * log10(src_tok)
            ) / tok_sum
        # Average over the total token count of both strings.
        return score / sum(self._total().values())
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_chao_dice.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._chao_dice.
Chao's Dice similarity
"""
from typing import Any
from ._chao_jaccard import ChaoJaccard
__all__ = ['ChaoDice']
class ChaoDice(ChaoJaccard):
    r"""Chao's Dice similarity.

    Chao's Dice similarity :cite:`Chao:2004`

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize ChaoDice instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(ChaoDice, self).__init__(**kwargs)

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Chao's Dice similarity of two strings.

        The raw score is clamped to the interval [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Chao's Dice similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoDice()
        >>> cmp.sim('cat', 'hat')
        0.36666666666666664
        >>> cmp.sim('Niall', 'Neil')
        0.27868852459016397
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        raw = self.sim_score(src, tar)
        return min(1.0, max(0.0, raw))

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Chao's Dice similarity of two strings.

        Computed as :math:`\\frac{2 \\hat{U} \\hat{V}}{\\hat{U} + \\hat{V}}`
        from the stochastic estimates U-hat & V-hat, so results depend on
        the random module's state.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Chao's Dice similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoDice()
        >>> cmp.sim_score('cat', 'hat')
        0.36666666666666664
        >>> cmp.sim_score('Niall', 'Neil')
        0.27868852459016397
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        self._tokenize(src, tar)
        u_hat, v_hat = self._get_estimates(src, tar)
        product = u_hat * v_hat
        if not product:
            return 0.0
        return 2 * product / (u_hat + v_hat)
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_chao_jaccard.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._chao_jaccard.
Chao's Jaccard similarity
"""
from collections import Counter
from typing import Any, Tuple
try:
    from random import choices
except ImportError:  # pragma: no cover
    # random.choices was added in Python 3.6; emulate it on older
    # interpreters with repeated independent draws via random.choice.
    from random import choice

    def choices(population, k=1):  # type: ignore
        """Backport of random.choices (with replacement) for Python < 3.6."""
        return [choice(population) for _ in range(k)]
from ._token_distance import _TokenDistance
__all__ = ['ChaoJaccard']
class ChaoJaccard(_TokenDistance):
    r"""Chao's Jaccard similarity.

    Chao's Jaccard similarity :cite:`Chao:2004`

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize ChaoJaccard instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(ChaoJaccard, self).__init__(**kwargs)

    def sim(self, src: str, tar: str) -> float:
        """Return normalized Chao's Jaccard similarity of two strings.

        The raw score is clamped to the interval [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Chao's Jaccard similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoJaccard()
        >>> cmp.sim('cat', 'hat')
        0.22448979591836735
        >>> cmp.sim('Niall', 'Neil')
        0.1619047619047619
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        return max(0.0, min(1.0, self.sim_score(src, tar)))

    def sim_score(self, src: str, tar: str) -> float:
        """Return Chao's Jaccard similarity of two strings.

        Computed as :math:`\\frac{\\hat{U}\\hat{V}}
        {\\hat{U} + \\hat{V} - \\hat{U}\\hat{V}}` from the stochastic
        estimates U-hat & V-hat, so results depend on the random module's
        state.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Chao's Jaccard similarity

        Examples
        --------
        >>> import random
        >>> random.seed(0)
        >>> cmp = ChaoJaccard()
        >>> cmp.sim_score('cat', 'hat')
        0.22448979591836735
        >>> cmp.sim_score('Niall', 'Neil')
        0.1619047619047619
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        self._tokenize(src, tar)
        self._intersection()
        # With no shared tokens, the similarity is 0 (and sampling would be
        # pointless).
        if self._intersection_card() == 0:
            return 0.0
        u_hat, v_hat = self._get_estimates(src, tar)
        num = u_hat * v_hat
        if num:
            return num / (u_hat + v_hat - u_hat * v_hat)
        return 0.0

    def _get_estimates(self, src: str, tar: str) -> Tuple[float, float]:
        """Get the estimates U-hat & V-hat used for Chao's measures.

        The estimates are computed from bootstrap resamples (with
        replacement) of each string's token list, so they are stochastic:
        repeated calls may return different values unless the random
        module is seeded.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        tuple(float, float)
            The estimates U-hat & V-hat


        .. versionadded:: 0.4.1

        """
        src_card = self._src_card()  # n
        tar_card = self._tar_card()  # m
        src_token_list = self.params['tokenizer'].tokenize(src).get_list()
        tar_token_list = self.params['tokenizer'].tokenize(tar).get_list()
        # Resample each token list to its original size, with replacement.
        src_sampled = Counter(choices(src_token_list, k=int(src_card)))
        tar_sampled = Counter(choices(tar_token_list, k=int(tar_card)))
        # Tokens appearing in both resamples (multiset minimum of counts).
        sample_intersection = src_sampled & tar_sampled
        # f_1+/f_2+: shared tokens observed exactly once/twice in the src
        # resample.
        f_1_plus = sum(
            1 if src_sampled[tok] == 1 and tar_sampled[tok] >= 1 else 0
            for tok in sample_intersection
        )
        f_2_plus = sum(
            1 if src_sampled[tok] == 2 and tar_sampled[tok] >= 1 else 0
            for tok in sample_intersection
        )
        # Avoid a zero divisor in the u_hat correction term below.
        if not f_2_plus:
            f_2_plus = 1
        # f_+1/f_+2: shared tokens observed exactly once/twice in the tar
        # resample.
        f_plus_1 = sum(
            1 if src_sampled[tok] >= 1 and tar_sampled[tok] == 1 else 0
            for tok in sample_intersection
        )
        f_plus_2 = sum(
            1 if src_sampled[tok] >= 1 and tar_sampled[tok] == 2 else 0
            for tok in sample_intersection
        )
        # Avoid a zero divisor in the v_hat correction term below.
        if not f_plus_2:
            f_plus_2 = 1
        # U-hat: observed src relative frequency of shared tokens, plus an
        # unseen-species correction based on tar singleton counts.
        u_hat = 0.0
        if src_card:
            u_hat += sum(
                src_sampled[tok] / src_card
                for tok in sample_intersection.keys()
            )
        if tar_card:
            u_hat += (
                (tar_card - 1)
                / tar_card
                * f_plus_1
                / (2 * f_plus_2)
                * sum(
                    src_sampled[tok] / src_card * (tar_sampled[tok] == 1)
                    for tok in sample_intersection.keys()
                )
            )
        # V-hat: the symmetric estimate with src & tar roles exchanged.
        v_hat = 0.0
        if tar_card:
            v_hat += sum(
                tar_sampled[tok] / tar_card
                for tok in sample_intersection.keys()
            )
        if src_card:
            v_hat += (
                (src_card - 1)
                / src_card
                * f_1_plus
                / (2 * f_2_plus)
                * sum(
                    tar_sampled[tok] / tar_card * (src_sampled[tok] == 1)
                    for tok in sample_intersection.keys()
                )
            )
        return u_hat, v_hat
# Run this module's embedded doctests when executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_chebyshev.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._chebyshev.
Chebyshev distance
"""
from typing import (
Any,
Counter as TCounter,
NoReturn,
Optional,
Sequence,
Set,
Union,
)
from ._minkowski import Minkowski
from ..tokenizer import _Tokenizer
__all__ = ['Chebyshev']
class Chebyshev(Minkowski):
    r"""Chebyshev distance.

    Chebyshev distance is the chessboard distance,
    equivalent to Minkowski distance in :math:`L^\infty`-space.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = 0,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Chebyshev instance.

        Parameters
        ----------
        alphabet : collection or int
            The values or size of the alphabet
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # Chebyshev is Minkowski with p = infinity.
        super(Chebyshev, self).__init__(
            pval=float('inf'),
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist_abs(self, src: str, tar: str, *args: Any, **kwargs: Any) -> float:
        r"""Return the Chebyshev distance between two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            The Chebyshev distance

        Examples
        --------
        >>> cmp = Chebyshev()
        >>> cmp.dist_abs('cat', 'hat')
        1.0
        >>> cmp.dist_abs('Niall', 'Neil')
        1.0
        >>> cmp.dist_abs('Colin', 'Cuilen')
        1.0
        >>> cmp.dist_abs('ATCG', 'TAGC')
        1.0
        >>> cmp = Chebyshev(qval=1)
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.0
        >>> cmp.dist_abs('ATCGATTCGGAATTTC', 'TAGCATAATCGCCG')
        3.0


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Delegate to Minkowski with normalization disabled.
        return super(Chebyshev, self).dist_abs(src, tar, False)

    def sim(self, *args: Any, **kwargs: Any) -> NoReturn:
        """Raise exception when called.

        Parameters
        ----------
        *args
            Variable length argument list
        **kwargs
            Arbitrary keyword arguments

        Raises
        ------
        NotImplementedError
            Method disabled for Chebyshev distance


        .. versionadded:: 0.3.6

        """
        raise NotImplementedError('Method disabled for Chebyshev distance.')

    def dist(self, *args: Any, **kwargs: Any) -> NoReturn:
        """Raise exception when called.

        Parameters
        ----------
        *args
            Variable length argument list
        **kwargs
            Arbitrary keyword arguments

        Raises
        ------
        NotImplementedError
            Method disabled for Chebyshev distance


        .. versionadded:: 0.3.6

        """
        raise NotImplementedError('Method disabled for Chebyshev distance.')
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_chord.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._chord.
Chord distance
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Chord']
class Chord(_TokenDistance):
    r"""Chord distance.

    For two sets X and Y drawn from an alphabet S, the chord distance
    :cite:`Orloci:1967` is

        .. math::

            sim_{chord}(X, Y) =
            \sqrt{\sum_{i \in S}\Big(\frac{X_i}{\sqrt{\sum_{j \in X} X_j^2}} -
            \frac{Y_i}{\sqrt{\sum_{j \in Y} Y_j^2}}\Big)^2}

    I.e. each token-count vector is scaled to unit Euclidean length, and the
    Euclidean distance between the scaled vectors is returned.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Chord instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super(Chord, self).__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Chord distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Chord distance

        Examples
        --------
        >>> cmp = Chord()
        >>> cmp.dist_abs('cat', 'hat')
        1.0
        >>> cmp.dist_abs('Niall', 'Neil')
        1.126811100699571
        >>> cmp.dist_abs('aluminum', 'Catalan')
        1.336712116966249
        >>> cmp.dist_abs('ATCG', 'TAGC')
        1.414213562373095

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        def vector_norm(tokens: TCounter[str]) -> float:
            # Euclidean norm of the count vector, floored at 1 so that an
            # empty token multiset does not cause division by zero.
            return max(1, sum(count * count for count in tokens.values()) ** 0.5)

        src_norm = vector_norm(self._src_tokens)
        tar_norm = vector_norm(self._tar_tokens)

        squared_sum = sum(
            (self._src_tokens[tok] / src_norm - self._tar_tokens[tok] / tar_norm)
            ** 2
            for tok in self._total().keys()
        )
        # Round to 15 digits to stabilize floating-point noise in doctests.
        return round(squared_sum ** 0.5, 15)

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Chord distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized chord distance

        Examples
        --------
        >>> cmp = Chord()
        >>> cmp.dist('cat', 'hat')
        0.707106781186547
        >>> cmp.dist('Niall', 'Neil')
        0.796775770420944
        >>> cmp.dist('aluminum', 'Catalan')
        0.94519820240106
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.0

        """
        # The maximum chord distance between unit vectors is sqrt(2).
        return round(self.dist_abs(src, tar) / (2 ** 0.5), 15)
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_clark.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._clark.
Clark's coefficient of divergence
"""
from typing import Any
from ._token_distance import _TokenDistance
__all__ = ['Clark']
class Clark(_TokenDistance):
    r"""Clark's coefficient of divergence.

    For two sets X and Y and a population N, Clark's coefficient of divergence
    :cite:`Clark:1952` is:

        .. math::

            dist_{Clark}(X, Y) = \sqrt{\frac{\sum_{i=0}^{|N|}
            \big(\frac{x_i-y_i}{x_i+y_i}\big)^2}{|N|}}

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize Clark instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.1

        """
        super(Clark, self).__init__(**kwargs)

    def dist(self, src: str, tar: str) -> float:
        """Return Clark's coefficient of divergence of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Clark's coefficient of divergence

        Examples
        --------
        >>> cmp = Clark()
        >>> cmp.dist('cat', 'hat')
        0.816496580927726
        >>> cmp.dist('Niall', 'Neil')
        0.8819171036881969
        >>> cmp.dist('aluminum', 'Catalan')
        0.9660917830792959
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 0.0

        self._tokenize(src, tar)

        src_tok = self._src_tokens
        tar_tok = self._tar_tokens
        # dict key views union directly to a set; the former set() wrapper
        # around keys() | keys() was redundant.
        alphabet = src_tok.keys() | tar_tok.keys()

        # Every ltr is in at least one multiset, so the denominator below
        # is always positive.
        return (
            sum(
                ((src_tok[ltr] - tar_tok[ltr]) / (src_tok[ltr] + tar_tok[ltr]))
                ** 2
                for ltr in alphabet
            )
            / len(alphabet)
        ) ** 0.5
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_clement.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._clement.
Clement similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Clement']
class Clement(_TokenDistance):
    r"""Clement similarity.

    For two sets X and Y and a population N, Clement similarity
    :cite:`Clement:1976` is defined as

        .. math::

            sim_{Clement}(X, Y) =
            \frac{|X \cap Y|}{|X|}\Big(1-\frac{|X|}{|N|}\Big) +
            \frac{|(N \setminus X) \setminus Y|}{|N \setminus X|}
            \Big(1-\frac{|N \setminus X|}{|N|}\Big)

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{Clement} =
            \frac{a}{a+b}\Big(1 - \frac{a+b}{n}\Big) +
            \frac{d}{c+d}\Big(1 - \frac{c+d}{n}\Big)

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Clement instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super(Clement, self).__init__(
            tokenizer=tokenizer,
            alphabet=alphabet,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Clement similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Clement similarity

        Examples
        --------
        >>> cmp = Clement()
        >>> cmp.sim('cat', 'hat')
        0.5025379382522239
        >>> cmp.sim('Niall', 'Neil')
        0.33840586363079933
        >>> cmp.sim('aluminum', 'Catalan')
        0.12119877280918714
        >>> cmp.sim('ATCG', 'TAGC')
        0.006336616803332366

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        # Each term is skipped when its denominator would be zero.
        result = 0.0
        src_card = a + b
        if src_card:
            result += (a / src_card) * (1 - src_card / n)
        complement_card = c + d
        if complement_card:
            result += (d / complement_card) * (1 - complement_card / n)
        return result
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_cohen_kappa.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._cohen_kappa.
Cohen's Kappa similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['CohenKappa']
class CohenKappa(_TokenDistance):
    r"""Cohen's Kappa similarity.

    For two sets X and Y and a population N, Cohen's \kappa similarity
    :cite:`Cohen:1960` is

        .. math::

            sim_{Cohen_\kappa}(X, Y) = \kappa =
            \frac{p_o - p_e^\kappa}{1 - p_e^\kappa}

    where

        .. math::

            \begin{array}{l}
            p_o = \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}\\
            \\
            p_e^\kappa = \frac{|X|}{|N|} \cdot \frac{|Y|}{|N|} +
            \frac{|N \setminus X|}{|N|} \cdot \frac{|N \setminus Y|}{|N|}
            \end{array}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            \begin{array}{l}
            p_o = \frac{a+d}{n}\\
            \\
            p_e^\kappa = \frac{a+b}{n} \cdot \frac{a+c}{n} +
            \frac{c+d}{n} \cdot \frac{b+d}{n}
            \end{array}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize CohenKappa instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super(CohenKappa, self).__init__(
            tokenizer=tokenizer,
            alphabet=alphabet,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return Cohen's Kappa similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Cohen's Kappa similarity

        Examples
        --------
        >>> cmp = CohenKappa()
        >>> cmp.sim('cat', 'hat')
        0.9974358974358974
        >>> cmp.sim('Niall', 'Neil')
        0.9955041746949261
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903412749517064
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # NOTE(review): this computes 2d/(b+c+2d), which does not appear to
        # reduce to the kappa formula given in the class docstring (that
        # would be 2(ad-bc)/((a+b)(b+d)+(a+c)(c+d))).  The doctest values
        # above match this code, so the behavior is preserved as-is —
        # confirm which form is intended.
        if not d:
            return 0.0
        return 2 * d / (b + c + 2 * d)
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_cole.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._cole.
Cole correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Cole']
class Cole(_TokenDistance):
    r"""Cole correlation.

    For two sets X and Y and a population N, the Cole correlation
    :cite:`Cole:1949` has three formulae:

    - If :math:`|X \cap Y| \cdot |(N \setminus X) \setminus Y| \geq
      |X \setminus Y| \cdot |Y \setminus X|` then

        .. math::

            corr_{Cole}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {(|X \cap Y| + |X \setminus Y|) \cdot
            (|X \setminus Y| + |(N \setminus X) \setminus Y|)}

    - If :math:`|(N \setminus X) \setminus Y| \geq |X \cap Y|` then

        .. math::

            corr_{Cole}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {(|X \cap Y| + |X \setminus Y|) \cdot
            (|X \cap Y| + |Y \setminus X|)}

    - Otherwise

        .. math::

            corr_{Cole}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {(|X \setminus Y| + |(N \setminus X) \setminus Y|) \cdot
            (|Y \setminus X| + |(N \setminus X) \setminus Y|)}

    Cole terms this measurement the Coefficient of Interspecific Association.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            corr_{Cole} =
            \left\{
            \begin{array}{ll}
            \frac{ad-bc}{(a+b)(b+d)} & \textup{if} ~ad \geq bc \\
            \\
            \frac{ad-bc}{(a+b)(a+c)} & \textup{if} ~d \geq a \\
            \\
            \frac{ad-bc}{(b+d)(c+d)} & \textup{otherwise}
            \end{array}
            \right.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Cole instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super(Cole, self).__init__(
            tokenizer=tokenizer,
            alphabet=alphabet,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Cole correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Cole correlation

        Examples
        --------
        >>> cmp = Cole()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3290543431750107
        >>> cmp.corr('aluminum', 'Catalan')
        0.10195910195910196
        >>> cmp.corr('ATCG', 'TAGC')
        -1.0

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        numerator = a * d - b * c
        if numerator == 0.0:
            return 0.0

        # Pick the denominator per Cole's three-case definition.
        if a * d >= b * c:
            denominator = (a + b) * (b + d)
        elif d >= a:
            denominator = (a + b) * (a + c)
        else:
            denominator = (b + d) * (c + d)
        return numerator / denominator

    def sim(self, src: str, tar: str) -> float:
        """Return the Cole similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for similarity
        tar : str
            Target string (or QGrams/Counter objects) for similarity

        Returns
        -------
        float
            Cole similarity

        Examples
        --------
        >>> cmp = Cole()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6645271715875054
        >>> cmp.sim('aluminum', 'Catalan')
        0.550979550979551
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0

        """
        # Map the correlation from [-1, 1] onto [0, 1].
        correlation = self.corr(src, tar)
        return (1 + correlation) / 2
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_complete_linkage.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._complete_linkage.
Complete linkage distance
"""
from typing import Any, Optional, cast
from ._distance import _Distance
from ._levenshtein import Levenshtein
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['CompleteLinkage']
class CompleteLinkage(_TokenDistance):
    r"""Complete linkage distance.

    For two multisets X and Y, complete linkage distance
    :cite:`Deza:2016` is

        .. math::

            sim_{CompleteLinkage}(X, Y) =
            max_{i \in X, j \in Y} dist(X_i, Y_j)

    I.e. the largest pairwise distance between any token of X and any token
    of Y, as measured by the configured metric.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        metric: Optional[_Distance] = None,
        **kwargs: Any
    ) -> None:
        """Initialize CompleteLinkage instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants. (Defaults to Levenshtein distance)
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.

        .. versionadded:: 0.4.0

        """
        super(CompleteLinkage, self).__init__(tokenizer=tokenizer, **kwargs)
        # Fall back to Levenshtein when no metric is supplied.
        self._metric = (
            cast(_Distance, Levenshtein()) if metric is None else metric
        )

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the complete linkage distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            complete linkage distance

        Examples
        --------
        >>> cmp = CompleteLinkage()
        >>> cmp.dist_abs('cat', 'hat')
        2
        >>> cmp.dist_abs('Niall', 'Neil')
        2
        >>> cmp.dist_abs('aluminum', 'Catalan')
        2
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)
        src_tok, tar_tok = self._get_tokens()

        # -inf is returned when either token set is empty, matching the
        # accumulator's starting value in the original formulation.
        return max(
            (
                self._metric.dist_abs(src_term, tar_term)
                for src_term in src_tok
                for tar_term in tar_tok
            ),
            default=float('-inf'),
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized complete linkage distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            normalized complete linkage distance

        Examples
        --------
        >>> cmp = CompleteLinkage()
        >>> cmp.dist('cat', 'hat')
        1.0
        >>> cmp.dist('Niall', 'Neil')
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)
        src_tok, tar_tok = self._get_tokens()

        return max(
            (
                self._metric.dist(src_term, tar_term)
                for src_term in src_tok
                for tar_term in tar_tok
            ),
            default=0.0,
        )
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_consonni_todeschini_i.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._consonni_todeschini_i.
Consonni & Todeschini I similarity
"""
from math import log1p
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['ConsonniTodeschiniI']
class ConsonniTodeschiniI(_TokenDistance):
    r"""Consonni & Todeschini I similarity.

    For two sets X and Y and a population N, Consonni & Todeschini I similarity
    :cite:`Consonni:2012` is

        .. math::

            sim_{ConsonniTodeschiniI}(X, Y) =
            \frac{log(1+|X \cap Y|+|(N \setminus X) \setminus Y|)}
            {log(1+|N|)}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{ConsonniTodeschiniI} =
            \frac{log(1+a+d)}{log(1+n)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ConsonniTodeschiniI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super(ConsonniTodeschiniI, self).__init__(
            tokenizer=tokenizer,
            alphabet=alphabet,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Consonni & Todeschini I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Consonni & Todeschini I similarity

        Examples
        --------
        >>> cmp = ConsonniTodeschiniI()
        >>> cmp.sim('cat', 'hat')
        0.9992336018090547
        >>> cmp.sim('Niall', 'Neil')
        0.998656222829757
        >>> cmp.sim('aluminum', 'Catalan')
        0.9971098629456009
        >>> cmp.sim('ATCG', 'TAGC')
        0.9980766131469967

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # a + d: tokens on which the two sets agree (shared plus jointly
        # absent from the population).
        agreements = self._intersection_card() + self._total_complement_card()
        return log1p(agreements) / log1p(self._population_unique_card())
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_consonni_todeschini_ii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._consonni_todeschini_ii.
Consonni & Todeschini II similarity
"""
from math import log1p
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['ConsonniTodeschiniII']
class ConsonniTodeschiniII(_TokenDistance):
    r"""Consonni & Todeschini II similarity.

    For two sets X and Y and a population N, Consonni & Todeschini II
    similarity :cite:`Consonni:2012` is

        .. math::

            sim_{ConsonniTodeschiniII}(X, Y) =
            \frac{log(1+|N|) - log(1+|X \setminus Y|+|Y \setminus X|)}
            {log(1+|N|)}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{ConsonniTodeschiniII} =
            \frac{log(1+n)-log(1+b+c)}{log(1+n)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ConsonniTodeschiniII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super(ConsonniTodeschiniII, self).__init__(
            tokenizer=tokenizer,
            alphabet=alphabet,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Consonni & Todeschini II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Consonni & Todeschini II similarity

        Examples
        --------
        >>> cmp = ConsonniTodeschiniII()
        >>> cmp.sim('cat', 'hat')
        0.7585487129939101
        >>> cmp.sim('Niall', 'Neil')
        0.6880377723094788
        >>> cmp.sim('aluminum', 'Catalan')
        0.5841297898633079
        >>> cmp.sim('ATCG', 'TAGC')
        0.640262668568961

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # b + c: tokens found in exactly one of the two multisets.
        mismatches = self._src_only_card() + self._tar_only_card()
        log_population = log1p(self._population_unique_card())
        return (log_population - log1p(mismatches)) / log_population
# Run this module's doctests when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_consonni_todeschini_iii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._consonni_todeschini_iii.
Consonni & Todeschini III similarity
"""
from math import log1p
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['ConsonniTodeschiniIII']
class ConsonniTodeschiniIII(_TokenDistance):
    r"""Consonni & Todeschini III similarity.

    For two sets X and Y and a population N, Consonni & Todeschini III
    similarity :cite:`Consonni:2012` is

        .. math::

            sim_{ConsonniTodeschiniIII}(X, Y) =
            \frac{log(1+|X \cap Y|)}{log(1+|N|)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{ConsonniTodeschiniIII} =
            \frac{log(1+a)}{log(1+n)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ConsonniTodeschiniIII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Consonni & Todeschini III similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Consonni & Todeschini III similarity

        Examples
        --------
        >>> cmp = ConsonniTodeschiniIII()
        >>> cmp.sim('cat', 'hat')
        0.1648161441769704
        >>> cmp.sim('Niall', 'Neil')
        0.1648161441769704
        >>> cmp.sim('aluminum', 'Catalan')
        0.10396755253417303
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)
        shared = self._intersection_card()
        population = self._population_unique_card()
        # Short-circuit to exactly 1.0 when the strings are identical and the
        # population contributes nothing outside their intersection.
        if src == tar and population <= shared:
            return 1.0
        return log1p(shared) / log1p(population)
# Run this module's doctests when it is executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_consonni_todeschini_iv.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._consonni_todeschini_iv.
Consonni & Todeschini IV similarity
"""
from math import log1p
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['ConsonniTodeschiniIV']
class ConsonniTodeschiniIV(_TokenDistance):
    r"""Consonni & Todeschini IV similarity.

    For two sets X and Y and a population N, Consonni & Todeschini IV
    similarity :cite:`Consonni:2012` is

        .. math::

            sim_{ConsonniTodeschiniIV}(X, Y) =
            \frac{log(1+|X \cap Y|)}{log(1+|X \cup Y|)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{ConsonniTodeschiniIV} =
            \frac{log(1+a)}{log(1+a+b+c)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ConsonniTodeschiniIV instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Consonni & Todeschini IV similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Consonni & Todeschini IV similarity

        Examples
        --------
        >>> cmp = ConsonniTodeschiniIV()
        >>> cmp.sim('cat', 'hat')
        0.5645750340535796
        >>> cmp.sim('Niall', 'Neil')
        0.4771212547196623
        >>> cmp.sim('aluminum', 'Catalan')
        0.244650542118226
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0

        """
        # Identical strings are maximally similar by definition.
        if src == tar:
            return 1.0
        self._tokenize(src, tar)
        shared = self._intersection_card()
        # |X ∪ Y| = a + b + c
        union = shared + self._src_only_card() + self._tar_only_card()
        return log1p(shared) / log1p(union)
# Run this module's doctests when it is executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_consonni_todeschini_v.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._consonni_todeschini_v.
Consonni & Todeschini V correlation
"""
from math import log1p
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['ConsonniTodeschiniV']
class ConsonniTodeschiniV(_TokenDistance):
    r"""Consonni & Todeschini V correlation.

    For two sets X and Y and a population N, Consonni & Todeschini V
    correlation :cite:`Consonni:2012` is

        .. math::

            corr_{ConsonniTodeschiniV}(X, Y) =
            \frac{log(1+|X \cap Y| \cdot |(N \setminus X) \setminus Y|)-
            log(1+|X \setminus Y| \cdot |Y \setminus X|)}
            {log(1+\frac{|N|^2}{4})}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{ConsonniTodeschiniV} =
            \frac{log(1+ad)-log(1+bc)}{log(1+\frac{n^2}{4})}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ConsonniTodeschiniV instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Consonni & Todeschini V correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Consonni & Todeschini V correlation

        Examples
        --------
        >>> cmp = ConsonniTodeschiniV()
        >>> cmp.corr('cat', 'hat')
        0.48072545510682463
        >>> cmp.corr('Niall', 'Neil')
        0.4003930264973547
        >>> cmp.corr('aluminum', 'Catalan')
        0.21794239483504532
        >>> cmp.corr('ATCG', 'TAGC')
        -0.2728145951429799

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)
        # a*d: agreement term; b*c: disagreement term.
        agreements = self._intersection_card() * self._total_complement_card()
        disagreements = self._src_only_card() * self._tar_only_card()
        n = self._population_unique_card()
        numerator = log1p(agreements) - log1p(disagreements)
        # Avoid a needless division (and 0/0 when n == 0).
        if numerator == 0.0:
            return 0.0
        return numerator / log1p(n * n / 4)

    def sim(self, src: str, tar: str) -> float:
        """Return the Consonni & Todeschini V similarity of two strings.

        The correlation, which ranges over [-1, 1], is rescaled to [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Consonni & Todeschini V similarity

        Examples
        --------
        >>> cmp = ConsonniTodeschiniV()
        >>> cmp.sim('cat', 'hat')
        0.7403627275534124
        >>> cmp.sim('Niall', 'Neil')
        0.7001965132486774
        >>> cmp.sim('aluminum', 'Catalan')
        0.6089711974175227
        >>> cmp.sim('ATCG', 'TAGC')
        0.36359270242851005

        .. versionadded:: 0.4.0

        """
        return (self.corr(src, tar) + 1.0) / 2.0
# Run this module's doctests when it is executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_cormode_lz.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._cormode_lz.
Cormode's LZ distance
"""
from typing import Any
from ._distance import _Distance
__all__ = ['CormodeLZ']
class CormodeLZ(_Distance):
    r"""Cormode's LZ distance.

    Cormode's LZ distance :cite:`Cormode:2000,Cormode:2003`

    .. versionadded:: 0.4.0
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize CormodeLZ instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0

        """
        super().__init__(**kwargs)

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Cormode's LZ distance of two strings.

        Greedily parses ``src`` into the minimal number of phrases, where each
        phrase is a substring of ``tar`` or of the already-parsed prefix of
        ``src``; the distance is one more than the number of phrase breaks.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cormode's LZ distance

        Examples
        --------
        >>> cmp = CormodeLZ()
        >>> cmp.dist_abs('cat', 'hat')
        2
        >>> cmp.dist_abs('Niall', 'Neil')
        5
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6
        >>> cmp.dist_abs('ATCG', 'TAGC')
        4

        .. versionadded:: 0.4.0

        """
        breaks = 0
        start = 0
        length = 1
        limit = len(src)
        while start + max(1, length) <= limit:
            piece = src[start : start + length]
            if piece in tar or piece in src[:start]:
                # Current phrase still matches somewhere; try to extend it.
                length += 1
            else:
                # Phrase can no longer be extended: record a break and
                # restart parsing just past the longest matched phrase.
                breaks += 1
                start += max(1, length - 1)
                length = 1
        return breaks + 1

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Cormode's LZ distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Cormode's LZ distance

        Examples
        --------
        >>> cmp = CormodeLZ()
        >>> cmp.dist('cat', 'hat')
        0.3333333333333333
        >>> cmp.dist('Niall', 'Neil')
        0.8
        >>> cmp.dist('aluminum', 'Catalan')
        0.625
        >>> cmp.dist('ATCG', 'TAGC')
        0.75

        .. versionadded:: 0.4.0

        """
        raw = self.dist_abs(src, tar) - 1
        # Early return also guards the division when src is empty.
        if not raw:
            return 0.0
        return raw / len(src)
# Run this module's doctests when it is executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_cosine.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._cosine.
Cosine similarity & distance
"""
from math import sqrt
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Cosine']
class Cosine(_TokenDistance):
    r"""Cosine similarity.

    For two sets X and Y, the cosine similarity, Otsuka-Ochiai coefficient, or
    Ochiai coefficient :cite:`Otsuka:1936,Ochiai:1957` is

        .. math::

            sim_{cosine}(X, Y) = \frac{|X \cap Y|}{\sqrt{|X| \cdot |Y|}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{cosine} =
            \frac{a}{\sqrt{(a+b)(a+c)}}

    Notes
    -----
    This measure is also known as the Fowlkes-Mallows index
    :cite:`Fowlkes:1983` for two classes and G-measure, the geometric mean of
    precision & recall.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Cosine instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        r"""Return the cosine similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Cosine similarity

        Examples
        --------
        >>> cmp = Cosine()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.3651483716701107
        >>> cmp.sim('aluminum', 'Catalan')
        0.11785113019775793
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Identical strings are maximally similar; an empty string shares
        # nothing with any other string.
        if src == tar:
            return 1.0
        if not (src and tar):
            return 0.0
        self._tokenize(src, tar)
        overlap = self._intersection_card()
        if not overlap:
            return 0.0
        return overlap / sqrt(self._src_card() * self._tar_card())
# Run this module's doctests when it is executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_covington.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._covington.
Covington distance
"""
from collections import namedtuple
from typing import Any, List, Optional, Tuple, cast
from unicodedata import normalize as unicode_normalize
from ._distance import _Distance
__all__ = ['Covington']
# One candidate alignment: the gap-padded source string, the gap-padded
# target string, and its total cost (lower is better).
Alignment = namedtuple('Alignment', ['src', 'tar', 'score'])
class Covington(_Distance):
    r"""Covington distance.

    Covington distance :cite:`Covington:1996`

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        weights: Tuple[int, int, int, int, int, int, int, int] = (
            0,
            5,
            10,
            30,
            60,
            100,
            40,
            50,
        ),
        **kwargs: Any
    ) -> None:
        """Initialize Covington instance.

        Parameters
        ----------
        weights : tuple
            An 8-tuple of costs for each kind of match or mismatch described in
            Covington's paper:

                - exact consonant or glide match
                - exact vowel match
                - vowel-vowel length mismatch or i and y or u and w
                - vowel-vowel mismatch
                - consonant-consonant mismatch
                - consonant-vowel mismatch
                - skip preceded by a skip
                - skip not preceded by a skip

            The weights used in Covington's first approximation can be used
            by supplying the tuple (0.0, 0.0, 0.5, 0.5, 0.5, 1.0, 0.5, 0.5)
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0

        """
        super(Covington, self).__init__(**kwargs)
        self._weights = weights
        # Character classes consulted by the alignment cost function.
        self._vowels = set('aeiou')
        self._consonants = set('bcdfghjklmnpqrstvxz')
        self._glides = set('wy')

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Covington distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Covington distance

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.dist_abs('cat', 'hat')
        65
        >>> cmp.dist_abs('Niall', 'Neil')
        115
        >>> cmp.dist_abs('aluminum', 'Catalan')
        325
        >>> cmp.dist_abs('ATCG', 'TAGC')
        200

        .. versionadded:: 0.4.0

        """
        # The best alignment's score ([-1] is the namedtuple's score field).
        return cast(float, self.alignments(src, tar, 1)[0][-1])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Covington distance of two strings.

        The absolute distance is divided by the worst-case cost of aligning
        two strings of these lengths (all consonant-vowel mismatches plus the
        cost of the necessary skips).

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Covington distance

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.dist('cat', 'hat')
        0.19117647058823528
        >>> cmp.dist('Niall', 'Neil')
        0.25555555555555554
        >>> cmp.dist('aluminum', 'Catalan')
        0.43333333333333335
        >>> cmp.dist('ATCG', 'TAGC')
        0.45454545454545453

        .. versionadded:: 0.4.0

        """
        normalizer = self._weights[5] * min(len(src), len(tar))
        if len(src) != len(tar):
            # First skip costs weights[7]; each further consecutive skip
            # costs weights[6].
            normalizer += self._weights[7]
            normalizer += self._weights[6] * abs(abs(len(src) - len(tar)) - 1)
        return self.dist_abs(src, tar) / normalizer

    def alignment(self, src: str, tar: str) -> Tuple[float, str, str]:
        """Return the top Covington alignment of two strings.

        This returns only the top alignment in a standard
        (score, source alignment, target alignment) tuple format.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        tuple(float, str, str)
            Covington score & alignment

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.alignment('hart', 'kordis')
        (240, 'hart--', 'kordis')
        >>> cmp.alignment('niy', 'genu')
        (170, '--niy', 'genu-')

        .. versionadded:: 0.4.1

        """
        alignment = self.alignments(src, tar, 1)[0]
        return alignment.score, alignment.src, alignment.tar

    def alignments(
        self, src: str, tar: str, top_n: Optional[int] = None
    ) -> List[Alignment]:
        """Return the Covington alignments of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        top_n : int
            The number of alignments to return. If None, all alignments will
            be returned. If 0, all alignments with the top score will be
            returned.

        Returns
        -------
        list
            Covington alignments, sorted from best (lowest) score to worst

        Examples
        --------
        >>> cmp = Covington()
        >>> cmp.alignments('hart', 'kordis', top_n=1)[0]
        Alignment(src='hart--', tar='kordis', score=240)
        >>> cmp.alignments('niy', 'genu', top_n=1)[0]
        Alignment(src='--niy', tar='genu-', score=170)

        .. versionadded:: 0.4.0

        """
        if not src:
            if not tar:
                return [Alignment('', '', 0)]
            # Align an empty source against tar: all-gap source, tar intact.
            # BUG FIX: previously passed ``src`` (the empty string) as the
            # target alignment, losing the target text entirely.
            return [
                Alignment(
                    '-' * len(tar),
                    tar,
                    self._weights[7] + self._weights[6] * (len(tar) - 1),
                )
            ]
        if not tar:
            return [
                Alignment(
                    src,
                    '-' * len(src),
                    self._weights[7] + self._weights[6] * (len(src) - 1),
                )
            ]

        # Completed alignments accumulated by the recursive search below.
        terminals = []

        def _cost(s: str, t: str) -> float:
            # Cost of the last aligned pair in partial alignments s & t.
            if s[-1:] == '-':
                if s[-2:] == '--':
                    return self._weights[6]
                else:
                    return self._weights[7]
            elif t[-1:] == '-':
                if t[-2:] == '--':
                    return self._weights[6]
                else:
                    return self._weights[7]
            s = unicode_normalize('NFC', s)[-1:]
            t = unicode_normalize('NFC', t)[-1:]
            if s == t:
                if s in self._consonants or s in self._glides:
                    return self._weights[0]
                else:
                    return self._weights[1]
            if ''.join(sorted([s, t])) in {'iy', 'uw'}:
                return self._weights[2]
            # Compare base characters, ignoring combining marks (e.g. length
            # or accent diacritics).
            sd = unicode_normalize('NFKD', s)
            td = unicode_normalize('NFKD', t)
            if sd[0] == td[0] and s in self._vowels:
                return self._weights[2]
            if sd[0] in self._vowels and td[0] in self._vowels:
                return self._weights[3]
            if sd[0] in self._consonants and td[0] in self._consonants:
                return self._weights[4]
            return self._weights[5]

        def _add_alignments(
            cost: float, src: str, tar: str, src_align: str, tar_align: str
        ) -> None:
            # Recursively extend the partial alignment by one pair: a match,
            # a gap in src, or a gap in tar (no gap directly after a gap on
            # the same side).
            cost += _cost(src_align, tar_align)

            if src and tar:
                _add_alignments(
                    cost,
                    src[1:],
                    tar[1:],
                    src_align + src[0],
                    tar_align + tar[0],
                )
            if tar and tar_align[-1] != '-':
                _add_alignments(
                    cost, src, tar[1:], src_align + '-', tar_align + tar[0]
                )
            if src and src_align[-1] != '-':
                _add_alignments(
                    cost, src[1:], tar, src_align + src[0], tar_align + '-'
                )
            if not src and not tar:
                terminals.append(Alignment(src_align, tar_align, cost))

            return

        # Seed the search with the three possible first pairs.
        _add_alignments(0, src, tar[1:], '-', tar[0])
        _add_alignments(0, src[1:], tar, src[0], '-')
        _add_alignments(0, src[1:], tar[1:], src[0], tar[0])

        # Stable sort: ties keep their generation order.
        terminals = sorted(terminals, key=lambda al: al.score)

        if top_n == 0:
            # Return every alignment tied for the best score.
            top_score = terminals[0].score
            top_n = 1
            while (
                top_n < len(terminals) and terminals[top_n].score == top_score
            ):
                top_n += 1
        if top_n is None:
            return terminals
        else:
            return terminals[:top_n]
# Run this module's doctests when it is executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_damerau_levenshtein.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._damerau_levenshtein.
Damerau-Levenshtein distance
"""
from sys import maxsize
from typing import Any, Callable, List, Tuple, cast
from numpy import int_ as np_int
from numpy import zeros as np_zeros
from ._distance import _Distance
__all__ = [
'DamerauLevenshtein',
]
class DamerauLevenshtein(_Distance):
    """Damerau-Levenshtein distance.

    This computes the Damerau-Levenshtein distance :cite:`Damerau:1964`.

    Damerau-Levenshtein code is based on Java code by Kevin L. Stern
    :cite:`Stern:2014`, under the MIT license:
    https://github.com/KevinStern/software-and-algorithms/blob/master/src/main/java/blogspot/software_and_algorithms/stern_library/string/DamerauLevenshteinAlgorithm.java
    """

    def __init__(
        self,
        cost: Tuple[float, float, float, float] = (1, 1, 1, 1),
        normalizer: Callable[[List[float]], float] = max,
        **kwargs: Any
    ):
        """Initialize DamerauLevenshtein instance.

        Parameters
        ----------
        cost : tuple
            A 4-tuple representing the cost of the four possible edits:
            inserts, deletes, substitutions, and transpositions, respectively
            (by default: (1, 1, 1, 1))
        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0

        """
        super(DamerauLevenshtein, self).__init__(**kwargs)
        # (insert, delete, substitute, transpose) costs.
        self._cost = cost
        # Reduces [len(src)*del_cost, len(tar)*ins_cost] to dist()'s divisor.
        self._normalizer = normalizer

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Damerau-Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int (may return a float if cost has float values)
            The Damerau-Levenshtein distance between src & tar

        Raises
        ------
        ValueError
            Unsupported cost assignment; the cost of two transpositions must
            not be less than the cost of an insert plus a delete.

        Examples
        --------
        >>> cmp = DamerauLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        3
        >>> cmp.dist_abs('aluminum', 'Catalan')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        ins_cost, del_cost, sub_cost, trans_cost = self._cost

        # Trivial cases: identical strings and empty-string comparisons.
        if src == tar:
            return 0
        if not src:
            return len(tar) * ins_cost
        if not tar:
            return len(src) * del_cost

        # The algorithm assumes a transposition is never cheaper to simulate
        # with a delete+insert pair; otherwise results would be wrong.
        if 2 * trans_cost < ins_cost + del_cost:
            raise ValueError(
                'Unsupported cost assignment; the cost of two transpositions '
                + 'must not be less than the cost of an insert plus a delete.'
            )

        # d_mat[i, j] holds the distance between src[: i + 1] and
        # tar[: j + 1] (note: 0-based, unlike the usual (m+1)x(n+1) table).
        d_mat = np_zeros((len(src), len(tar)), dtype=np_int)

        if src[0] != tar[0]:
            d_mat[0, 0] = min(sub_cost, ins_cost + del_cost)

        # Maps each character of src seen so far to its latest index; used
        # to locate transposition candidates.
        src_index_by_character = {src[0]: 0}

        # Initialize the first column (prefixes of src vs tar[0]).
        for i in range(1, len(src)):
            del_distance = d_mat[i - 1, 0] + del_cost
            ins_distance = (i + 1) * del_cost + ins_cost
            match_distance = i * del_cost + (
                0 if src[i] == tar[0] else sub_cost
            )
            d_mat[i, 0] = min(del_distance, ins_distance, match_distance)

        # Initialize the first row (src[0] vs prefixes of tar).
        for j in range(1, len(tar)):
            del_distance = (j + 1) * ins_cost + del_cost
            ins_distance = d_mat[0, j - 1] + ins_cost
            match_distance = j * ins_cost + (
                0 if src[0] == tar[j] else sub_cost
            )
            d_mat[0, j] = min(del_distance, ins_distance, match_distance)

        # Fill in the remainder of the table.
        for i in range(1, len(src)):
            # Latest column where src[i] matched a character of tar.
            max_src_letter_match_index = 0 if src[i] == tar[0] else -1
            for j in range(1, len(tar)):
                # Latest row where tar[j] appeared in src (or -1 if never).
                candidate_swap_index = (
                    -1
                    if tar[j] not in src_index_by_character
                    else src_index_by_character[tar[j]]
                )
                j_swap = max_src_letter_match_index
                del_distance = d_mat[i - 1, j] + del_cost
                ins_distance = d_mat[i, j - 1] + ins_cost
                match_distance = d_mat[i - 1, j - 1]
                if src[i] != tar[j]:
                    match_distance += sub_cost
                else:
                    max_src_letter_match_index = j

                if candidate_swap_index != -1 and j_swap != -1:
                    # A transposition is possible: cost of everything before
                    # the swapped pair, plus the skipped deletes/inserts,
                    # plus the transposition itself.
                    i_swap = candidate_swap_index

                    if i_swap == 0 and j_swap == 0:
                        pre_swap_cost = 0
                    else:
                        pre_swap_cost = d_mat[
                            max(0, i_swap - 1), max(0, j_swap - 1)
                        ]
                    swap_distance = (
                        pre_swap_cost
                        + (i - i_swap - 1) * del_cost
                        + (j - j_swap - 1) * ins_cost
                        + trans_cost
                    )
                else:
                    # No transposition candidate; make it unselectable.
                    swap_distance = maxsize

                d_mat[i, j] = min(
                    del_distance, ins_distance, match_distance, swap_distance
                )
            src_index_by_character[src[i]] = i

        return cast(float, d_mat[len(src) - 1, len(tar) - 1])

    def dist(self, src: str, tar: str) -> float:
        """Return the Damerau-Levenshtein similarity of two strings.

        Damerau-Levenshtein distance normalized to the interval [0, 1].

        The Damerau-Levenshtein distance is normalized by dividing the
        Damerau-Levenshtein distance by the greater of
        the number of characters in src times the cost of a delete and
        the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Damerau-Levenshtein distance

        Examples
        --------
        >>> cmp = DamerauLevenshtein()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.6
        >>> cmp.dist('aluminum', 'Catalan')
        0.875
        >>> cmp.dist('ATCG', 'TAGC')
        0.5

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Early return also avoids dividing 0/0 when both strings are empty.
        if src == tar:
            return 0.0
        ins_cost, del_cost = self._cost[:2]
        return self.dist_abs(src, tar) / (
            self._normalizer([len(src) * del_cost, len(tar) * ins_cost])
        )
# Run this module's doctests when it is executed directly.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_dennis.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._dennis.
Dennis similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Dennis']
class Dennis(_TokenDistance):
r"""Dennis similarity.
For two sets X and Y and a population N, Dennis similarity
:cite:`Dennis:1965` is
.. math::
sim_{Dennis}(X, Y) =
\frac{|X \cap Y| - \frac{|X| \cdot |Y|}{|N|}}
{\sqrt{\frac{|X|\cdot|Y|}{|N|}}}
This is the fourth of Dennis' association measures, and that which she
claims is the best of the four.
In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
this is
.. math::
sim_{Dennis} =
\frac{a-\frac{(a+b)(a+c)}{n}}{\sqrt{\frac{(a+b)(a+c)}{n}}}
.. versionadded:: 0.4.0
"""
def __init__(
    self,
    alphabet: Optional[
        Union[TCounter[str], Sequence[str], Set[str], int]
    ] = None,
    tokenizer: Optional[_Tokenizer] = None,
    intersection_type: str = 'crisp',
    **kwargs: Any
) -> None:
    """Initialize Dennis instance.

    Parameters
    ----------
    alphabet : Counter, collection, int, or None
        The alphabet of possible tokens.
        See :ref:`alphabet <alphabet>` description in
        :py:class:`_TokenDistance` for details.
    tokenizer : _Tokenizer
        A tokenizer instance from the :py:mod:`abydos.tokenizer` package
    intersection_type : str
        Specifies the intersection type, and set type as a result:
        See :ref:`intersection_type <intersection_type>` description in
        :py:class:`_TokenDistance` for details.
    **kwargs
        Arbitrary keyword arguments

    Other Parameters
    ----------------
    qval : int
        The length of each q-gram. Using this parameter and tokenizer=None
        will cause the instance to use the QGram tokenizer with this
        q value.
    metric : _Distance
        A string distance measure class for use in the ``soft`` and
        ``fuzzy`` variants.
    threshold : float
        A threshold value, similarities above which are counted as
        members of the intersection for the ``fuzzy`` variant.

    .. versionadded:: 0.4.0

    """
    super().__init__(
        alphabet=alphabet,
        tokenizer=tokenizer,
        intersection_type=intersection_type,
        **kwargs
    )
def sim_score(self, src: str, tar: str) -> float:
"""Return the Dennis similarity of two strings.
Parameters
----------
src : str
Source string (or QGrams/Counter objects) for comparison
tar : str
Target string (or QGrams/Counter objects) for comparison
Returns
-------
float
Dennis similarity
Examples
--------
>>> cmp = Dennis()
>>> cmp.sim_score('cat', 'hat')
13.857142857142858
>>> cmp.sim_score('Niall', 'Neil')
10.028539207654113
>>> cmp.sim_score('aluminum', 'Catalan')
2.9990827802847835
>>> cmp.sim_score('ATCG', 'TAGC')
-0.17857142857142858
.. versionadded:: 0.4.0
"""
if not src and not tar:
return 0.0
self._tokenize(src, tar)
a = self._intersection_card()
abacn = (
self._src_card()
* self._tar_card()
/ self._population_unique_card()
)
num = a - abacn
if num == 0:
return 0.0
return num / abacn ** 0.5
def corr(self, src: str, tar: str) -> float:
"""Return the Dennis correlation of two strings.
Parameters
----------
src : str
Source string (or QGrams/Counter objects) for comparison
tar : str
Target string (or QGrams/Counter objects) for comparison
Returns
-------
float
Dennis correlation
Examples
--------
>>> cmp = Dennis()
>>> cmp.corr('cat', 'hat')
0.494897959183673
>>> cmp.corr('Niall', 'Neil')
0.358162114559075
>>> cmp.corr('aluminum', 'Catalan')
0.107041854561785
>>> cmp.corr('ATCG', 'TAGC')
-0.006377551020408
.. versionadded:: 0.4.0
"""
score = self.sim_score(src, tar)
if score == 0.0:
return 0.0
return round(score / self._population_unique_card() ** 0.5, 15)
def sim(self, src: str, tar: str) -> float:
"""Return the normalized Dennis similarity of two strings.
Parameters
----------
src : str
Source string (or QGrams/Counter objects) for comparison
tar : str
Target string (or QGrams/Counter objects) for comparison
Returns
-------
float
Normalized Dennis similarity
Examples
--------
>>> cmp = Dennis()
>>> cmp.sim('cat', 'hat')
0.6632653061224487
>>> cmp.sim('Niall', 'Neil')
0.5721080763727167
>>> cmp.sim('aluminum', 'Catalan')
0.4046945697078567
>>> cmp.sim('ATCG', 'TAGC')
0.32908163265306134
.. versionadded:: 0.4.0
"""
return (0.5 + self.corr(src, tar)) / 1.5
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_dice.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._dice.
Sørensen–Dice coefficient & distance
"""
from typing import Any, Optional
from ._tversky import Tversky
from ..tokenizer import _Tokenizer
__all__ = ['Dice']
class Dice(Tversky):
    r"""Sørensen–Dice coefficient.

    For two sets X and Y, the Sørensen–Dice coefficient
    :cite:`Dice:1945,Sorensen:1948,Czekanowski:1909,Motyka:1950` is

        .. math::

            sim_{Dice}(X, Y) = \frac{2 \cdot |X \cap Y|}{|X| + |Y|}

    This is the complement of Bray & Curtis dissimilarity :cite:`Bray:1957`,
    also known as the Lance & Williams dissimilarity :cite:`Lance:1967`.

    This is identical to the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 0.5`.

    In the Ruby text library this is identified as White similarity, after
    :cite:`White:Nd`.

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{Dice} =
            \frac{2a}{2a+b+c}

    Notes
    -----
    In terms of a confusion matrix, this is equivalent to :math:`F_1` score
    :py:meth:`ConfusionTable.f1_score`.

    The multiset variant is termed Gleason similarity :cite:`Gleason:1920`.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dice instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        # Dice is exactly the symmetric Tversky index with
        # alpha = beta = 0.5 and no bias term.
        super().__init__(
            alpha=0.5,
            beta=0.5,
            bias=None,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Sørensen–Dice coefficient of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Sørensen–Dice similarity

        Examples
        --------
        >>> cmp = Dice()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.36363636363636365
        >>> cmp.sim('aluminum', 'Catalan')
        0.11764705882352941
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        """
        # Delegate to the Tversky computation with the parameters fixed above.
        return super().sim(src, tar)
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_dice_asymmetric_i.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._dice_asymmetric_i.
Dice's Asymmetric I similarity
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['DiceAsymmetricI']
class DiceAsymmetricI(_TokenDistance):
    r"""Dice's Asymmetric I similarity.

    For two sets X and Y and a population N, Dice's Asymmetric I similarity
    :cite:`Dice:1945` is

        .. math::

            sim_{DiceAsymmetricI}(X, Y) =
            \frac{|X \cap Y|}{|X|}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{DiceAsymmetricI} =
            \frac{a}{a+b}

    Notes
    -----
    In terms of a confusion matrix, this is equivalent to precision or
    positive predictive value :py:meth:`ConfusionTable.precision`.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize DiceAsymmetricI instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super(DiceAsymmetricI, self).__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Dice's Asymmetric I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dice's Asymmetric I similarity

        Examples
        --------
        >>> cmp = DiceAsymmetricI()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.3333333333333333
        >>> cmp.sim('aluminum', 'Catalan')
        0.1111111111111111
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # Identical strings are maximally similar by definition.
        if src == tar:
            return 1.0
        self._tokenize(src, tar)
        shared = self._intersection_card()
        # No overlap (which also covers an empty source) yields zero and
        # avoids dividing by a zero source cardinality.
        if not shared:
            return 0.0
        return shared / self._src_card()
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_dice_asymmetric_ii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._dice_asymmetric_ii.
Dice's Asymmetric II similarity
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['DiceAsymmetricII']
class DiceAsymmetricII(_TokenDistance):
    r"""Dice's Asymmetric II similarity.

    For two sets X and Y, Dice's Asymmetric II similarity
    :cite:`Dice:1945` is

        .. math::

            sim_{DiceAsymmetricII}(X, Y) =
            \frac{|X \cap Y|}{|Y|}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            sim_{DiceAsymmetricII} =
            \frac{a}{a+c}

    Notes
    -----
    In terms of a confusion matrix, this is equivalent to recall, sensitivity,
    or true positive rate :py:meth:`ConfusionTable.recall`.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize DiceAsymmetricII instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super(DiceAsymmetricII, self).__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Dice's Asymmetric II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dice's Asymmetric II similarity

        Examples
        --------
        >>> cmp = DiceAsymmetricII()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.4
        >>> cmp.sim('aluminum', 'Catalan')
        0.125
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # Identical strings are maximally similar by definition.
        if src == tar:
            return 1.0
        self._tokenize(src, tar)
        shared = self._intersection_card()
        # No overlap (which also covers an empty target) yields zero and
        # avoids dividing by a zero target cardinality.
        if not shared:
            return 0.0
        return shared / self._tar_card()
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_digby.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._digby.
Digby correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Digby']
class Digby(_TokenDistance):
    r"""Digby correlation.

    For two sets X and Y and a population N, Digby's approximation of the
    tetrachoric correlation coefficient
    :cite:`Digby:1983` is

        .. math::

            corr_{Digby}(X, Y) =
            \frac{(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4}-
            (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}}
            {(|X \cap Y| \cdot |(N \setminus X) \setminus Y|)^\frac{3}{4} +
            (|X \setminus Y| \cdot |Y \setminus X|)^\frac{3}{4}}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            corr_{Digby} =
            \frac{ad^\frac{3}{4}-bc^\frac{3}{4}}{ad^\frac{3}{4}+bc^\frac{3}{4}}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Digby instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super(Digby, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Digby correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Digby correlation

        Examples
        --------
        >>> cmp = Digby()
        >>> cmp.corr('cat', 'hat')
        0.9774244829419212
        >>> cmp.corr('Niall', 'Neil')
        0.9491281473458171
        >>> cmp.corr('aluminum', 'Catalan')
        0.7541039303781305
        >>> cmp.corr('ATCG', 'TAGC')
        -1.0

        .. versionadded:: 0.4.0
        """
        # Short-circuit the extremes: identical strings correlate perfectly,
        # and an empty string against a non-empty one anti-correlates.
        if src == tar:
            return 1.0
        if not src or not tar:
            return -1.0
        self._tokenize(src, tar)
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        # (ad)^(3/4) measures agreement; (bc)^(3/4) measures disagreement.
        concordant = (a * d) ** 0.75
        discordant = (b * c) ** 0.75
        if concordant == discordant:
            return 0.0
        return (concordant - discordant) / (concordant + discordant)

    def sim(self, src: str, tar: str) -> float:
        """Return the Digby similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Digby similarity

        Examples
        --------
        >>> cmp = Digby()
        >>> cmp.sim('cat', 'hat')
        0.9887122414709606
        >>> cmp.sim('Niall', 'Neil')
        0.9745640736729085
        >>> cmp.sim('aluminum', 'Catalan')
        0.8770519651890653
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # Map the correlation range [-1, 1] onto [0, 1].
        return (self.corr(src, tar) + 1) / 2
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_discounted_levenshtein.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._discounted_levenshtein.
Discounted Levenshtein edit distance
"""
from math import log
from typing import Any, Callable, List, Tuple, Union, cast
import numpy as np
from ._levenshtein import Levenshtein
__all__ = ['DiscountedLevenshtein']
class DiscountedLevenshtein(Levenshtein):
    """Discounted Levenshtein distance.

    This is a variant of Levenshtein distance for which edits later in a string
    have discounted cost, on the theory that earlier edits are less likely
    than later ones.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self,
        mode: str = 'lev',
        normalizer: Callable[[List[float]], float] = max,
        discount_from: Union[int, str] = 1,
        discount_func: Union[str, Callable[[float], float]] = 'log',
        vowels: str = 'aeiou',
        **kwargs: Any
    ) -> None:
        """Initialize DiscountedLevenshtein instance.

        Parameters
        ----------
        mode : str
            Specifies a mode for computing the discounted Levenshtein distance:

                - ``lev`` (default) computes the ordinary Levenshtein distance,
                  in which edits may include inserts, deletes, and
                  substitutions
                - ``osa`` computes the Optimal String Alignment distance, in
                  which edits may include inserts, deletes, substitutions, and
                  transpositions but substrings may only be edited once

        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        discount_from : int or str
            If an int is supplied, this is the first character whose edit cost
            will be discounted. If the str ``coda`` is supplied, discounting
            will start with the first non-vowel after the first vowel (the
            first syllable coda).
        discount_func : str or function
            The two supported str arguments are ``log``, for a logarithmic
            discount function, and ``exp`` for a exponential discount function.
            See notes below for information on how to supply your own
            discount function.
        vowels : str
            These are the letters to consider as vowels when discount_from is
            set to ``coda``. It defaults to the English vowels 'aeiou', but
            it would be reasonable to localize this to other languages or to
            add orthographic semi-vowels like 'y', 'w', and even 'h'.
        **kwargs
            Arbitrary keyword arguments

        Notes
        -----
        This class is highly experimental and will need additional tuning.

        The discount function can be passed as a callable function. It should
        expect an integer as its only argument and return a float, ideally
        less than or equal to 1.0. The argument represents the degree of
        discounting to apply.

        .. versionadded:: 0.4.1
        """
        super(DiscountedLevenshtein, self).__init__(**kwargs)
        self._mode = mode
        self._normalizer = normalizer
        self._discount_from = discount_from
        self._vowels = set(vowels.lower())
        if callable(discount_func):
            self._discount_func = discount_func
        elif discount_func == 'exp':
            self._discount_func = self._exp_discount
        else:
            # Any other value (including the default 'log') selects the
            # logarithmic discount.
            self._discount_func = self._log_discount

    @staticmethod
    def _log_discount(discounts: float) -> float:
        """Return a logarithmically decaying discount factor in (0, 1]."""
        return 1 / (log(1 + discounts / 5) + 1)

    @staticmethod
    def _exp_discount(discounts: float) -> float:
        """Return a power-law decaying discount factor in (0, 1]."""
        return 1 / (discounts + 1) ** 0.2

    def _alignment_matrix(
        self, src: str, tar: str, backtrace: bool = True
    ) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
        """Return the Levenshtein alignment matrix.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        backtrace : bool
            Return the backtrace matrix as well

        Returns
        -------
        numpy.ndarray or tuple(numpy.ndarray, numpy.ndarray)
            The alignment matrix and (optionally) the backtrace matrix

        .. versionadded:: 0.4.1
        """
        src_len = len(src)
        tar_len = len(tar)
        if self._discount_from == 'coda':
            # Find, for each string, the first non-vowel after the first
            # vowel (the first syllable coda); discounting starts there.
            discount_from = [0, 0]
            src_voc = src.lower()
            for i in range(len(src_voc)):
                if src_voc[i] in self._vowels:
                    discount_from[0] = i
                    break
            for i in range(discount_from[0], len(src_voc)):
                if src_voc[i] not in self._vowels:
                    discount_from[0] = i
                    break
            else:
                # No coda found: start discounting just past the last vowel.
                discount_from[0] += 1
            tar_voc = tar.lower()
            for i in range(len(tar_voc)):
                if tar_voc[i] in self._vowels:
                    discount_from[1] = i
                    break
            for i in range(discount_from[1], len(tar_voc)):
                if tar_voc[i] not in self._vowels:
                    discount_from[1] = i
                    break
            else:
                discount_from[1] += 1
        elif isinstance(self._discount_from, int):
            discount_from = [self._discount_from, self._discount_from]
        else:
            # Unrecognized string values fall back to the default of 1.
            discount_from = [1, 1]
        # np.float64 rather than np.float_: the latter alias was removed in
        # NumPy 2.0 (they are the same dtype).
        d_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.float64)
        if backtrace:
            trace_mat = np.zeros((src_len + 1, tar_len + 1), dtype=np.int8)
        # First column/row: cumulative discounted delete/insert costs.
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + self._discount_func(
                max(0, i - discount_from[0])
            )
            if backtrace:
                trace_mat[i, 0] = 1
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + self._discount_func(
                max(0, j - discount_from[1])
            )
            if backtrace:
                trace_mat[0, j] = 0
        for i in range(src_len):
            # Hoist the source-position discount out of the inner loop.
            i_extend = self._discount_func(max(0, i - discount_from[0]))
            for j in range(tar_len):
                traces = ((i + 1, j), (i, j + 1), (i, j))
                # Edit cost is the smaller of the two positional discounts.
                cost = min(
                    i_extend, self._discount_func(max(0, j - discount_from[1]))
                )
                opts = (
                    d_mat[traces[0]] + cost,  # ins
                    d_mat[traces[1]] + cost,  # del
                    d_mat[traces[2]]
                    + (cost if src[i] != tar[j] else 0),  # sub/==
                )
                d_mat[i + 1, j + 1] = min(opts)
                if backtrace:
                    trace_mat[i + 1, j + 1] = int(np.argmin(opts))
                if self._mode == 'osa':
                    if (
                        i + 1 > 1
                        and j + 1 > 1
                        and src[i] == tar[j - 1]
                        and src[i - 1] == tar[j]
                    ):
                        # transposition
                        d_mat[i + 1, j + 1] = min(
                            d_mat[i + 1, j + 1], d_mat[i - 1, j - 1] + cost
                        )
                        if backtrace:
                            trace_mat[i + 1, j + 1] = 2
        if backtrace:
            return d_mat, trace_mat
        return d_mat

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Levenshtein distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float (may return a float if cost has float values)
            The Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = DiscountedLevenshtein()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        2.526064024369237
        >>> cmp.dist_abs('aluminum', 'Catalan')
        5.053867269967515
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.594032108779918
        >>> cmp = DiscountedLevenshtein(mode='osa')
        >>> cmp.dist_abs('ATCG', 'TAGC')
        1.7482385137517997
        >>> cmp.dist_abs('ACTG', 'TAGC')
        3.342270622531718

        .. versionadded:: 0.4.1
        """
        src_len = len(src)
        tar_len = len(tar)
        if src == tar:
            return 0.0
        if isinstance(self._discount_from, int):
            discount_from = self._discount_from
        else:
            discount_from = 1
        # Against an empty string, the distance is the sum of discounted
        # insert (or delete) costs for every character.
        if not src:
            return sum(
                self._discount_func(max(0, pos - discount_from))
                for pos in range(tar_len)
            )
        if not tar:
            return sum(
                self._discount_func(max(0, pos - discount_from))
                for pos in range(src_len)
            )
        d_mat = cast(
            np.ndarray, self._alignment_matrix(src, tar, backtrace=False)
        )
        # Collapse whole-valued results to int (matches doctest output above).
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return cast(float, d_mat[src_len, tar_len])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Levenshtein distance between two strings.

        The Levenshtein distance is normalized by dividing the Levenshtein
        distance (calculated by any of the three supported methods) by the
        greater of the number of characters in src times the cost of a delete
        and the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Levenshtein distance between src & tar

        Examples
        --------
        >>> cmp = DiscountedLevenshtein()
        >>> cmp.dist('cat', 'hat')
        0.3513958291799864
        >>> cmp.dist('Niall', 'Neil')
        0.5909885886270658
        >>> cmp.dist('aluminum', 'Catalan')
        0.8348163322045603
        >>> cmp.dist('ATCG', 'TAGC')
        0.7217609721523955

        .. versionadded:: 0.4.1
        """
        if src == tar:
            # Return a float for consistency with the declared return type
            # (previously returned the int 0).
            return 0.0
        if isinstance(self._discount_from, int):
            discount_from = self._discount_from
        else:
            discount_from = 1
        src_len = len(src)
        tar_len = len(tar)
        # Normalize by the worst case: editing every character of the longer
        # (by default, via max) string at its discounted cost.
        normalize_term = self._normalizer(
            [
                sum(
                    self._discount_func(max(0, pos - discount_from))
                    for pos in range(src_len)
                ),
                sum(
                    self._discount_func(max(0, pos - discount_from))
                    for pos in range(tar_len)
                ),
            ]
        )
        return self.dist_abs(src, tar) / normalize_term
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_dispersion.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._dispersion.
Dispersion correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Dispersion']
class Dispersion(_TokenDistance):
    r"""Dispersion correlation.

    For two sets X and Y and a population N, the dispersion
    correlation :cite:`IBM:2017` is

        .. math::

            corr_{dispersion}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {|N|^2}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    this is

        .. math::

            corr_{dispersion} =
            \frac{ad-bc}{n^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dispersion instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super(Dispersion, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Dispersion correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion correlation

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.corr('cat', 'hat')
        0.002524989587671803
        >>> cmp.corr('Niall', 'Neil')
        0.002502212619741774
        >>> cmp.corr('aluminum', 'Catalan')
        0.0011570449105440383
        >>> cmp.corr('ATCG', 'TAGC')
        -4.06731570179092e-05

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()
        # Determinant of the 2x2 confusion table: ad - bc.
        determinant = a * d - b * c
        if not determinant:
            return 0.0
        return determinant / n ** 2

    def sim(self, src: str, tar: str) -> float:
        """Return the Dispersion similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dispersion similarity

        Examples
        --------
        >>> cmp = Dispersion()
        >>> cmp.sim('cat', 'hat')
        0.5012624947938359
        >>> cmp.sim('Niall', 'Neil')
        0.5012511063098709
        >>> cmp.sim('aluminum', 'Catalan')
        0.500578522455272
        >>> cmp.sim('ATCG', 'TAGC')
        0.499979663421491

        .. versionadded:: 0.4.0
        """
        # Map the correlation range [-1, 1] onto [0, 1].
        return (self.corr(src, tar) + 1) / 2
if __name__ == '__main__':
    # When run as a script, execute this module's embedded doctests.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_distance.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._distance.
The distance._distance module implements abstract class _Distance.
"""
from typing import Any, Dict
__all__ = ['_Distance']


class _Distance:
    """Abstract Distance class.

    Base class for every distance/similarity measure. It provides the
    ``sim``/``dist`` complement relationship and a common keyword-parameter
    store; subclasses override at least one of ``sim``/``dist``.

    .. versionadded:: 0.3.6
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize _Distance instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0
        """
        # Per-instance parameter store, populated from keyword arguments.
        self.params = {}  # type: Dict[str, Any]
        self.set_params(**kwargs)

    def set_params(self, **kwargs: Any) -> None:
        """Store params in the params dict.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0
        """
        self.params.update(kwargs)

    def sim(self, src: str, tar: str) -> float:
        """Return similarity.

        Defined as the complement of ``dist``; subclasses override one of
        the pair.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Similarity

        .. versionadded:: 0.3.6
        """
        return 1.0 - self.dist(src, tar)

    def dist(self, src: str, tar: str) -> float:
        """Return distance.

        Defined as the complement of ``sim``; subclasses override one of
        the pair.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Distance

        .. versionadded:: 0.3.6
        """
        return 1.0 - self.sim(src, tar)

    def dist_abs(self, src: str, tar: str) -> float:
        """Return absolute distance.

        By default this is simply the normalized distance; subclasses with a
        meaningful unnormalized measure override it.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            Absolute distance

        .. versionadded:: 0.3.6
        """
        return self.dist(src, tar)
if __name__ == '__main__':
    # Run this module's doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_doolittle.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._doolittle.
Doolittle similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Doolittle']
class Doolittle(_TokenDistance):
    r"""Doolittle similarity.

    For two sets X and Y and a population N, the Doolittle
    similarity :cite:`Doolittle:1884` is

        .. math::

            sim_{Doolittle}(X, Y) =
            \frac{(|X \cap Y| \cdot |N| - |X| \cdot |Y|)^2}
            {|X| \cdot |Y| \cdot |N \setminus Y| \cdot |N \setminus X|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Doolittle} =
            \frac{(an-(a+b)(a+c))^2}{(a+b)(a+c)(b+d)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Doolittle instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # All construction is handled by the _TokenDistance base class.
        super(Doolittle, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Doolittle similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Doolittle similarity

        Examples
        --------
        >>> cmp = Doolittle()
        >>> cmp.sim('cat', 'hat')
        0.24744247205785666
        >>> cmp.sim('Niall', 'Neil')
        0.13009912077202224
        >>> cmp.sim('aluminum', 'Catalan')
        0.011710186806836291
        >>> cmp.sim('ATCG', 'TAGC')
        4.1196952743799446e-05


        .. versionadded:: 0.4.0

        """
        # Identical inputs are maximally similar by definition.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Confusion-table cells: a+b+c+d = n.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # (an - (a+b)(a+c)) simplifies algebraically to (ad - bc).
        num_sq = (a * d - b * c) ** 2
        if num_sq == 0.0:
            # Also covers any case where a marginal (and hence the
            # denominator) would vanish.
            return 0.0

        denom = (a + b) * (a + c) * (b + d) * (c + d)
        return num_sq / denom
if __name__ == '__main__':
    # Run this module's doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_dunning.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._dunning.
Dunning similarity
"""
from math import log
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Dunning']
class Dunning(_TokenDistance):
    r"""Dunning similarity.

    For two sets X and Y and a population N, Dunning log-likelihood
    :cite:`Dunning:1993`, following :cite:`Church:1991`, is

        .. math::

            sim_{Dunning}(X, Y) = \lambda =
            |X \cap Y| \cdot log_2(|X \cap Y|) +\\
            |X \setminus Y| \cdot log_2(|X \setminus Y|) +
            |Y \setminus X| \cdot log_2(|Y \setminus X|) +\\
            |(N \setminus X) \setminus Y| \cdot
            log_2(|(N \setminus X) \setminus Y|) -\\
            (|X| \cdot log_2(|X|) +
            |Y| \cdot log_2(|Y|) +\\
            |N \setminus Y| \cdot log_2(|N \setminus Y|) +
            |N \setminus X| \cdot log_2(|N \setminus X|))

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Dunning} = \lambda =
            a \cdot log_2(a) +\\
            b \cdot log_2(b) + c \cdot log_2(c) +
            d \cdot log_2(d) - \\
            ((a+b) \cdot log_2(a+b) + (a+c) \cdot log_2(a+c) +\\
            (b+d) \cdot log_2(b+d) + (c+d) log_2(c+d))

    Notes
    -----
    To avoid math domain errors, any term whose value is zero is skipped
    when summing, treating :math:`0 \cdot log_2(0)` as 0 (its limit).

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Dunning instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(Dunning, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Dunning similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Dunning similarity (unnormalized log-likelihood score)

        Examples
        --------
        (These examples demonstrate the normalized ``sim`` method.)

        >>> cmp = Dunning()
        >>> cmp.sim('cat', 'hat')
        0.33462839191969423
        >>> cmp.sim('Niall', 'Neil')
        0.19229445539929793
        >>> cmp.sim('aluminum', 'Catalan')
        0.03220862737070572
        >>> cmp.sim('ATCG', 'TAGC')
        0.0010606026735052122


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Confusion-table cells: a+b+c+d = n.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = a + b + c + d

        # a should not equal n, because 0 will result
        # As a workaround, we set d to 1 and add one to n.
        if a == n:
            d = 1
            n += 1

        # Convert cell counts to probabilities.
        a /= n
        b /= n
        c /= n
        d /= n

        score = 0.0
        # Cell terms: x * log(x), skipping zero-valued cells
        # (0 * log(0) is treated as its limit, 0).
        for i in (a, b, c, d):
            if i > 0:
                score += i * log(i)
        # Marginal terms: (a+b), (a+c), (b+d), (c+d), likewise guarded.
        for i in (a, d):
            for j in (b, c):
                ij = i + j
                if ij > 0:
                    score -= ij * log(ij)
        score *= 2
        # Convert from natural log to log base 2.
        score /= log(2)

        # Rounding guards against tiny negative floating-point residue.
        return abs(round(score, 15))

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Dunning similarity of two strings.

        The raw score is normalized by the larger of the two
        self-similarities, yielding a value in [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Dunning similarity

        Examples
        --------
        >>> cmp = Dunning()
        >>> cmp.sim('cat', 'hat')
        0.33462839191969423
        >>> cmp.sim('Niall', 'Neil')
        0.19229445539929793
        >>> cmp.sim('aluminum', 'Catalan')
        0.03220862737070572
        >>> cmp.sim('ATCG', 'TAGC')
        0.0010606026735052122


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        score = self.sim_score(src, tar)
        if not score:
            return 0.0
        norm = max(self.sim_score(src, src), self.sim_score(tar, tar))
        return score / norm
if __name__ == '__main__':
    # Run this module's doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_editex.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._editex.
editex
"""
from sys import float_info
from typing import Any, Tuple, cast
from unicodedata import normalize as unicode_normalize
from numpy import float_ as np_float
from numpy import zeros as np_zeros
from ._distance import _Distance
__all__ = ['Editex']
class Editex(_Distance):
    """Editex.

    As described on pages 3 & 4 of :cite:`Zobel:1996`.

    The local variant is based on :cite:`Ring:2009`.

    .. versionadded:: 0.3.6
    .. versionchanged:: 0.4.0
        Added taper option
    """

    # Phonetically-similar letter groups: substituting within a group costs
    # ``group_cost``; any other unequal pair costs ``mismatch_cost``.
    _letter_groups = (
        frozenset('AEIOUY'),
        frozenset('BP'),
        frozenset('CKQ'),
        frozenset('DT'),
        frozenset('LR'),
        frozenset('MN'),
        frozenset('GJ'),
        frozenset('FPV'),
        frozenset('SXZ'),
    )

    # NOTE: 'H' and 'W' are deliberately absent from this set; they receive
    # special handling in d_cost (deletion at group cost) below.
    _all_letters = frozenset('ABCDEFGIJKLMNOPQRSTUVXYZ')

    def __init__(
        self,
        cost: Tuple[int, int, int] = (0, 1, 2),
        local: bool = False,
        taper: bool = False,
        **kwargs: Any
    ) -> None:
        """Initialize Editex instance.

        Parameters
        ----------
        cost : tuple
            A 3-tuple representing the cost of the three possible edits:
            match, same-group, and mismatch respectively
            (by default: (0, 1, 2))
        local : bool
            If True, the local variant of Editex is used
        taper : bool
            Enables cost tapering. Following :cite:`Zobel:1996`, it causes
            edits at the start of the string to "just [exceed] twice the
            minimum penalty for replacement or deletion at the end of the
            string".
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Editex, self).__init__(**kwargs)
        self._cost = cost
        self._local = local
        self._taper_enabled = taper

    def _taper(self, pos: int, length: int) -> float:
        """Return the tapering weight for an edit at position ``pos``.

        The weight decays linearly from just over 2.0 at the start of the
        string toward 1.0 at the end; it is a constant 1.0 when tapering
        is disabled.
        """
        return (
            round(
                1.0 + ((length - pos) / length) * (1 + float_info.epsilon), 15
            )
            if self._taper_enabled
            else 1.0
        )

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Editex distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            Editex distance (a float is returned only when tapering
            produces a non-integral value)

        Examples
        --------
        >>> cmp = Editex()
        >>> cmp.dist_abs('cat', 'hat')
        2
        >>> cmp.dist_abs('Niall', 'Neil')
        2
        >>> cmp.dist_abs('aluminum', 'Catalan')
        12
        >>> cmp.dist_abs('ATCG', 'TAGC')
        6


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        match_cost, group_cost, mismatch_cost = self._cost

        def r_cost(ch1: str, ch2: str) -> int:
            """Return r(a,b) according to Zobel & Dart's definition.

            Match cost for equal characters, group cost for characters in
            the same phonetic group, mismatch cost otherwise.

            Parameters
            ----------
            ch1 : str
                The first character to compare
            ch2 : str
                The second character to compare

            Returns
            -------
            int
                r(a,b) according to Zobel & Dart's definition


            .. versionadded:: 0.1.0

            """
            if ch1 == ch2:
                return match_cost
            if ch1 in self._all_letters and ch2 in self._all_letters:
                for group in self._letter_groups:
                    if ch1 in group and ch2 in group:
                        return group_cost
            return mismatch_cost

        def d_cost(ch1: str, ch2: str) -> int:
            """Return d(a,b) according to Zobel & Dart's definition.

            Same as r(a,b) except that deleting an 'H' or 'W' (when not
            matching) only incurs the group cost.

            Parameters
            ----------
            ch1 : str
                The first character to compare
            ch2 : str
                The second character to compare

            Returns
            -------
            int
                d(a,b) according to Zobel & Dart's definition


            .. versionadded:: 0.1.0

            """
            if ch1 != ch2 and (ch1 == 'H' or ch1 == 'W'):
                return group_cost
            return r_cost(ch1, ch2)

        # convert both src & tar to NFKD normalized unicode
        src = unicode_normalize('NFKD', src.upper())
        tar = unicode_normalize('NFKD', tar.upper())
        src_len = len(src)
        tar_len = len(tar)
        max_len = max(src_len, tar_len)
        if src == tar:
            return 0.0
        # If either string is empty, the distance is the (tapered) cost of
        # inserting every character of the other string.
        if not src:
            return sum(
                mismatch_cost * self._taper(pos, max_len)
                for pos in range(tar_len)
            )
        if not tar:
            return sum(
                mismatch_cost * self._taper(pos, max_len)
                for pos in range(src_len)
            )
        # Dynamic-programming matrix; strings are padded with a leading
        # space so indices 1..len line up with matrix rows/columns.
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
        src = ' ' + src
        tar = ' ' + tar
        # In the global (non-local) variant, the first row/column carry the
        # cumulative cost of deleting/inserting the prefix; the local
        # variant leaves them zero so leading edits are free.
        if not self._local:
            for i in range(1, src_len + 1):
                d_mat[i, 0] = d_mat[i - 1, 0] + d_cost(
                    src[i - 1], src[i]
                ) * self._taper(i, max_len)
            for j in range(1, tar_len + 1):
                d_mat[0, j] = d_mat[0, j - 1] + d_cost(
                    tar[j - 1], tar[j]
                ) * self._taper(j, max_len)
        # Standard edit-distance recurrence over deletion, insertion, and
        # substitution, each weighted by the taper at position max(i, j).
        for i in range(1, src_len + 1):
            for j in range(1, tar_len + 1):
                d_mat[i, j] = min(
                    d_mat[i - 1, j]
                    + d_cost(src[i - 1], src[i])
                    * self._taper(max(i, j), max_len),
                    d_mat[i, j - 1]
                    + d_cost(tar[j - 1], tar[j])
                    * self._taper(max(i, j), max_len),
                    d_mat[i - 1, j - 1]
                    + r_cost(src[i], tar[j]) * self._taper(max(i, j), max_len),
                )
        # Return a plain int when the result is integral (the common,
        # untapered case); otherwise the float value.
        if int(d_mat[src_len, tar_len]) == d_mat[src_len, tar_len]:
            return int(d_mat[src_len, tar_len])
        else:
            return cast(float, d_mat[src_len, tar_len])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Editex distance between two strings.

        The Editex distance is normalized by dividing the Editex distance
        (calculated by any of the three supported methods) by the greater of
        the number of characters in src times the cost of a delete and
        the number of characters in tar times the cost of an insert.
        For the case in which all operations have :math:`cost = 1`, this is
        equivalent to the greater of the length of the two strings src & tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Editex distance

        Examples
        --------
        >>> cmp = Editex()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.2
        >>> cmp.dist('aluminum', 'Catalan')
        0.75
        >>> cmp.dist('ATCG', 'TAGC')
        0.75


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if src == tar:
            return 0.0
        match_cost, group_cost, mismatch_cost = self._cost
        src_len = len(src)
        tar_len = len(tar)
        # When tapering is on, the worst case is a tapered mismatch at
        # every position, so normalize by that tapered sum instead of the
        # plain length * mismatch_cost product.
        if self._taper_enabled:
            normalize_term = max(
                [
                    sum(
                        self._taper(pos, src_len) * mismatch_cost
                        for pos in range(src_len)
                    ),
                    sum(
                        self._taper(pos, tar_len) * mismatch_cost
                        for pos in range(tar_len)
                    ),
                ]
            )
        else:
            normalize_term = max(
                src_len * mismatch_cost, tar_len * mismatch_cost
            )
        return self.dist_abs(src, tar) / normalize_term
if __name__ == '__main__':
    # Run this module's doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_euclidean.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._euclidean.
Euclidean distance & similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._minkowski import Minkowski
from ..tokenizer import _Tokenizer
__all__ = ['Euclidean']
class Euclidean(Minkowski):
    """Euclidean distance.

    Euclidean distance is the straight-line or as-the-crow-flies distance,
    equivalent to Minkowski distance in :math:`L^2`-space.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = 0,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Euclidean instance.

        Parameters
        ----------
        alphabet : collection or int
            The values or size of the alphabet
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # Euclidean distance is Minkowski distance with p fixed at 2.
        super(Euclidean, self).__init__(
            pval=2,
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def dist_abs(self, src: str, tar: str, normalized: bool = False) -> float:
        """Return the Euclidean distance between two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison
        normalized : bool
            Normalizes to [0, 1] if True

        Returns
        -------
        float
            The Euclidean distance

        Examples
        --------
        >>> cmp = Euclidean()
        >>> cmp.dist_abs('cat', 'hat')
        2.0
        >>> round(cmp.dist_abs('Niall', 'Neil'), 12)
        2.645751311065
        >>> cmp.dist_abs('Colin', 'Cuilen')
        3.0
        >>> round(cmp.dist_abs('ATCG', 'TAGC'), 12)
        3.162277660168


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return super(Euclidean, self).dist_abs(src, tar, normalized=normalized)

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Euclidean distance between two strings.

        The normalized Euclidean distance is a distance
        metric in :math:`L^2`-space, normalized to [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            The normalized Euclidean distance

        Examples
        --------
        >>> cmp = Euclidean()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.57735026919
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.683130051064
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
        0.727606875109
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return self.dist_abs(src, tar, normalized=True)
if __name__ == '__main__':
    # Run this module's doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_eudex.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._eudex.
eudex distance functions
"""
from typing import (
Any,
Callable,
Generator,
Iterable,
List,
Optional,
Union,
cast,
)
from ._distance import _Distance
from ..phonetic import Eudex as EudexPhonetic
__all__ = ['Eudex']
class Eudex(_Distance):
    """Distance between the Eudex hashes of two terms.

    Cf. :cite:`Ticki:2016`.

    .. versionadded:: 0.3.6
    """

    @staticmethod
    def gen_fibonacci() -> Generator[float, None, None]:
        """Yield the next Fibonacci number.

        Based on https://www.python-course.eu/generators.php
        Starts at Fibonacci number 3 (the second 1)

        Yields
        ------
        int
            The next Fibonacci number


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        num_a, num_b = 1, 2
        while True:
            yield num_a
            num_a, num_b = num_b, num_a + num_b

    @staticmethod
    def gen_exponential(base: int = 2) -> Generator[float, None, None]:
        """Yield the next value in an exponential series of the base.

        Starts at base**0

        Parameters
        ----------
        base : int
            The base to exponentiate

        Yields
        ------
        int
            The next power of `base`


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        exp = 0
        while True:
            yield base ** exp
            exp += 1

    def __init__(
        self,
        weights: Optional[
            Union[
                str,
                Iterable[float],
                Callable[[], Generator[float, None, None]],
            ]
        ] = 'exponential',
        max_length: int = 8,
        **kwargs: Any
    ) -> None:
        """Initialize Eudex instance.

        Parameters
        ----------
        weights : str, iterable, or generator function
            The weights or weights generator function

                - If set to ``None``, a simple Hamming distance is calculated.
                - If set to ``exponential``, weight decays by powers of 2, as
                  proposed in the eudex specification:
                  https://github.com/ticki/eudex.
                - If set to ``fibonacci``, weight decays through the Fibonacci
                  series, as in the eudex reference implementation.
                - If set to a callable function, this assumes it creates a
                  generator and the generator is used to populate a series of
                  weights.
                - If set to an iterable, the iterable's values should be
                  integers and will be used as the weights.

            In all cases, the weights should be ordered or generated from least
            significant to most significant, so larger values should generally
            come first.
        max_length : int
            The number of characters to encode as a eudex hash
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Eudex, self).__init__(**kwargs)
        self._weights = weights
        self._max_length = max_length
        self._phonetic_alg = EudexPhonetic(max_length=max_length)

    def dist_abs(self, src: str, tar: str, normalized: bool = False) -> float:
        """Calculate the distance between the Eudex hashes of two terms.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison
        normalized : bool
            Normalizes to [0, 1] if True

        Returns
        -------
        int
            The Eudex Hamming distance

        Examples
        --------
        >>> cmp = Eudex()
        >>> cmp.dist_abs('cat', 'hat')
        128
        >>> cmp.dist_abs('Niall', 'Neil')
        2
        >>> cmp.dist_abs('Colin', 'Cuilen')
        10
        >>> cmp.dist_abs('ATCG', 'TAGC')
        403

        >>> cmp = Eudex(weights='fibonacci')
        >>> cmp.dist_abs('cat', 'hat')
        34
        >>> cmp.dist_abs('Niall', 'Neil')
        2
        >>> cmp.dist_abs('Colin', 'Cuilen')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        117

        >>> cmp = Eudex(weights=None)
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        1
        >>> cmp.dist_abs('Colin', 'Cuilen')
        2
        >>> cmp.dist_abs('ATCG', 'TAGC')
        9

        >>> # Using the OEIS A000142:
        >>> cmp = Eudex(weights=[1, 1, 2, 6, 24, 120, 720, 5040])
        >>> cmp.dist_abs('cat', 'hat')
        5040
        >>> cmp.dist_abs('Niall', 'Neil')
        1
        >>> cmp.dist_abs('Colin', 'Cuilen')
        7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        15130


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Calculate the eudex hashes and XOR them; set bits mark positions
        # where the two hashes disagree.
        xored = int(self._phonetic_alg.encode(src)) ^ int(
            self._phonetic_alg.encode(tar)
        )

        # Simple hamming distance (all bits are equal)
        if not self._weights:
            binary = bin(xored)
            distance = binary.count('1')  # type: float
            if normalized:
                # len(binary) - 2 discounts the leading '0b' prefix.
                return distance / (len(binary) - 2)
            return distance

        # Populate weights_list, ordered from most to least significant.
        weights_list = []  # type: List[float]
        if hasattr(self._weights, '__iter__') and not isinstance(
            self._weights, str
        ):
            # list() accepts any iterable; slicing alone would fail for
            # non-sequence iterables such as generators or sets.
            weights_list = list(cast(Iterable[float], self._weights))[::-1]
            weights_gen = None
        elif callable(self._weights):
            # If self._weights is a function, it should create a generator,
            # which we now use to populate a list
            weights_gen = self._weights()
        elif self._weights == 'exponential':
            weights_gen = Eudex.gen_exponential()
        elif self._weights == 'fibonacci':
            weights_gen = Eudex.gen_fibonacci()
        else:
            raise ValueError('Unrecognized weights value or type.')
        if isinstance(weights_gen, Generator):
            weights_list = [
                next(weights_gen) for _ in range(self._max_length)
            ][::-1]

        # Sum the weighted hamming distance, one byte of the XOR at a time,
        # least significant byte first (weights are popped from the end).
        distance = 0
        max_distance = 0.0
        while (xored or normalized) and weights_list:
            # Each byte contributes at most 8 set bits.
            max_distance += 8 * weights_list[-1]
            distance += bin(xored & 0xFF).count('1') * weights_list.pop()
            xored >>= 8

        if normalized:
            distance /= max_distance

        return distance

    def dist(self, src: str, tar: str) -> float:
        """Return normalized distance between the Eudex hashes of two terms.

        This is Eudex distance normalized to [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The normalized Eudex Hamming distance

        Examples
        --------
        >>> cmp = Eudex()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.062745098039
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.000980392157
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
        0.004901960784
        >>> round(cmp.dist('ATCG', 'TAGC'), 12)
        0.197549019608


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return self.dist_abs(src, tar, True)
if __name__ == '__main__':
    # Run this module's doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_eyraud.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._eyraud.
Eyraud similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Eyraud']
class Eyraud(_TokenDistance):
    r"""Eyraud similarity.

    For two sets X and Y and a population N, the Eyraud
    similarity :cite:`Eyraud:1938` is

        .. math::

            sim_{Eyraud}(X, Y) =
            \frac{|X \cap Y| - |X| \cdot |Y|}
            {|X| \cdot |Y| \cdot |N \setminus Y| \cdot |N \setminus X|}

    For lack of access to the original, this formula is based on the concurring
    formulae presented in :cite:`Shi:1993` and :cite:`Hubalek:1982`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Eyraud} =
            \frac{a-(a+b)(a+c)}{(a+b)(a+c)(b+d)(c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Eyraud instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # Construction is fully delegated to the _TokenDistance base class.
        super(Eyraud, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Eyraud similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Eyraud similarity (typically a small negative value)

        Examples
        --------
        >>> cmp = Eyraud()
        >>> cmp.sim_score('cat', 'hat')
        -1.438198553583169e-06
        >>> cmp.sim_score('Niall', 'Neil')
        -1.5399964580081465e-06
        >>> cmp.sim_score('aluminum', 'Catalan')
        -1.6354719962967386e-06
        >>> cmp.sim_score('ATCG', 'TAGC')
        -1.6478781097519779e-06


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Confusion-table cells: a+b+c+d = n.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Each marginal factor is clamped to at least 1 so that the
        # denominator can never be zero.
        denom = max(1, a + b) * max(1, c + d) * max(1, a + c) * max(1, b + d)
        return (a - (a + b) * (a + c)) / denom

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Eyraud similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Eyraud similarity

        Examples
        --------
        >>> cmp = Eyraud()
        >>> cmp.sim('cat', 'hat')
        1.438198553583169e-06
        >>> cmp.sim('Niall', 'Neil')
        1.5399964580081465e-06
        >>> cmp.sim('aluminum', 'Catalan')
        1.6354719962967386e-06
        >>> cmp.sim('ATCG', 'TAGC')
        1.6478781097519779e-06


        .. versionadded:: 0.4.0

        """
        # The raw score is non-positive; subtracting from 0.0 flips the
        # sign while guaranteeing a float result.
        return 0.0 - self.sim_score(src, tar)
if __name__ == '__main__':
    # Run this module's doctests when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fager_mcgowan.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._fager_mcgowan.
Fager & McGowan similarity
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['FagerMcGowan']
class FagerMcGowan(_TokenDistance):
    r"""Fager & McGowan similarity.

    Given two sets X and Y, the Fager & McGowan similarity
    :cite:`Fager:1957,Fager:1963` is defined as

        .. math::

            sim_{FagerMcGowan}(X, Y) =
            \frac{|X \cap Y|}{\sqrt{|X|\cdot|Y|}} -
            \frac{1}{2\sqrt{max(|X|, |Y|)}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this becomes

        .. math::

            sim_{FagerMcGowan} =
            \frac{a}{\sqrt{(a+b)(a+c)}} - \frac{1}{2\sqrt{max(a+b, a+c)}}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize FagerMcGowan instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(FagerMcGowan, self).__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Fager & McGowan similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Fager & McGowan similarity

        Examples
        --------
        >>> cmp = FagerMcGowan()
        >>> cmp.sim_score('cat', 'hat')
        0.25
        >>> cmp.sim_score('Niall', 'Neil')
        0.16102422643817918
        >>> cmp.sim_score('aluminum', 'Catalan')
        -0.048815536468908724
        >>> cmp.sim_score('ATCG', 'TAGC')
        -0.22360679774997896


        .. versionadded:: 0.4.0

        """
        # With either string empty there is nothing to compare.
        if not (src and tar):
            return 0.0

        self._tokenize(src, tar)

        shared = self._intersection_card()
        src_card = self._src_card()
        tar_card = self._tar_card()

        # Overlap normalized by the geometric mean of the set sizes,
        # minus a correction based on the larger of the two sets.
        overlap = shared / (src_card * tar_card) ** 0.5 if shared else 0.0
        correction = 1 / (2 * (max(src_card, tar_card) ** 0.5))
        return overlap - correction

    def sim(self, src: str, tar: str) -> float:
        r"""Return the normalized Fager & McGowan similarity of two strings.

        Since this similarity ranges over :math:`(-\inf, 1.0)`, the
        normalization simply clamps the score to the range (0.0, 1.0).

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Fager & McGowan similarity

        Examples
        --------
        >>> cmp = FagerMcGowan()
        >>> cmp.sim('cat', 'hat')
        0.25
        >>> cmp.sim('Niall', 'Neil')
        0.16102422643817918
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        score = self.sim_score(src, tar)
        return score if score > 0.0 else 0.0
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_faith.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._faith.
Faith similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Faith']
class Faith(_TokenDistance):
    r"""Faith similarity.

    For two sets X and Y and a population N, the Faith
    similarity :cite:`Faith:1983` is

        .. math::

            sim_{Faith}(X, Y) = \frac{|X \cap Y| +
            \frac{|(N \setminus X) \setminus Y|}{2}}{|N|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Faith} =
            \frac{a+\frac{d}{2}}{n}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Faith instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(Faith, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Faith similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Faith similarity

        Examples
        --------
        >>> cmp = Faith()
        >>> cmp.sim('cat', 'hat')
        0.4987244897959184
        >>> cmp.sim('Niall', 'Neil')
        0.4968112244897959
        >>> cmp.sim('aluminum', 'Catalan')
        0.4910828025477707
        >>> cmp.sim('ATCG', 'TAGC')
        0.49362244897959184


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Shared tokens count fully; joint absences count at half weight.
        agreement = (
            self._intersection_card() + self._total_complement_card() / 2
        )
        if not agreement:
            return 0.0
        return agreement / self._population_unique_card()
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fellegi_sunter.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._fellegi_sunter.
Fellegi-Sunter similarity
"""
from math import exp, log
from sys import float_info
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['FellegiSunter']
class FellegiSunter(_TokenDistance):
    r"""Fellegi-Sunter similarity.

    Fellegi-Sunter similarity is based on the description in
    :cite:`Cohen:2003` and implementation in :cite:`Cohen:2003b`.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        simplified: bool = False,
        mismatch_factor: float = 0.5,
        **kwargs: Any
    ) -> None:
        """Initialize FellegiSunter instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        simplified : bool
            Specifies to use the simplified scoring variant
        mismatch_factor : float
            Specifies the penalty factor for mismatches
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(FellegiSunter, self).__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )
        # Whether to use the simplified (negative-log-probability) variant.
        self._simplified = simplified
        # Penalty multiplier applied to src-only tokens (simplified variant).
        self._mismatch_factor = mismatch_factor

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Fellegi-Sunter similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Fellegi-Sunter similarity

        Examples
        --------
        >>> cmp = FellegiSunter()
        >>> cmp.sim_score('cat', 'hat')
        0.8803433378011485
        >>> cmp.sim_score('Niall', 'Neil')
        0.6958768466635681
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.45410905865149187
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        src_tokens, tar_tokens = self._get_tokens()
        # Total token counts (multiset cardinalities) and unique-token counts.
        src_total = sum(src_tokens.values())
        tar_total = sum(tar_tokens.values())
        src_unique = len(src_tokens)
        tar_unique = len(tar_tokens)

        similarity = 0.0

        # Tokens shared by both strings add to the score.
        for _tok, count in self._intersection().items():
            if self._simplified:
                # Simplified variant: sum of negative log probabilities.
                similarity += -log(count / tar_total)
            else:
                prob = count / tar_total
                # float_info.epsilon keeps the log arguments strictly
                # positive when prob reaches 1 (avoids log(0)).
                similarity -= log(
                    1
                    + float_info.epsilon
                    - exp(
                        src_unique
                        * tar_unique
                        * log(1 + float_info.epsilon - prob * prob)
                    )
                )

        # Tokens present only in src are penalized (simplified variant only).
        for _tok, count in self._src_only().items():
            if self._simplified:
                similarity -= -log(count / src_total) * self._mismatch_factor

        return similarity

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Fellegi-Sunter similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Fellegi-Sunter similarity

        Examples
        --------
        >>> cmp = FellegiSunter()
        >>> cmp.sim('cat', 'hat')
        0.2934477792670495
        >>> cmp.sim('Niall', 'Neil')
        0.13917536933271363
        >>> cmp.sim('aluminum', 'Catalan')
        0.056763632331436484
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        score = self.sim_score(src, tar)
        if score == 0.0:
            return 0.0
        # Normalize by string lengths; clamp to non-negative since the raw
        # score may go below zero.
        if self._simplified:
            return max(0.0, score / (len(src) + len(tar)))
        return max(0.0, score / max(len(src), len(tar)))
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fidelity.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._fidelity.
Fidelity
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Fidelity']
class Fidelity(_TokenDistance):
    r"""Fidelity.

    For two multisets X and Y drawn from an alphabet S, fidelity is

        .. math::

            sim_{Fidelity}(X, Y) =
            \Bigg( \sum_{i \in S} \sqrt{|\frac{A_i}{|A|} \cdot
            \frac{B_i}{|B|}|} \Bigg)^2

    .. versionadded:: 0.4.0
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize Fidelity instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.


        .. versionadded:: 0.4.0

        """
        super(Fidelity, self).__init__(tokenizer=tokenizer, **kwargs)

    def sim(self, src: str, tar: str) -> float:
        """Return the fidelity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            fidelity

        Examples
        --------
        >>> cmp = Fidelity()
        >>> cmp.sim('cat', 'hat')
        0.25
        >>> cmp.sim('Niall', 'Neil')
        0.1333333333333333
        >>> cmp.sim('aluminum', 'Catalan')
        0.013888888888888888
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        vocabulary = self._total().keys()
        # Guard against division by zero when a token multiset is empty.
        src_total = max(1, sum(self._src_tokens.values()))
        tar_total = max(1, sum(self._tar_tokens.values()))

        # Accumulate the square roots of the products of relative
        # frequencies over the combined vocabulary, then square the sum.
        root_sum = 0.0
        for tok in vocabulary:
            root_sum += (
                abs(
                    self._src_tokens[tok]
                    / src_total
                    * self._tar_tokens[tok]
                    / tar_total
                )
                ** 0.5
            )
        return root_sum ** 2
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fleiss.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._fleiss.
Fleiss correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Fleiss']
class Fleiss(_TokenDistance):
    r"""Fleiss correlation.

    For two sets X and Y and a population N, Fleiss correlation
    :cite:`Fleiss:1975` is

        .. math::

            corr_{Fleiss}(X, Y) =
            \frac{(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|) \cdot
            (|X| \cdot |N \setminus X| + |Y| \cdot |N \setminus Y|)}
            {2 \cdot |X| \cdot |N \setminus X| \cdot |Y| \cdot |N \setminus Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Fleiss} =
            \frac{(ad-bc)((a+b)(c+d)+(a+c)(b+d))}{2(a+b)(c+d)(a+c)(b+d)}

    This is Fleiss' :math:`M(A_1)`, :math:`ad-bc` divided by the harmonic mean
    of the marginals :math:`p_1q_1 = (a+b)(c+d)` and
    :math:`p_2q_2 = (a+c)(b+d)`.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Fleiss instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(Fleiss, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Fleiss correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Fleiss correlation

        Examples
        --------
        >>> cmp = Fleiss()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3621712520061204
        >>> cmp.corr('aluminum', 'Catalan')
        0.10839724112919989
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Marginal products from the 2x2 confusion table.
        p1q1 = (a + b) * (c + d)
        p2q2 = (a + c) * (b + d)

        numerator = (a * d - b * c) * (p1q1 + p2q2)
        if numerator == 0.0:
            return 0.0
        return numerator / (2.0 * (a + b) * (c + d) * (a + c) * (b + d))

    def sim(self, src: str, tar: str) -> float:
        """Return the Fleiss similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Fleiss similarity

        Examples
        --------
        >>> cmp = Fleiss()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6810856260030602
        >>> cmp.sim('aluminum', 'Catalan')
        0.5541986205645999
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        # Map the correlation from [-1, 1] onto [0, 1].
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fleiss_levin_paik.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._fleiss_levin_paik.
Fleiss-Levin-Paik similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['FleissLevinPaik']
class FleissLevinPaik(_TokenDistance):
    r"""Fleiss-Levin-Paik similarity.

    For two sets X and Y and a population N, Fleiss-Levin-Paik similarity
    :cite:`Fleiss:2003` is

        .. math::

            sim_{FleissLevinPaik}(X, Y) =
            \frac{2|(N \setminus X) \setminus Y|}
            {2|(N \setminus X) \setminus Y| +
            |X \setminus Y| + |Y \setminus X|}

    This is :cite:`Morris:2012`'s 'd Specific Agreement'.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{FleissLevinPaik} =
            \frac{2d}{2d + b + c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize FleissLevinPaik instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(FleissLevinPaik, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Fleiss-Levin-Paik similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Fleiss-Levin-Paik similarity

        Examples
        --------
        >>> cmp = FleissLevinPaik()
        >>> cmp.sim('cat', 'hat')
        0.9974358974358974
        >>> cmp.sim('Niall', 'Neil')
        0.9955041746949261
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903412749517064
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # d: tokens in the population absent from both strings.
        d = self._total_complement_card()
        if d == 0.0:
            return 0.0

        b = self._src_only_card()
        c = self._tar_only_card()
        return 2 * d / (2 * d + b + c)
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_flexmetric.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._flexmetric.
FlexMetric distance
"""
from typing import (
Any,
Callable,
FrozenSet,
List,
Optional,
Sequence,
Set,
Tuple,
Union,
cast,
)
from numpy import float_ as np_float
from numpy import zeros as np_zeros
from ._distance import _Distance
__all__ = ['FlexMetric']
class FlexMetric(_Distance):
    r"""FlexMetric distance.

    FlexMetric distance :cite:`Kempken:2005`

    A Levenshtein-style edit distance with configurable, per-letter-class
    insertion/deletion and substitution costs.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        normalizer: Callable[[List[float]], float] = max,
        indel_costs: Optional[
            List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]
        ] = None,
        subst_costs: Optional[
            List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]
        ] = None,
        **kwargs: Any
    ) -> None:
        """Initialize FlexMetric instance.

        Parameters
        ----------
        normalizer : function
            A function that takes an list and computes a normalization term
            by which the edit distance is divided (max by default). Another
            good option is the sum function.
        indel_costs : list of tuples
            A list of insertion and deletion costs. Each list element should
            be a tuple consisting of an iterable (sets are best) and a float
            value. The iterable consists of those letters whose insertion
            or deletion has a cost equal to the float value.
        subst_costs : list of tuples
            A list of substitution costs. Each list element should
            be a tuple consisting of an iterable (sets are best) and a float
            value. The iterable consists of the letters in each letter class,
            which may be substituted for each other at cost equal to the float
            value.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(FlexMetric, self).__init__(**kwargs)
        self._normalizer = normalizer

        # Sort key: the cost element of each (letter-class, cost) tuple.
        def _get_second(
            s: Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]
        ) -> float:
            return s[1]

        # Default cost tables target German-like orthography (ß, umlauts).
        # Custom tables are sorted ascending by cost so that _cost's linear
        # scan finds the cheapest applicable letter class first.
        if indel_costs is None:
            self._indel_costs = [
                (frozenset('dtch'), 0.4),
                (frozenset('e'), 0.5),
                (frozenset('u'), 0.9),
                (frozenset('rpn'), 0.95),
            ]  # type: List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]  # noqa: E501
        else:
            self._indel_costs = sorted(indel_costs, key=_get_second)

        if subst_costs is None:
            self._subst_costs = [
                (frozenset('szß'), 0.1),
                (frozenset('dt'), 0.1),
                (frozenset('iy'), 0.1),
                (frozenset('ckq'), 0.1),
                (frozenset('eä'), 0.1),
                (frozenset('uüv'), 0.1),
                (frozenset('iü'), 0.1),
                (frozenset('fv'), 0.1),
                (frozenset('zc'), 0.1),
                (frozenset('ij'), 0.1),
                (frozenset('bp'), 0.1),
                (frozenset('eoö'), 0.2),
                (frozenset('aä'), 0.2),
                (frozenset('mbp'), 0.4),
                (frozenset('uw'), 0.4),
                (frozenset('uo'), 0.8),
                (frozenset('aeiouy'), 0.9),
            ]  # type: List[Tuple[Union[Sequence[str], Set[str], FrozenSet[str]], float]]  # noqa: E501
        else:
            self._subst_costs = sorted(subst_costs, key=_get_second)

    def _cost(self, src: str, s_pos: int, tar: str, t_pos: int) -> float:
        # Return the edit cost for one operation: a position of -1 on one
        # side signals insertion (s_pos == -1) or deletion (t_pos == -1);
        # otherwise the cost of substituting src[s_pos] with tar[t_pos].
        if s_pos == -1:
            # Insertion; repeating the preceding character is free.
            if t_pos > 0 and tar[t_pos - 1] == tar[t_pos]:
                return 0.0
            for letter_set in self._indel_costs:
                if tar[t_pos] in letter_set[0]:
                    return letter_set[1]
            else:
                # No letter class matched: full unit cost.
                return 1.0
        elif t_pos == -1:
            # Deletion; repeating the preceding character is free.
            if s_pos > 0 and src[s_pos - 1] == src[s_pos]:
                return 0.0
            for letter_set in self._indel_costs:
                if src[s_pos] in letter_set[0]:
                    return letter_set[1]
            else:
                return 1.0
        # Substitution: both letters must lie in the same letter class.
        for letter_set in self._subst_costs:
            if src[s_pos] in letter_set[0] and tar[t_pos] in letter_set[0]:
                return letter_set[1]
        else:
            return 1.0

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the FlexMetric distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            FlexMetric distance

        Examples
        --------
        >>> cmp = FlexMetric()
        >>> cmp.dist_abs('cat', 'hat')
        0.8
        >>> cmp.dist_abs('Niall', 'Neil')
        1.5
        >>> cmp.dist_abs('aluminum', 'Catalan')
        6.7
        >>> cmp.dist_abs('ATCG', 'TAGC')
        2.1999999999999997


        .. versionadded:: 0.4.0

        """
        src_len = len(src)
        tar_len = len(tar)

        # NOTE(review): this path returns int 0 while others return float.
        if src == tar:
            return 0
        # Degenerate cases: one string empty -> sum of indel costs.
        if not src:
            return sum(self._cost('', -1, tar, j) for j in range(len(tar)))
        if not tar:
            return sum(self._cost(src, i, '', -1) for i in range(len(src)))

        # Standard Levenshtein DP matrix with custom costs.
        d_mat = np_zeros((src_len + 1, tar_len + 1), dtype=np_float)
        for i in range(1, src_len + 1):
            d_mat[i, 0] = d_mat[i - 1, 0] + self._cost(src, i - 1, '', -1)
        for j in range(1, tar_len + 1):
            d_mat[0, j] = d_mat[0, j - 1] + self._cost('', -1, tar, j - 1)

        # Costs are looked up case-insensitively in the main loop (the
        # default cost tables are lowercase), but equality of characters
        # is tested case-sensitively below.
        # NOTE(review): the border rows above use the original-case
        # strings — confirm this asymmetry is intended.
        src_lc = src.lower()
        tar_lc = tar.lower()

        for i in range(src_len):
            for j in range(tar_len):
                d_mat[i + 1, j + 1] = min(
                    d_mat[i + 1, j] + self._cost('', -1, tar_lc, j),  # ins
                    d_mat[i, j + 1] + self._cost(src_lc, i, '', -1),  # del
                    d_mat[i, j]
                    + (
                        self._cost(src_lc, i, tar_lc, j)
                        if src[i] != tar[j]
                        else 0
                    ),  # sub/==
                )

        return cast(float, d_mat[src_len, tar_len])

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized FlexMetric distance of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized FlexMetric distance

        Examples
        --------
        >>> cmp = FlexMetric()
        >>> cmp.dist('cat', 'hat')
        0.26666666666666666
        >>> cmp.dist('Niall', 'Neil')
        0.3
        >>> cmp.dist('aluminum', 'Catalan')
        0.8375
        >>> cmp.dist('ATCG', 'TAGC')
        0.5499999999999999


        .. versionadded:: 0.4.0

        """
        score = self.dist_abs(src, tar)
        if score:
            # Divide by the normalizer (max of lengths by default).
            return score / self._normalizer([len(src), len(tar)])
        return 0.0
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_forbes_i.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._forbes_i.
Forbes I similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['ForbesI']
class ForbesI(_TokenDistance):
    r"""Forbes I similarity.

    For two sets X and Y and a population N, the Forbes I
    similarity :cite:`Forbes:1907,Mozley:1936` is

        .. math::

            sim_{ForbesI}(X, Y) =
            \frac{|N| \cdot |X \cap Y|}{|X| \cdot |Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{ForbesI} =
            \frac{na}{(a+b)(a+c)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ForbesI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(ForbesI, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Forbes I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Forbes I similarity

        Examples
        --------
        >>> cmp = ForbesI()
        >>> cmp.sim_score('cat', 'hat')
        98.0
        >>> cmp.sim_score('Niall', 'Neil')
        52.266666666666666
        >>> cmp.sim_score('aluminum', 'Catalan')
        10.902777777777779
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        shared = self._intersection_card()
        population = self._population_unique_card()
        src_card = self._src_card()
        tar_card = self._tar_card()

        # n * a over (a+b)(a+c); zero intersection yields zero outright.
        numerator = population * shared
        if not numerator:
            return 0.0
        return numerator / (src_card * tar_card)

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Forbes I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Forbes I similarity

        Examples
        --------
        >>> cmp = ForbesI()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.3333333333333333
        >>> cmp.sim('aluminum', 'Catalan')
        0.11125283446712018
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        # Normalize by the larger of the two self-similarities.
        raw = self.sim_score(src, tar)
        ceiling = max(self.sim_score(src, src), self.sim_score(tar, tar))
        return raw / ceiling
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_forbes_ii.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._forbes_ii.
Forbes II correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['ForbesII']
class ForbesII(_TokenDistance):
    r"""Forbes II correlation.

    For two sets X and Y and a population N, the Forbes II correlation,
    as described in :cite:`Forbes:1925`, is

        .. math::

            corr_{ForbesII}(X, Y) =
            \frac{|X \setminus Y| \cdot |Y \setminus X| -
            |X \cap Y| \cdot |(N \setminus X) \setminus Y|}
            {|X| \cdot |Y| - |N| \cdot min(|X|, |Y|)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{ForbesII} =
            \frac{bc-ad}{(a+b)(a+c) - n \cdot min(a+b, a+c)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize ForbesII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(ForbesII, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Forbes II correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Forbes II correlation

        Examples
        --------
        >>> cmp = ForbesII()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.3953727506426735
        >>> cmp.corr('aluminum', 'Catalan')
        0.11485180412371133
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        intersection = self._intersection_card()
        src_card = self._src_card()
        tar_card = self._tar_card()
        population = self._population_unique_card()

        # na - (a+b)(a+c) equals ad - bc, so this matches the
        # sign-flipped form of the docstring's (bc-ad) numerator.
        numerator = population * intersection - src_card * tar_card
        if not numerator:
            return 0.0
        return numerator / (
            population * min(src_card, tar_card) - src_card * tar_card
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Forbes II similarity of two strings.

        The correlation, which lies in [-1, 1], is mapped linearly
        onto [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Forbes II similarity

        Examples
        --------
        >>> cmp = ForbesII()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6976863753213367
        >>> cmp.sim('aluminum', 'Catalan')
        0.5574259020618557
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
if __name__ == '__main__':
    # Self-test: run the examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fossum.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._fossum.
Fossum similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Fossum']
class Fossum(_TokenDistance):
    r"""Fossum similarity.

    For two sets X and Y and a population N, the Fossum similarity
    :cite:`Fossum:1966` is

        .. math::

            sim_{Fossum}(X, Y) =
            \frac{|N| \cdot \Big(|X \cap Y|-\frac{1}{2}\Big)^2}
            {|X| \cdot |Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Fossum} =
            \frac{n(a-\frac{1}{2})^2}{(a+b)(a+c)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Fossum instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(Fossum, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Fossum similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Fossum similarity

        Examples
        --------
        >>> cmp = Fossum()
        >>> cmp.sim_score('cat', 'hat')
        110.25
        >>> cmp.sim_score('Niall', 'Neil')
        58.8
        >>> cmp.sim_score('aluminum', 'Catalan')
        2.7256944444444446
        >>> cmp.sim_score('ATCG', 'TAGC')
        7.84
        >>> cmp.sim_score('', '')
        195.3125


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        population = self._population_unique_card()
        shifted = self._intersection_card() - 0.5

        numerator = population * shifted ** 2
        if not numerator:
            return 0.0
        # Marginals are clamped to a minimum of 1.0 so empty inputs
        # cannot trigger a division by zero.
        return numerator / (
            max(1.0, self._src_card()) * max(1.0, self._tar_card())
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Fossum similarity of two strings.

        The raw score is normalized by the larger of the two
        self-similarity scores.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Fossum similarity

        Examples
        --------
        >>> cmp = Fossum()
        >>> cmp.sim('cat', 'hat')
        0.1836734693877551
        >>> cmp.sim('Niall', 'Neil')
        0.08925619834710742
        >>> cmp.sim('aluminum', 'Catalan')
        0.0038927335640138415
        >>> cmp.sim('ATCG', 'TAGC')
        0.01234567901234568


        .. versionadded:: 0.4.0

        """
        score = self.sim_score(src, tar)
        if not score:
            return 0.0
        return score / max(
            self.sim_score(src, src), self.sim_score(tar, tar)
        )
if __name__ == '__main__':
    # Self-test: run the examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fuzzywuzzy_partial_string.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._fuzzywuzzy_partial_string.
FuzzyWuzzy Partial String similarity
"""
from difflib import SequenceMatcher
from ._distance import _Distance
__all__ = ['FuzzyWuzzyPartialString']
class FuzzyWuzzyPartialString(_Distance):
    """FuzzyWuzzy Partial String similarity.

    This follows the FuzzyWuzzy Partial String similarity algorithm
    :cite:`Cohen:2011`: the shorter string is slid across the longer one
    and the best :py:class:`difflib.SequenceMatcher` ratio over all
    alignments is returned. Rather than returning an integer in the range
    [0, 100], as demonstrated in the blog post, this implementation
    returns a float in the range [0.0, 1.0].

    .. versionadded:: 0.4.0
    """

    def sim(self, src: str, tar: str) -> float:
        """Return the FuzzyWuzzy Partial String similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            FuzzyWuzzy Partial String similarity

        Examples
        --------
        >>> cmp = FuzzyWuzzyPartialString()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.666666666667
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.75
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.428571428571
        >>> cmp.sim('ATCG', 'TAGC')
        0.5


        .. versionadded:: 0.4.0

        """
        # Always slide the shorter string across the longer one.
        if len(src) > len(tar):
            src, tar = tar, src
        window = len(src)

        best = 0.0
        for offset in range(len(tar) - window + 1):
            best = max(
                best,
                SequenceMatcher(
                    None, src, tar[offset : offset + window]
                ).ratio(),
            )
            # A perfect alignment cannot be improved upon.
            if best == 1.0:
                break
        return best
if __name__ == '__main__':
    # Self-test: run the examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fuzzywuzzy_token_set.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._fuzzywuzzy_token_set.
FuzzyWuzzy Token Set similarity
"""
from difflib import SequenceMatcher
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import RegexpTokenizer, _Tokenizer
__all__ = ['FuzzyWuzzyTokenSet']
class FuzzyWuzzyTokenSet(_TokenDistance):
    r"""FuzzyWuzzy Token Set similarity.

    This follows the FuzzyWuzzy Token Set similarity algorithm
    :cite:`Cohen:2011`. Both strings are tokenized; the sorted shared
    tokens form a common prefix, and the best
    :py:class:`difflib.SequenceMatcher` ratio among the prefix and the two
    prefixed remainders is returned. Rather than returning an integer in
    the range [0, 100], as demonstrated in the blog post, this
    implementation returns a float in the range [0.0, 1.0].

    .. versionadded:: 0.4.0
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize FuzzyWuzzyTokenSet instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package. By default, the regexp tokenizer is employed,
            matching only letters.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.


        .. versionadded:: 0.4.0

        """
        super(FuzzyWuzzyTokenSet, self).__init__(
            tokenizer=RegexpTokenizer() if tokenizer is None else tokenizer,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the FuzzyWuzzy Token Set similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            FuzzyWuzzy Token Set similarity

        Examples
        --------
        >>> cmp = FuzzyWuzzyTokenSet()
        >>> cmp.sim('cat', 'hat')
        0.75
        >>> cmp.sim('Niall', 'Neil')
        0.7272727272727273
        >>> cmp.sim('aluminum', 'Catalan')
        0.47058823529411764
        >>> cmp.sim('ATCG', 'TAGC')
        0.6


        .. versionadded:: 0.4.0

        """
        tokenizer = self.params['tokenizer']
        src_tokens = tokenizer.tokenize(src).get_set()
        tar_tokens = tokenizer.tokenize(tar).get_set()

        shared = src_tokens & tar_tokens
        # The sorted shared tokens prefix both strings; the trailing
        # space is appended even when no tokens are shared, matching
        # the original implementation's behavior.
        prefix = ' '.join(sorted(shared)) + ' '
        src_str = prefix + ' '.join(sorted(src_tokens - shared))
        tar_str = prefix + ' '.join(sorted(tar_tokens - shared))

        return max(
            SequenceMatcher(None, src_str, prefix).ratio(),
            SequenceMatcher(None, prefix, tar_str).ratio(),
            SequenceMatcher(None, src_str, tar_str).ratio(),
        )
if __name__ == '__main__':
    # Self-test: run the examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_fuzzywuzzy_token_sort.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._fuzzywuzzy_token_sort.
FuzzyWuzzy Token Sort similarity
"""
from difflib import SequenceMatcher
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import RegexpTokenizer, _Tokenizer
__all__ = ['FuzzyWuzzyTokenSort']
class FuzzyWuzzyTokenSort(_TokenDistance):
    r"""FuzzyWuzzy Token Sort similarity.

    This follows the FuzzyWuzzy Token Sort similarity algorithm
    :cite:`Cohen:2011`: each string is tokenized, its tokens sorted and
    re-joined with spaces, and the two results compared with
    :py:class:`difflib.SequenceMatcher`. Rather than returning an integer
    in the range [0, 100], as demonstrated in the blog post, this
    implementation returns a float in the range [0.0, 1.0].

    .. versionadded:: 0.4.0
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize FuzzyWuzzyTokenSort instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer`
            package. By default, the regexp tokenizer is employed,
            matching only letters.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.


        .. versionadded:: 0.4.0

        """
        super(FuzzyWuzzyTokenSort, self).__init__(
            tokenizer=RegexpTokenizer() if tokenizer is None else tokenizer,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the FuzzyWuzzy Token Sort similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            FuzzyWuzzy Token Sort similarity

        Examples
        --------
        >>> cmp = FuzzyWuzzyTokenSort()
        >>> cmp.sim('cat', 'hat')
        0.6666666666666666
        >>> cmp.sim('Niall', 'Neil')
        0.6666666666666666
        >>> cmp.sim('aluminum', 'Catalan')
        0.4
        >>> cmp.sim('ATCG', 'TAGC')
        0.5


        .. versionadded:: 0.4.0

        """
        tokenize = self.params['tokenizer'].tokenize
        src_sorted = ' '.join(sorted(tokenize(src).get_list()))
        tar_sorted = ' '.join(sorted(tokenize(tar).get_list()))
        return SequenceMatcher(None, src_sorted, tar_sorted).ratio()
if __name__ == '__main__':
    # Self-test: run the examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_generalized_fleiss.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._generalized_fleiss.
Generalized Fleiss correlation
"""
from typing import (
Any,
Callable,
Counter as TCounter,
Dict,
Iterable,
Optional,
Sequence,
Set,
Union,
cast,
)
from ._token_distance import _TokenDistance
from ..stats import (
aghmean,
agmean,
amean,
cmean,
ghmean,
gmean,
heronian_mean,
hmean,
hoelder_mean,
imean,
lehmer_mean,
lmean,
qmean,
seiffert_mean,
)
from ..tokenizer import _Tokenizer
__all__ = ['GeneralizedFleiss']
def _agmean_prec6(nums: Sequence[float]) -> float:
    """Return the arithmetic-geometric mean of ``nums`` at precision 6."""
    return agmean(nums, prec=6)
def _ghmean_prec6(nums: Sequence[float]) -> float:
    """Return the geometric-harmonic mean of ``nums`` at precision 6."""
    return ghmean(nums, prec=6)
def _aghmean_prec6(nums: Sequence[float]) -> float:
    """Return the arithmetic-geometric-harmonic mean of ``nums`` at precision 6."""
    return aghmean(nums, prec=6)
# Mapping from the mean-function names accepted by GeneralizedFleiss's
# ``mean_func`` parameter to the corresponding functions in
# :py:mod:`abydos.stats` (or the precision-fixing wrappers above).
means = {
    'arithmetic': amean,
    'geometric': gmean,
    'harmonic': hmean,
    'ag': _agmean_prec6,
    'gh': _ghmean_prec6,
    'agh': _aghmean_prec6,
    'contraharmonic': cmean,
    'identric': imean,
    'logarithmic': lmean,
    'quadratic': qmean,
    'heronian': heronian_mean,
    'hoelder': hoelder_mean,
    'lehmer': lehmer_mean,
    'seiffert': seiffert_mean,
}  # type: Dict[str, Callable[[Sequence[float]], float]]
class GeneralizedFleiss(_TokenDistance):
    r"""Generalized Fleiss correlation.

    For two sets X and Y and a population N, Generalized Fleiss correlation
    is based on observations from :cite:`Fleiss:1975`.

        .. math::

            corr_{GeneralizedFleiss}(X, Y) =
            \frac{|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|}
            {\mu_{products~of~marginals}}

    The mean function :math:`\mu` may be any of the mean functions in
    :py:mod:`abydos.stats`. The products of marginals may be one of the
    following:

        - ``a`` : :math:`|X| \cdot |N \setminus X|` &
          :math:`|Y| \cdot |N \setminus Y|`
        - ``b`` : :math:`|X| \cdot |Y|` &
          :math:`|N \setminus X| \cdot |N \setminus Y|`
        - ``c`` : :math:`|X| \cdot |N \setminus Y|` &
          :math:`|Y| \cdot |N \setminus X|`

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{GeneralizedFleiss} =
            \frac{ad-bc}{\mu_{products~of~marginals}}

    And the products of marginals are:

        - ``a`` : :math:`p_1q_1 = (a+b)(c+d)` & :math:`p_2q_2 = (a+c)(b+d)`
        - ``b`` : :math:`p_1p_2 = (a+b)(a+c)` & :math:`q_1q_2 = (c+d)(b+d)`
        - ``c`` : :math:`p_1q_2 = (a+b)(b+d)` & :math:`p_2q_1 = (a+c)(c+d)`

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        mean_func: str = 'arithmetic',
        marginals: str = 'a',
        proportional: bool = False,
        **kwargs: Any
    ) -> None:
        """Initialize GeneralizedFleiss instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        mean_func : str or function
            Specifies the mean function to use. A function taking a list of
            numbers as its only required argument may be supplied, or one of
            the following strings will select the specified mean function
            from :py:mod:`abydos.stats`:

                - ``arithmetic`` employs :py:func:`amean`, and this measure
                  will be identical to :py:class:`MaxwellPilliner` with
                  otherwise default parameters
                - ``geometric`` employs :py:func:`gmean`, and this measure
                  will be identical to :py:class:`PearsonPhi` with otherwise
                  default parameters
                - ``harmonic`` employs :py:func:`hmean`, and this measure
                  will be identical to :py:class:`Fleiss` with otherwise
                  default parameters
                - ``ag`` employs the arithmetic-geometric mean
                  :py:func:`agmean`
                - ``gh`` employs the geometric-harmonic mean
                  :py:func:`ghmean`
                - ``agh`` employs the arithmetic-geometric-harmonic mean
                  :py:func:`aghmean`
                - ``contraharmonic`` employs the contraharmonic mean
                  :py:func:`cmean`
                - ``identric`` employs the identric mean :py:func:`imean`
                - ``logarithmic`` employs the logarithmic mean
                  :py:func:`lmean`
                - ``quadratic`` employs the quadratic mean :py:func:`qmean`
                - ``heronian`` employs the Heronian mean
                  :py:func:`heronian_mean`
                - ``hoelder`` employs the Hölder mean :py:func:`hoelder_mean`
                - ``lehmer`` employs the Lehmer mean :py:func:`lehmer_mean`
                - ``seiffert`` employs Seiffert's mean
                  :py:func:`seiffert_mean`
        marginals : str
            Specifies the pairs of marginals to multiply and calculate the
            resulting mean of. Can be:

                - ``a`` : :math:`p_1q_1 = (a+b)(c+d)` &
                  :math:`p_2q_2 = (a+c)(b+d)`
                - ``b`` : :math:`p_1p_2 = (a+b)(a+c)` &
                  :math:`q_1q_2 = (c+d)(b+d)`
                - ``c`` : :math:`p_1q_2 = (a+b)(b+d)` &
                  :math:`p_2q_1 = (a+c)(c+d)`
        proportional : bool
            If true, each of the values, :math:`a, b, c, d` and the marginals
            will be divided by the total :math:`a+b+c+d=n`.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        # Accept either a callable or the name of an entry in the
        # module-level ``means`` table; the cast tells the type checker
        # the resolved value is a mean function.
        self._mean_func = cast(
            Callable[[Iterable[float]], float],
            mean_func if callable(mean_func) else means[mean_func],
        )
        self._marginals = marginals
        self._proportional = proportional
        super(GeneralizedFleiss, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Generalized Fleiss correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Generalized Fleiss correlation

        Examples
        --------
        >>> cmp = GeneralizedFleiss()
        >>> cmp.corr('cat', 'hat')
        0.49743589743589745
        >>> cmp.corr('Niall', 'Neil')
        0.35921989956790845
        >>> cmp.corr('aluminum', 'Catalan')
        0.10803030303030303
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483954


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        # Optionally rescale the table entries to proportions of the
        # population BEFORE the marginal products are formed.
        if self._proportional:
            a /= n
            b /= n
            c /= n
            d /= n

        num = a * d - b * c
        if not num:
            return 0.0

        # Choose the pair of marginal products whose mean normalizes
        # the numerator; see the class docstring for the three options.
        if self._marginals == 'b':
            mps = [(a + b) * (a + c), (c + d) * (b + d)]
        elif self._marginals == 'c':
            mps = [(a + b) * (b + d), (a + c) * (c + d)]
        else:
            mps = [(a + b) * (c + d), (a + c) * (b + d)]

        mean_value = self._mean_func(mps)

        return num / mean_value

    def sim(self, src: str, tar: str) -> float:
        """Return the Generalized Fleiss similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Generalized Fleiss similarity

        Examples
        --------
        >>> cmp = GeneralizedFleiss()
        >>> cmp.sim('cat', 'hat')
        0.7487179487179487
        >>> cmp.sim('Niall', 'Neil')
        0.6796099497839543
        >>> cmp.sim('aluminum', 'Catalan')
        0.5540151515151515
        >>> cmp.sim('ATCG', 'TAGC')
        0.496790757381258


        .. versionadded:: 0.4.0

        """
        # Map the correlation, in [-1, 1], linearly onto [0, 1].
        return (1.0 + self.corr(src, tar)) / 2.0
if __name__ == '__main__':
    # Self-test: run the examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_gilbert.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._gilbert.
Gilbert correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Gilbert']
class Gilbert(_TokenDistance):
    r"""Gilbert correlation.

    For two sets X and Y and a population N, the Gilbert correlation
    :cite:`Gilbert:1884` is

        .. math::

            corr_{Gilbert}(X, Y) =
            \frac{2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|N|^2 - |X \cap Y|^2 + |X \setminus Y|^2 + |Y \setminus X|^2 -
            |(N \setminus X) \setminus Y|^2}

    For lack of access to the original, this formula is based on the
    concurring formulae presented in :cite:`Peirce:1884` and
    :cite:`Doolittle:1884`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{Gilbert} =
            \frac{2(ad-bc)}{n^2-a^2+b^2+c^2-d^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Gilbert instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(Gilbert, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gilbert correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert correlation

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.corr('cat', 'hat')
        0.3310580204778157
        >>> cmp.corr('Niall', 'Neil')
        0.21890122402504983
        >>> cmp.corr('aluminum', 'Catalan')
        0.057094811018577836
        >>> cmp.corr('ATCG', 'TAGC')
        -0.003198976327575176


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        a = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        n = self._population_unique_card()

        # (an - (a+b)(a+c)) / (n(a+b+c) - (a+b)(a+c)) is algebraically
        # identical to the class docstring's 2(ad-bc) form, since
        # an - (a+b)(a+c) = ad - bc and
        # n(a+b+c) - (a+b)(a+c) = (n^2 - a^2 + b^2 + c^2 - d^2) / 2.
        expected = (a + src_only) * (a + tar_only)
        num = a * n - expected
        if not num:
            return 0.0
        return num / (n * (a + src_only + tar_only) - expected)

    def sim(self, src: str, tar: str) -> float:
        """Return the Gilbert similarity of two strings.

        The correlation, which lies in [-1, 1], is mapped linearly
        onto [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert similarity

        Examples
        --------
        >>> cmp = Gilbert()
        >>> cmp.sim('cat', 'hat')
        0.6655290102389079
        >>> cmp.sim('Niall', 'Neil')
        0.6094506120125249
        >>> cmp.sim('aluminum', 'Catalan')
        0.5285474055092889
        >>> cmp.sim('ATCG', 'TAGC')
        0.4984005118362124


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
if __name__ == '__main__':
    # Self-test: run the examples embedded in this module's docstrings.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_gilbert_wells.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._gilbert_wells.
Gilbert & Wells similarity
"""
from math import factorial, log, pi
from sys import float_info
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GilbertWells']
# Machine epsilon: used to clamp marginal products away from zero
# before taking logarithms or dividing.
_epsilon = float_info.epsilon
class GilbertWells(_TokenDistance):
    r"""Gilbert & Wells similarity.

    Given two sets X and Y drawn from a population N, the Gilbert & Wells
    similarity :cite:`Gilbert:1966` is

        .. math::

            sim_{GilbertWells}(X, Y) =
            ln \frac{|N|^3}{2\pi |X| \cdot |Y| \cdot
            |N \setminus Y| \cdot |N \setminus X|} + 2ln
            \frac{|N|! \cdot |X \cap Y|! \cdot |X \setminus Y|! \cdot
            |Y \setminus X|! \cdot |(N \setminus X) \setminus Y|!}
            {|X|! \cdot |Y|! \cdot |N \setminus Y|! \cdot |N \setminus X|!}

    Expressed in :ref:`2x2 confusion table terms `, with a+b+c+d=n,
    this becomes

        .. math::

            sim_{GilbertWells} =
            ln \frac{n^3}{2\pi (a+b)(a+c)(b+d)(c+d)} +
            2ln \frac{n!a!b!c!d!}{(a+b)!(a+c)!(b+d)!(c+d)!}

    Notes
    -----
    Most catalogs of similarity & distance measures, among them
    :cite:`Hubalek:1982,Choi:2010,Morris:2012`, give a rather different
    formula, which would be :math:`ln~a - ln~b - ln \frac{a+b}{n} -
    ln \frac{a+c}{n} = ln\frac{an}{(a+b)(a+c)}`. However, neither that
    formula nor anything similar or equivalent to it appears anywhere in
    the cited work, :cite:`Gilbert:1966`. That alternative measure is
    implemented as :class:``UnknownF``.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        **kwargs: Any
    ) -> None:
        """Initialize GilbertWells instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.

        .. versionadded:: 0.4.0
        """
        super().__init__(alphabet=alphabet, tokenizer=tokenizer, **kwargs)

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Gilbert & Wells similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gilbert & Wells similarity

        Examples
        --------
        >>> cmp = GilbertWells()
        >>> cmp.sim_score('cat', 'hat')
        20.17617447734673
        >>> cmp.sim_score('Niall', 'Neil')
        16.717742356982733
        >>> cmp.sim_score('aluminum', 'Catalan')
        5.495096667524002
        >>> cmp.sim_score('ATCG', 'TAGC')
        1.6845961909440712

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        # 2x2 confusion table cells and the total population size.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        # First term: ln(n^3 / (2*pi*(a+b)(a+c)(b+d)(c+d))), with every
        # marginal total clamped to machine epsilon so neither the division
        # nor the log can blow up on a zero.
        denominator = 2 * pi
        for marginal in (a + b, a + c, b + d, c + d):
            denominator *= max(_epsilon, marginal)
        log_term = log(max(_epsilon, n ** 3 / denominator))

        # Second term: twice the log of the factorial ratio
        # n!a!b!c!d! / ((a+b)!(a+c)!(b+d)!(c+d)!), accumulated entirely in
        # log space so the factorials are never multiplied directly.
        log_ratio = 0.0
        for cell in (n, a, b, c, d):
            log_ratio += log(factorial(cell))
        for marginal in (a + b, a + c, b + d, c + d):
            log_ratio -= log(factorial(marginal))

        return log_term + 2 * log_ratio

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Gilbert & Wells similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Gilbert & Wells similarity

        Examples
        --------
        >>> cmp = GilbertWells()
        >>> cmp.sim('cat', 'hat')
        0.4116913723876516
        >>> cmp.sim('Niall', 'Neil')
        0.2457247406857589
        >>> cmp.sim('aluminum', 'Catalan')
        0.05800001636414742
        >>> cmp.sim('ATCG', 'TAGC')
        0.028716013247135602

        .. versionadded:: 0.4.0
        """
        # Identical strings are maximally similar; an empty string shares
        # nothing with anything.
        if src == tar:
            return 1.0
        if not (src and tar):
            return 0.0

        # Normalize the raw score by the larger of the two self-similarities.
        self_similarity = max(
            self.sim_score(src, src), self.sim_score(tar, tar)
        )
        return self.sim_score(src, tar) / self_similarity
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_gini_i.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._gini_i.
Gini I correlation
"""
from sys import float_info
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GiniI']
_epsilon = float_info.epsilon
class GiniI(_TokenDistance):
    r"""Gini I correlation.

    For two sets X and Y and a population N, Gini I correlation
    :cite:`Gini:1912`, using the formula from :cite:`Goodman:1959`, is

        .. math::

            corr_{GiniI}(X, Y) =
            \frac{\frac{|X \cap Y|+|(N \setminus X) \setminus Y|}{|N|} -
            (\frac{|X| \cdot |Y|}{|N|} +
            \frac{|N \setminus Y| \cdot |N \setminus X|}{|N|})}
            {\sqrt{(1-(\frac{|X|}{|N|}^2+\frac{|N \setminus X|}{|N|}^2)) \cdot
            (1-(\frac{|Y|}{|N|}^2 +
            \frac{|N \setminus Y|}{|N|}^2))}}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    after each term has been converted to a proportion by dividing by n, this
    is

        .. math::

            corr_{GiniI} =
            \frac{(a+d)-((a+b)(a+c) + (b+d)(c+d))}
            {\sqrt{(1-((a+b)^2+(c+d)^2))\cdot(1-((a+c)^2+(b+d)^2))}}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize GiniI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer `
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super(GiniI, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gini I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gini I correlation

        Examples
        --------
        >>> cmp = GiniI()
        >>> cmp.corr('cat', 'hat')
        0.49722814498933254
        >>> cmp.corr('Niall', 'Neil')
        0.39649090262533215
        >>> cmp.corr('aluminum', 'Catalan')
        0.14887105223941113
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237489576

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        # 2x2 confusion table cells: shared tokens, tokens only in src,
        # tokens only in tar, and tokens in neither.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # The marginal totals are normalized proportions, so _epsilon keeps
        # each factor under the radical strictly positive even when a
        # marginal proportion equals 1.
        return ((a + d) - ((a + b) * (a + c) + (c + d) * (b + d))) / (
            (1 + _epsilon - ((a + b) ** 2 + (c + d) ** 2))
            * (1 + _epsilon - ((a + c) ** 2 + (b + d) ** 2))
        ) ** 0.5

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Gini I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Gini I similarity

        Examples
        --------
        >>> cmp = GiniI()
        >>> cmp.sim('cat', 'hat')
        0.7486140724946663
        >>> cmp.sim('Niall', 'Neil')
        0.6982454513126661
        >>> cmp.sim('aluminum', 'Catalan')
        0.5744355261197056
        >>> cmp.sim('ATCG', 'TAGC')
        0.4967907573812552

        .. versionadded:: 0.4.0
        """
        # Linearly map the correlation from [-1, 1] onto [0, 1].
        return (1.0 + self.corr(src, tar)) / 2.0
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_gini_ii.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._gini_ii.
Gini II correlation
"""
from sys import float_info
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GiniII']
_epsilon = float_info.epsilon
class GiniII(_TokenDistance):
    r"""Gini II correlation.

    For two sets X and Y and a population N, Gini II correlation
    :cite:`Gini:1915`, using the formula from :cite:`Goodman:1959`, is

        .. math::

            corr_{GiniII}(X, Y) =
            \frac{\frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|} -
            (\frac{|X| \cdot |Y|}{|N|} +
            \frac{|N \setminus Y| \cdot |N \setminus X|}{|N|})}
            {1 - |\frac{|Y \setminus X| - |X \setminus Y|}{|N|}|
            - (\frac{|X| \cdot |Y|}{|N|} +
            \frac{|N \setminus Y| \cdot |N \setminus X|}{|N|})}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    after each term has been converted to a proportion by dividing by n, this
    is

        .. math::

            corr_{GiniII} =
            \frac{(a+d) - ((a+b)(a+c) + (b+d)(c+d))}
            {1 - |b-c| - ((a+b)(a+c) + (b+d)(c+d))}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize GiniII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer `
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super(GiniII, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gini II correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gini II correlation

        Examples
        --------
        >>> cmp = GiniII()
        >>> cmp.corr('cat', 'hat')
        0.49722814498933254
        >>> cmp.corr('Niall', 'Neil')
        0.4240703425535771
        >>> cmp.corr('aluminum', 'Catalan')
        0.15701415701415936
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237489576

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        # 2x2 confusion table cells: shared tokens, tokens only in src,
        # tokens only in tar, and tokens in neither.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # _epsilon keeps the denominator strictly positive when the
        # remaining terms would cancel exactly.
        return ((a + d) - ((a + b) * (a + c) + (c + d) * (b + d))) / (
            (
                1
                + _epsilon
                - abs(b - c)
                - ((a + b) * (a + c) + (c + d) * (b + d))
            )
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Gini II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Gini II similarity

        Examples
        --------
        >>> cmp = GiniII()
        >>> cmp.sim('cat', 'hat')
        0.7486140724946663
        >>> cmp.sim('Niall', 'Neil')
        0.7120351712767885
        >>> cmp.sim('aluminum', 'Catalan')
        0.5785070785070797
        >>> cmp.sim('ATCG', 'TAGC')
        0.4967907573812552

        .. versionadded:: 0.4.0
        """
        # Linearly map the correlation from [-1, 1] onto [0, 1].
        return (1.0 + self.corr(src, tar)) / 2.0
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_goodall.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._goodall.
Goodall similarity
"""
from math import asin, pi
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Goodall']
class Goodall(_TokenDistance):
    r"""Goodall similarity.

    For two sets X and Y drawn from a population N, the Goodall similarity
    :cite:`Goodall:1967,Austin:1977` applies an angular (arcsine)
    transformation to Sokal & Michener's simple matching coefficient

        .. math::

            sim_{Goodall}(X, Y) = \frac{2}{\pi} \sin^{-1}\Big(
            \sqrt{\frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}}
            \Big)

    Expressed in :ref:`2x2 confusion table terms `, with a+b+c+d=n,
    this becomes

        .. math::

            sim_{Goodall} =\frac{2}{\pi} \sin^{-1}\Big(
            \sqrt{\frac{a + d}{n}}
            \Big)

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Goodall instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Goodall similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodall similarity

        Examples
        --------
        >>> cmp = Goodall()
        >>> cmp.sim('cat', 'hat')
        0.9544884026871964
        >>> cmp.sim('Niall', 'Neil')
        0.9397552079794624
        >>> cmp.sim('aluminum', 'Catalan')
        0.9117156301536503
        >>> cmp.sim('ATCG', 'TAGC')
        0.9279473952929225

        .. versionadded:: 0.4.0
        """
        # Identical strings are maximally similar by definition.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Matching cells of the confusion table: tokens shared by both
        # strings (a) plus tokens absent from both (d).
        matches = self._intersection_card() + self._total_complement_card()
        population = self._population_unique_card()

        # Arcsine transformation of the simple matching proportion.
        return 2 / pi * asin((matches / population) ** 0.5)
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_goodman_kruskal_lambda.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._goodman_kruskal_lambda.
Goodman & Kruskal's Lambda similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GoodmanKruskalLambda']
class GoodmanKruskalLambda(_TokenDistance):
    r"""Goodman & Kruskal's Lambda similarity.

    Given two sets X and Y drawn from a population N, Goodman & Kruskal's
    lambda :cite:`Goodman:1954` is

        .. math::

            sim_{GK_\lambda}(X, Y) =
            \frac{\frac{1}{2}(max(|X \cap Y|, |X \setminus Y|)+
            max(|Y \setminus X|, |(N \setminus X) \setminus Y|)+
            max(|X \cap Y|, |Y \setminus X|)+
            max(|X \setminus Y|, |(N \setminus X) \setminus Y|))-
            (max(|X|, |N \setminus X|)+max(|Y|, |N \setminus Y|))}
            {|N|-\frac{1}{2}(max(|X|, |N \setminus X|)+
            max(|Y|, |N \setminus Y|))}

    Expressed in :ref:`2x2 confusion table terms `, with a+b+c+d=n,
    this becomes

        .. math::

            sim_{GK_\lambda} =
            \frac{\frac{1}{2}((max(a,b)+max(c,d)+max(a,c)+max(b,d))-
            (max(a+b,c+d)+max(a+c,b+d)))}
            {n-\frac{1}{2}(max(a+b,c+d)+max(a+c,b+d))}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize GoodmanKruskalLambda instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return Goodman & Kruskal's Lambda similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Lambda similarity

        Examples
        --------
        >>> cmp = GoodmanKruskalLambda()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.0
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # Identical strings are maximally similar by definition.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # 2x2 confusion table cells.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Sum of the maxima over the table's rows and columns.
        cell_maxima = max(a, b) + max(c, d) + max(a, c) + max(b, d)
        # Sum of the maxima over the table's marginal totals.
        marginal_maxima = max(a + c, b + d) + max(a + b, c + d)

        numerator = cell_maxima - marginal_maxima
        if not numerator:
            return 0.0
        return numerator / (2 * (a + b + c + d) - marginal_maxima)
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_goodman_kruskal_lambda_r.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._goodman_kruskal_lambda_r.
Goodman & Kruskal Lambda-r correlation.
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GoodmanKruskalLambdaR']
class GoodmanKruskalLambdaR(_TokenDistance):
    r"""Goodman & Kruskal Lambda-r correlation.

    Given two sets X and Y drawn from a population N, the Goodman & Kruskal
    :math:`\lambda_r` correlation :cite:`Goodman:1954` is

        .. math::

            corr_{GK_{\lambda_r}}(X, Y) =
            \frac{|X \cap Y| + |(N \setminus X) \setminus Y| -
            \frac{1}{2}(max(|X|, |N \setminus X|) + max(|Y|, |N \setminus Y|))}
            {|N| -
            \frac{1}{2}(max(|X|, |N \setminus X|) + max(|Y|, |N \setminus Y|))}

    Expressed in :ref:`2x2 confusion table terms `, with a+b+c+d=n,
    this becomes

        .. math::

            corr_{GK_{\lambda_r}} =
            \frac{a + d - \frac{1}{2}(max(a+b,c+d)+max(a+c,b+d))}
            {n - \frac{1}{2}(max(a+b,c+d)+max(a+c,b+d))}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize GoodmanKruskalLambdaR instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return Goodman & Kruskal Lambda-r correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal Lambda-r correlation

        Examples
        --------
        >>> cmp = GoodmanKruskalLambdaR()
        >>> cmp.corr('cat', 'hat')
        0.0
        >>> cmp.corr('Niall', 'Neil')
        -0.2727272727272727
        >>> cmp.corr('aluminum', 'Catalan')
        -0.7647058823529411
        >>> cmp.corr('ATCG', 'TAGC')
        -1.0

        .. versionadded:: 0.4.0
        """
        # Identical strings are perfectly correlated.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # When either string yields no tokens, report perfect discordance.
        if not (self._src_card() and self._tar_card()):
            return -1.0

        # 2x2 confusion table cells and the total population size.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        # Twice the half-sum of the marginal maxima (kept doubled so the
        # arithmetic stays in integers).
        marginal_maxima = max(a + b, c + d) + max(a + c, b + d)

        numerator = 2 * (a + d) - marginal_maxima
        if not numerator:
            return 0.0
        return numerator / (2 * n - marginal_maxima)

    def sim(self, src: str, tar: str) -> float:
        """Return Goodman & Kruskal Lambda-r similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal Lambda-r similarity

        Examples
        --------
        >>> cmp = GoodmanKruskalLambdaR()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.36363636363636365
        >>> cmp.sim('aluminum', 'Catalan')
        0.11764705882352944
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # Linearly map the correlation from [-1, 1] onto [0, 1].
        correlation = self.corr(src, tar)
        return (1.0 + correlation) / 2.0
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_goodman_kruskal_tau_a.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._goodman_kruskal_tau_a.
Goodman & Kruskal's Tau A similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GoodmanKruskalTauA']
class GoodmanKruskalTauA(_TokenDistance):
    r"""Goodman & Kruskal's Tau A similarity.

    Given two sets X and Y drawn from a population N, Goodman & Kruskal's
    :math:`\tau_a` similarity :cite:`Goodman:1954`, by analogy with
    :math:`\tau_b`, is

        .. math::

            sim_{GK_{\tau_a}}(X, Y) =
            \frac{\frac{\frac{|X \cap Y|}{|N|}^2 +
            \frac{|Y \setminus X|}{|N|}^2}{\frac{|Y|}{|N|}}+
            \frac{\frac{|X \setminus Y|}{|N|}^2 +
            \frac{|(N \setminus X) \setminus Y|}{|N|}^2}
            {\frac{|N \setminus X|}{|N|}} -
            (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)}
            {1 - (\frac{|X|}{|N|}^2 + \frac{|N \setminus X|}{|N|}^2)}

    In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
    after each term has been converted to a proportion by dividing by n, this
    is

        .. math::

            sim_{GK_{\tau_a}} =
            \frac{
            \frac{a^2 + c^2}{a+c} +
            \frac{b^2 + d^2}{b+d} -
            ((a+b)^2 + (c+d)^2)}
            {1 - ((a+b)^2 + (c+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize GoodmanKruskalTauA instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet ` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type ` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer `
            description in :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return Goodman & Kruskal's Tau A similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau A similarity

        Examples
        --------
        >>> cmp = GoodmanKruskalTauA()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.22137604585914503
        >>> cmp.sim('aluminum', 'Catalan')
        0.05991264724130685
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05

        .. versionadded:: 0.4.0
        """
        self._tokenize(src, tar)

        # 2x2 confusion table cells.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # A zero src marginal or zero tar-column marginal makes the measure
        # undefined; report no similarity.
        if not (a + b) or not (a + c):
            return 0.0

        # Per-column contributions; the complement column is only divided
        # through when it is nonzero (b + d may be zero in that case).
        tar_column = (a * a + c * c) / (a + c)
        complement_column = b * b + d * d
        if complement_column:
            complement_column /= b + d

        numerator = tar_column + complement_column - (a + b) ** 2 - (c + d) ** 2
        # Guard against tiny negative residue from float cancellation.
        if numerator > 1e-14:
            return numerator / (1 - (a + b) ** 2 - (c + d) ** 2)
        return 0.0  # pragma: no cover
if __name__ == '__main__':
    # Run this module's doctest examples when executed as a script.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_goodman_kruskal_tau_b.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._goodman_kruskal_tau_b.
Goodman & Kruskal's Tau B similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GoodmanKruskalTauB']
class GoodmanKruskalTauB(_TokenDistance):
    r"""Goodman & Kruskal's Tau B similarity.

    For two sets X and Y and a population N, Goodman & Kruskal's
    :math:`\tau_b` similarity :cite:`Goodman:1954` is

        .. math::

            sim_{GK_{\tau_b}}(X, Y) =
            \frac{\frac{\frac{|X \cap Y|}{|N|}^2 +
            \frac{|X \setminus Y|}{|N|}^2}{\frac{|X|}{|N|}}+
            \frac{\frac{|Y \setminus X|}{|N|}^2 +
            \frac{|(N \setminus X) \setminus Y|}{|N|}^2}
            {\frac{|N \setminus X|}{|N|}} -
            (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}
            {1 - (\frac{|Y|}{|N|}^2 + \frac{|N \setminus Y|}{|N|}^2)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    after each term has been converted to a proportion by dividing by n,
    this is

        .. math::

            sim_{GK_{\tau_b}} =
            \frac{\frac{a^2 + b^2}{a+b} +
            \frac{c^2 + d^2}{c+d} -
            ((a+c)^2 + (b+d)^2)}
            {1 - ((a+c)^2 + (b+d)^2)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize GoodmanKruskalTauB instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result.
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. The default, ``proportional``,
            converts each confusion-table cell to a proportion of the
            population, as the measure requires.
            See :ref:`normalizer <normalizer>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return Goodman & Kruskal's Tau B similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Goodman & Kruskal's Tau B similarity

        Examples
        --------
        >>> cmp = GoodmanKruskalTauB()
        >>> cmp.sim('cat', 'hat')
        0.3304969657208484
        >>> cmp.sim('Niall', 'Neil')
        0.2346006486710202
        >>> cmp.sim('aluminum', 'Catalan')
        0.06533810992392582
        >>> cmp.sim('ATCG', 'TAGC')
        4.119695274745721e-05

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        # Confusion-table cells; with the default 'proportional'
        # normalizer these are already proportions of the population.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # The measure is undefined when either first-row or first-column
        # marginal is empty.
        if not (a + b) or not (a + c):
            return 0.0

        row_term = (a * a + b * b) / (a + b)
        col_term = c * c + d * d
        if col_term:
            col_term /= c + d
        num = row_term + col_term - (a + c) ** 2 - (b + d) ** 2

        # Guard against tiny negative residue from float rounding.
        if num > 1e-14:
            return num / (1 - (a + c) ** 2 - (b + d) ** 2)
        return 0.0  # pragma: no cover
if __name__ == '__main__':
    import doctest

    # Run this module's embedded doctests when executed as a script.
    doctest.testmod()
================================================
FILE: abydos/distance/_gotoh.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._gotoh.
Gotoh score
"""
from typing import Any, Callable, Optional, cast
from numpy import float_ as np_float
from numpy import zeros as np_zeros
from ._needleman_wunsch import NeedlemanWunsch
__all__ = ['Gotoh']
class Gotoh(NeedlemanWunsch):
    """Gotoh score.

    The Gotoh score :cite:`Gotoh:1982` is essentially Needleman-Wunsch with
    affine gap penalties: opening a new alignment gap costs ``gap_open``,
    while each extension of an already-open gap costs only ``gap_ext``.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        gap_open: float = 1,
        gap_ext: float = 0.4,
        sim_func: Optional[Callable[[str, str], float]] = None,
        **kwargs: Any
    ) -> None:
        """Initialize Gotoh instance.

        Parameters
        ----------
        gap_open : float
            The cost of an open alignment gap (1 by default)
        gap_ext : float
            The cost of an alignment gap extension (0.4 by default)
        sim_func : function
            A function that returns the similarity of two characters
            (identity similarity by default)
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0

        """
        super(Gotoh, self).__init__(**kwargs)
        self._gap_open = gap_open
        self._gap_ext = gap_ext
        # Fall back on the identity similarity matrix inherited from
        # Needleman-Wunsch when no similarity function is supplied.
        self._sim_func = cast(
            Callable[[str, str], float],
            NeedlemanWunsch.sim_matrix if sim_func is None else sim_func,
        )  # type: Callable[[str, str], float]

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Gotoh score of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Gotoh score

        Examples
        --------
        >>> cmp = Gotoh()
        >>> cmp.sim_score('cat', 'hat')
        2.0
        >>> cmp.sim_score('Niall', 'Neil')
        1.0
        >>> round(cmp.sim_score('aluminum', 'Catalan'), 12)
        -0.4
        >>> cmp.sim_score('cat', 'hat')
        2.0

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Three DP matrices over prefixes of src (rows) and tar (cols):
        #   d_mat[i, j]: best alignment score ending in a (mis)match
        #   p_mat[i, j]: best score ending in a gap in tar (vertical move)
        #   q_mat[i, j]: best score ending in a gap in src (horizontal move)
        d_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
        p_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)
        q_mat = np_zeros((len(src) + 1, len(tar) + 1), dtype=np_float)

        # -inf marks states that cannot be reached (e.g. a match at the
        # border, or a gap state with nothing to extend).
        d_mat[0, 0] = 0
        p_mat[0, 0] = float('-inf')
        q_mat[0, 0] = float('-inf')
        for i in range(1, len(src) + 1):
            d_mat[i, 0] = float('-inf')
            # An affine gap of length i: one open plus (i - 1) extensions.
            p_mat[i, 0] = -self._gap_open - self._gap_ext * (i - 1)
            q_mat[i, 0] = float('-inf')
            if len(tar) > 1:
                q_mat[i, 1] = -self._gap_open
        for j in range(1, len(tar) + 1):
            d_mat[0, j] = float('-inf')
            p_mat[0, j] = float('-inf')
            if len(src) > 1:
                p_mat[1, j] = -self._gap_open
            q_mat[0, j] = -self._gap_open - self._gap_ext * (j - 1)

        for i in range(1, len(src) + 1):
            for j in range(1, len(tar) + 1):
                sim_val = self._sim_func(src[i - 1], tar[j - 1])
                d_mat[i, j] = max(
                    d_mat[i - 1, j - 1] + sim_val,
                    p_mat[i - 1, j - 1] + sim_val,
                    q_mat[i - 1, j - 1] + sim_val,
                )
                # Either open a new gap from a match state or extend the
                # running gap at its cheaper extension cost.
                p_mat[i, j] = max(
                    d_mat[i - 1, j] - self._gap_open,
                    p_mat[i - 1, j] - self._gap_ext,
                )
                q_mat[i, j] = max(
                    d_mat[i, j - 1] - self._gap_open,
                    q_mat[i, j - 1] - self._gap_ext,
                )

        i, j = (n - 1 for n in d_mat.shape)
        return cast(float, max(d_mat[i, j], p_mat[i, j], q_mat[i, j]))

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Gotoh score of two strings.

        The raw score is normalized by the geometric mean of each string's
        self-similarity score and clamped below at 0.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Gotoh score

        Examples
        --------
        >>> cmp = Gotoh()
        >>> cmp.sim('cat', 'hat')
        0.6666666666666667
        >>> cmp.sim('Niall', 'Neil')
        0.22360679774997896
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.0
        >>> cmp.sim('cat', 'hat')
        0.6666666666666667

        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        # An empty string's self-similarity score is 0, which would make
        # the normalizing denominator below 0 and raise ZeroDivisionError.
        if not src or not tar:
            return 0.0
        return max(0.0, self.sim_score(src, tar)) / (
            self.sim_score(src, src) ** 0.5 * self.sim_score(tar, tar) ** 0.5
        )
if __name__ == '__main__':
    import doctest

    # Run this module's embedded doctests when executed as a script.
    doctest.testmod()
================================================
FILE: abydos/distance/_gower_legendre.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._gower_legendre.
Gower & Legendre similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GowerLegendre']
class GowerLegendre(_TokenDistance):
    r"""Gower & Legendre similarity.

    For two sets X and Y and a population N, the Gower & Legendre similarity
    :cite:`Gower:1986` is

        .. math::

            sim_{GowerLegendre}(X, Y) =
            \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}
            {|X \cap Y| + |(N \setminus X) \setminus Y| +
            \theta \cdot |X \triangle Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{GowerLegendre} =
            \frac{a+d}{a+\theta(b+c)+d}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        theta: float = 0.5,
        **kwargs: Any
    ) -> None:
        """Initialize GowerLegendre instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result.
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        theta : float
            The weight to place on the symmetric difference (0.5 by
            default).
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        self.theta = theta
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Gower & Legendre similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gower & Legendre similarity

        Examples
        --------
        >>> cmp = GowerLegendre()
        >>> cmp.sim('cat', 'hat')
        0.9974424552429667
        >>> cmp.sim('Niall', 'Neil')
        0.9955156950672646
        >>> cmp.sim('aluminum', 'Catalan')
        0.9903536977491961
        >>> cmp.sim('ATCG', 'TAGC')
        0.993581514762516

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # a+d (agreements) and b+c (the symmetric difference), with the
        # latter down-weighted by theta in the denominator.
        agreements = self._intersection_card() + self._total_complement_card()
        differences = self._src_only_card() + self._tar_only_card()
        return agreements / (agreements + self.theta * differences)
if __name__ == '__main__':
    import doctest

    # Run this module's embedded doctests when executed as a script.
    doctest.testmod()
================================================
FILE: abydos/distance/_guth.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._guth.
Guth matching algorithm
"""
from typing import Any, List, Optional, Union
from ._distance import _Distance
from ..tokenizer import QGrams, _Tokenizer
__all__ = ['Guth']
class Guth(_Distance):
    r"""Guth matching.

    Guth matching :cite:`Guth:1976` uses a simple positional matching rule
    list to determine whether two names match. Following the original, the
    :meth:`.sim_score` method returns only 1.0 for matching or 0.0 for
    non-matching.

    The :meth:`.sim` method instead penalizes more distant matches and never
    outrightly declares two names non-matching unless no matches can be made
    in the two strings.

    Tokens other than single characters can be matched by specifying a
    tokenizer during initialization or setting the qval parameter.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize Guth instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.

        .. versionadded:: 0.4.1

        """
        super(Guth, self).__init__(**kwargs)
        self.params['tokenizer'] = tokenizer
        # A qval argument overrides any supplied tokenizer with a QGrams
        # tokenizer of that q value.
        if 'qval' in self.params:
            self.params['tokenizer'] = QGrams(
                qval=self.params['qval'], start_stop='$#', skip=0, scaler=None
            )

    def _token_at(
        self, name: Union[List[str], str], pos: int
    ) -> Optional[str]:
        """Return the token of name at position pos.

        Parameters
        ----------
        name : str or list
            A string (or list) from which to return a token
        pos : int
            The position of the token to return

        Returns
        -------
        str
            The requested token or None if the position is invalid

        .. versionadded:: 0.4.1

        """
        # Out-of-range positions (either side) yield None rather than
        # raising or wrapping around via negative indexing.
        if pos < 0:
            return None
        if pos >= len(name):
            return None
        return name[pos]

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Guth matching score of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Guth matching score (1.0 if matching, otherwise 0.0)

        Examples
        --------
        >>> cmp = Guth()
        >>> cmp.sim_score('cat', 'hat')
        1.0
        >>> cmp.sim_score('Niall', 'Neil')
        1.0
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0
        >>> cmp.sim_score('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0
        if self.params['tokenizer']:
            # NOTE: after tokenization src/tar are lists of tokens rather
            # than strings; _token_at and slicing work on both.
            src = self.params['tokenizer'].tokenize(src).get_list()
            tar = self.params['tokenizer'].tokenize(tar).get_list()

        # Every source position must be matched by one of the rules below,
        # otherwise the names are declared non-matching (the loop's `break`
        # skips the `else` clause).
        for pos in range(len(src)):
            # Rule: src token appears in tar within a window from one
            # position behind to two positions ahead.
            s = self._token_at(src, pos)
            if s and s in set(tar[max(0, pos - 1) : pos + 3]):
                continue
            # Rule: tar token appears in src within the same window.
            t = self._token_at(tar, pos)
            if t and t in set(src[max(0, pos - 1) : pos + 3]):
                continue
            # Rule: the tokens one position ahead in both strings match.
            s = self._token_at(src, pos + 1)
            t = self._token_at(tar, pos + 1)
            if s and t and s == t:
                continue
            # Rule: the tokens two positions ahead in both strings match.
            s = self._token_at(src, pos + 2)
            t = self._token_at(tar, pos + 2)
            if s and t and s == t:
                continue
            break
        else:
            return 1.0
        return 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the relative Guth similarity of two strings.

        This deviates from the algorithm described in :cite:`Guth:1976` in
        that more distant matches are penalized, so that less similar terms
        score lower than more similar terms.

        If no match is found for a particular token in the source string,
        this does not result in an automatic 0.0 score. Rather, the score is
        further penalized towards 0.0.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Relative Guth matching score

        Examples
        --------
        >>> cmp = Guth()
        >>> cmp.sim('cat', 'hat')
        0.8666666666666667
        >>> cmp.sim('Niall', 'Neil')
        0.8800000000000001
        >>> cmp.sim('aluminum', 'Catalan')
        0.4
        >>> cmp.sim('ATCG', 'TAGC')
        0.8

        .. versionadded:: 0.4.1

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0
        if self.params['tokenizer']:
            src = self.params['tokenizer'].tokenize(src).get_list()
            tar = self.params['tokenizer'].tokenize(tar).get_list()

        # Weighted version of the sim_score rule cascade: each source
        # position earns full credit (1.0) for an exact positional match
        # and progressively less for more distant matches. Rule order
        # matters -- only the first matching rule's weight is credited.
        score = 0.0
        for pos in range(len(src)):
            # Exact positional match: full credit.
            s = self._token_at(src, pos)
            t = self._token_at(tar, pos)
            if s and t and s == t:
                score += 1.0
                continue
            # src token found one position ahead in tar.
            t = self._token_at(tar, pos + 1)
            if s and t and s == t:
                score += 0.8
                continue
            # src token found two positions ahead in tar.
            t = self._token_at(tar, pos + 2)
            if s and t and s == t:
                score += 0.6
                continue
            # src token found one position behind in tar.
            t = self._token_at(tar, pos - 1)
            if s and t and s == t:
                score += 0.8
                continue
            # tar token found one position behind in src.
            s = self._token_at(src, pos - 1)
            t = self._token_at(tar, pos)
            if s and t and s == t:
                score += 0.8
                continue
            # tar token found one position ahead in src.
            s = self._token_at(src, pos + 1)
            if s and t and s == t:
                score += 0.8
                continue
            # tar token found two positions ahead in src.
            s = self._token_at(src, pos + 2)
            if s and t and s == t:
                score += 0.6
                continue
            # Both tokens one position ahead match.
            s = self._token_at(src, pos + 1)
            t = self._token_at(tar, pos + 1)
            if s and t and s == t:
                score += 0.6
                continue
            # Both tokens two positions ahead match.
            s = self._token_at(src, pos + 2)
            t = self._token_at(tar, pos + 2)
            if s and t and s == t:
                score += 0.2
                continue
        # Normalize by source length so the score falls in [0, 1].
        return score / len(src)
if __name__ == '__main__':
    import doctest

    # Run this module's embedded doctests when executed as a script.
    doctest.testmod()
================================================
FILE: abydos/distance/_guttman_lambda_a.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._guttman_lambda_a.
Guttman's Lambda A similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GuttmanLambdaA']
class GuttmanLambdaA(_TokenDistance):
    r"""Guttman's Lambda A similarity.

    For two sets X and Y and a population N, Guttman's :math:`\lambda_a`
    similarity :cite:`Guttman:1941` is

        .. math::

            sim_{Guttman_{\lambda_a}}(X, Y) =
            \frac{max(|X \cap Y|, |Y \setminus X|) + max(|X \setminus Y|,
            |(N \setminus X) \setminus Y|) - max(|X|, |N \setminus X|)}
            {|N| - max(|X|, |N \setminus X|)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Guttman_{\lambda_a}} =
            \frac{max(a, c) + max(b, d) - max(a+b, c+d)}{n - max(a+b, c+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize GuttmanLambdaA instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result.
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Guttman Lambda A similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Guttman's Lambda A similarity

        Examples
        --------
        >>> cmp = GuttmanLambdaA()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.0
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        # Larger of the two row marginals, max(|X|, |N \ X|).
        row_max = max(a + b, c + d)
        # Rounding absorbs float noise from fuzzy/soft intersections.
        numerator = round(float(max(a, c) + max(b, d) - row_max), 15)
        if numerator > 1e-8:
            return numerator / float(n - row_max)
        return 0.0
if __name__ == '__main__':
    import doctest

    # Run this module's embedded doctests when executed as a script.
    doctest.testmod()
================================================
FILE: abydos/distance/_guttman_lambda_b.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._guttman_lambda_b.
Guttman's Lambda B similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GuttmanLambdaB']
class GuttmanLambdaB(_TokenDistance):
    r"""Guttman's Lambda B similarity.

    For two sets X and Y and a population N, Guttman's :math:`\lambda_b`
    similarity :cite:`Guttman:1941` is

        .. math::

            sim_{Guttman_{\lambda_b}}(X, Y) =
            \frac{max(|X \cap Y|, |X \setminus Y|) + max(|Y \setminus X|,
            |(N \setminus X) \setminus Y|) - max(|Y|, |N \setminus Y|)}
            {|N| - max(|Y|, |N \setminus Y|)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Guttman_{\lambda_b}} =
            \frac{max(a, b) + max(c, d) - max(a+c, b+d)}{n - max(a+c, b+d)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize GuttmanLambdaB instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result.
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Guttman Lambda B similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Guttman's Lambda B similarity

        Examples
        --------
        >>> cmp = GuttmanLambdaB()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.0
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        if not src or not tar:
            return 0.0

        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = self._population_unique_card()

        # Larger of the two column marginals, max(|Y|, |N \ Y|).
        col_max = max(a + c, b + d)
        # Rounding absorbs float noise from fuzzy/soft intersections.
        numerator = round(float(max(a, b) + max(c, d) - col_max), 15)
        if numerator > 1e-8:
            return numerator / float(n - col_max)
        return 0.0
if __name__ == '__main__':
    import doctest

    # Run this module's embedded doctests when executed as a script.
    doctest.testmod()
================================================
FILE: abydos/distance/_gwet_ac.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._gwet_ac.
Gwet's AC correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['GwetAC']
class GwetAC(_TokenDistance):
    r"""Gwet's AC correlation.

    For two sets X and Y and a population N, Gwet's AC correlation
    :cite:`Gwet:2008` is

        .. math::

            corr_{Gwet_{AC}}(X, Y) = AC =
            \frac{p_o - p_e^{AC}}{1 - p_e^{AC}}

    where

        .. math::

            p_o = \frac{|X \cap Y| + |(N \setminus X) \setminus Y|}{|N|}

            p_e^{AC} = \frac{1}{2}\Big(\frac{|X|+|Y|}{|N|}\cdot
            \frac{|X \setminus Y| + |Y \setminus X|}{|N|}\Big)

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            p_o = \frac{a+d}{n}

            p_e^{AC} = \frac{1}{2}\Big(\frac{2a+b+c}{n}\cdot
            \frac{2d+b+c}{n}\Big)

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize GwetAC instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            The alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result.
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and
            tokenizer=None will cause the instance to use the QGram
            tokenizer with this q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Gwet's AC correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gwet's AC correlation

        Examples
        --------
        >>> cmp = GwetAC()
        >>> cmp.corr('cat', 'hat')
        0.9948456319360438
        >>> cmp.corr('Niall', 'Neil')
        0.990945276504824
        >>> cmp.corr('aluminum', 'Catalan')
        0.9804734301840141
        >>> cmp.corr('ATCG', 'TAGC')
        0.9870811678360627

        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()
        n = a + b + c + d

        # Observed agreement proportion.
        observed = (a + d) / n
        # Chance-agreement probability per Gwet's AC1 statistic.
        pi = (2 * a + b + c) / (2 * n)
        expected = 2 * pi * (1 - pi)
        return (observed - expected) / (1 - expected)

    def sim(self, src: str, tar: str) -> float:
        """Return the Gwet's AC similarity of two strings.

        The correlation, which ranges over [-1, 1], is rescaled to [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Gwet's AC similarity

        Examples
        --------
        >>> cmp = GwetAC()
        >>> cmp.sim('cat', 'hat')
        0.9974228159680218
        >>> cmp.sim('Niall', 'Neil')
        0.995472638252412
        >>> cmp.sim('aluminum', 'Catalan')
        0.9902367150920071
        >>> cmp.sim('ATCG', 'TAGC')
        0.9935405839180314

        .. versionadded:: 0.4.0

        """
        return (self.corr(src, tar) + 1.0) / 2.0
if __name__ == '__main__':
    import doctest

    # Run this module's embedded doctests when executed as a script.
    doctest.testmod()
================================================
FILE: abydos/distance/_hamann.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._hamann.
Hamann correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Hamann']
class Hamann(_TokenDistance):
r"""Hamann correlation.
For two sets X and Y and a population N, the Hamann correlation
:cite:`Hamann:1961` is
.. math::
corr_{Hamann}(X, Y) =
\frac{|X \cap Y| + |(N \setminus X) \setminus Y| -
|X \setminus Y| - |Y \setminus X|}{|N|}
In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
this is
.. math::
corr_{Hamann} =
\frac{a+d-b-c}{n}
.. versionadded:: 0.4.0
"""
def __init__(
self,
alphabet: Optional[
Union[TCounter[str], Sequence[str], Set[str], int]
] = None,
tokenizer: Optional[_Tokenizer] = None,
intersection_type: str = 'crisp',
**kwargs: Any
) -> None:
"""Initialize Hamann instance.
Parameters
----------
alphabet : Counter, collection, int, or None
This represents the alphabet of possible tokens.
See :ref:`alphabet ` description in
:py:class:`_TokenDistance` for details.
tokenizer : _Tokenizer
A tokenizer instance from the :py:mod:`abydos.tokenizer` package
intersection_type : str
Specifies the intersection type, and set type as a result:
See :ref:`intersection_type ` description in
:py:class:`_TokenDistance` for details.
**kwargs
Arbitrary keyword arguments
Other Parameters
----------------
qval : int
The length of each q-gram. Using this parameter and tokenizer=None
will cause the instance to use the QGram tokenizer with this
q value.
metric : _Distance
A string distance measure class for use in the ``soft`` and
``fuzzy`` variants.
threshold : float
A threshold value, similarities above which are counted as
members of the intersection for the ``fuzzy`` variant.
.. versionadded:: 0.4.0
"""
super(Hamann, self).__init__(
alphabet=alphabet,
tokenizer=tokenizer,
intersection_type=intersection_type,
**kwargs
)
def corr(self, src: str, tar: str) -> float:
"""Return the Hamann correlation of two strings.
Parameters
----------
src : str
Source string (or QGrams/Counter objects) for comparison
tar : str
Target string (or QGrams/Counter objects) for comparison
Returns
-------
float
Hamann correlation
Examples
--------
>>> cmp = Hamann()
>>> cmp.corr('cat', 'hat')
0.9897959183673469
>>> cmp.corr('Niall', 'Neil')
0.9821428571428571
>>> cmp.corr('aluminum', 'Catalan')
0.9617834394904459
>>> cmp.corr('ATCG', 'TAGC')
0.9744897959183674
.. versionadded:: 0.4.0
"""
if src == tar:
return 1.0
self._tokenize(src, tar)
return (
self._intersection_card()
+ self._total_complement_card()
- self._src_only_card()
- self._tar_only_card()
) / self._population_unique_card()
def sim(self, src: str, tar: str) -> float:
    """Return the normalized Hamann similarity of two strings.

    The Hamann correlation has range [-1, 1]; it is mapped onto [0, 1]
    by adding 1 and halving.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Normalized Hamann similarity

    Examples
    --------
    >>> cmp = Hamann()
    >>> cmp.sim('cat', 'hat')
    0.9948979591836735
    >>> cmp.sim('Niall', 'Neil')
    0.9910714285714286
    >>> cmp.sim('aluminum', 'Catalan')
    0.9808917197452229
    >>> cmp.sim('ATCG', 'TAGC')
    0.9872448979591837


    .. versionadded:: 0.4.0

    """
    correlation = self.corr(src, tar)
    return (correlation + 1) / 2
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_hamming.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._hamming.
Hamming distance
"""
from typing import Any
from ._distance import _Distance
__all__ = ['Hamming']
class Hamming(_Distance):
    """Hamming distance.

    Hamming distance :cite:`Hamming:1950` counts the character positions at
    which two strings differ. It is ordinarily defined only for strings of
    equal length. By default, this implementation compares the overlapping
    prefix (the first n characters, n being the shorter string's length) and
    adds the difference in string lengths to that count.

    .. versionadded:: 0.3.6
    """

    def __init__(self, diff_lens: bool = True, **kwargs: Any) -> None:
        """Initialize Hamming instance.

        Parameters
        ----------
        diff_lens : bool
            If True (default), strings of unequal length are compared over
            their shared prefix and the difference in lengths is added to
            the distance; this is equivalent to padding the shorter string
            with obligatorily non-matching characters. If False, an
            exception is raised for strings of unequal lengths.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(Hamming, self).__init__(**kwargs)
        self._diff_lens = diff_lens

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Hamming distance between two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        int
            The Hamming distance between src & tar

        Raises
        ------
        ValueError
            Undefined for sequences of unequal length; set diff_lens to True
            for Hamming distance between strings of unequal lengths.

        Examples
        --------
        >>> cmp = Hamming()
        >>> cmp.dist_abs('cat', 'hat')
        1
        >>> cmp.dist_abs('Niall', 'Neil')
        3
        >>> cmp.dist_abs('aluminum', 'Catalan')
        8
        >>> cmp.dist_abs('ATCG', 'TAGC')
        4


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if len(src) != len(tar) and not self._diff_lens:
            raise ValueError(
                'Undefined for sequences of unequal length; set diff_lens '
                'to True for Hamming distance between strings of unequal '
                'lengths.'
            )
        # When allowed, the length difference counts as that many
        # guaranteed mismatches.
        length_penalty = abs(len(src) - len(tar)) if self._diff_lens else 0
        # zip truncates at the shorter string, so only the shared prefix
        # is compared position-by-position.
        return length_penalty + sum(
            c_src != c_tar for c_src, c_tar in zip(src, tar)
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Hamming distance between two strings.

        The absolute Hamming distance is divided by the greater of the two
        strings' lengths, normalizing it to the interval [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Hamming distance

        Examples
        --------
        >>> cmp = Hamming()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> cmp.dist('Niall', 'Neil')
        0.6
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Also guards against division by zero when both strings are empty.
        if src == tar:
            return 0.0
        longer = max(len(src), len(tar))
        return self.dist_abs(src, tar) / longer
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_harris_lahey.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._harris_lahey.
Harris & Lahey similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['HarrisLahey']
class HarrisLahey(_TokenDistance):
    r"""Harris & Lahey similarity.

    For two sets X and Y and a population N, Harris & Lahey similarity
    :cite:`Harris:1978` is

        .. math::

            sim_{HarrisLahey}(X, Y) =
            \frac{|X \cap Y|}{|X \cup Y|}\cdot
            \frac{|N \setminus Y| + |N \setminus X|}{2|N|}+
            \frac{|(N \setminus X) \setminus Y|}{|N \setminus (X \cap Y)|}\cdot
            \frac{|X| + |Y|}{2|N|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{HarrisLahey} =
            \frac{a}{a+b+c}\cdot\frac{2d+b+c}{2n}+
            \frac{d}{d+b+c}\cdot\frac{2a+b+c}{2n}

    Notes
    -----
    Most catalogs of similarity coefficients
    :cite:`Warrens:2008,Morris:2012,Xiang:2013` omit the :math:`n` terms in
    the denominators, but the worked example in :cite:`Harris:1978` makes it
    clear that this is intended in the original.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize HarrisLahey instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(HarrisLahey, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Harris & Lahey similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Harris & Lahey similarity

        Examples
        --------
        >>> cmp = HarrisLahey()
        >>> cmp.sim('cat', 'hat')
        0.3367085964820711
        >>> cmp.sim('Niall', 'Neil')
        0.22761577457069784
        >>> cmp.sim('aluminum', 'Catalan')
        0.07244410503054725
        >>> cmp.sim('ATCG', 'TAGC')
        0.006296204706372345


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        self._tokenize(src, tar)

        inter = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()
        pop = self._population_unique_card()

        total = 0.0
        # Occurrence-agreement term; skipped when its numerator or the
        # non-intersection complement is empty (avoiding 0/0).
        if inter and (neither + src_only + tar_only):
            total += (
                inter
                / (inter + src_only + tar_only)
                * (2 * neither + src_only + tar_only)
                / (2 * pop)
            )
        # Non-occurrence-agreement term, guarded symmetrically.
        if neither and (inter + src_only + tar_only):
            total += (
                neither
                / (neither + src_only + tar_only)
                * (2 * inter + src_only + tar_only)
                / (2 * pop)
            )
        return total
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_hassanat.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._hassanat.
Hassanat distance
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Hassanat']
class Hassanat(_TokenDistance):
    r"""Hassanat distance.

    For two multisets X and Y drawn from an alphabet S, Hassanat distance
    :cite:`Hassanat:2014` is

        .. math::

            dist_{Hassanat}(X, Y) = \sum_{i \in S} D(X_i, Y_i)

    where

        .. math::

            D(X_i, Y_i) =
            \left\{\begin{array}{ll}
                1-\frac{1+min(X_i, Y_i)}{1+max(X_i, Y_i)}&,
                min(X_i, Y_i) \geq 0
                \\
                \\
                1-\frac{1+min(X_i, Y_i)+|min(X_i, Y_i)|}
                {1+max(X_i, Y_i)+|min(X_i, Y_i)|}&,
                min(X_i, Y_i) < 0
            \end{array}\right.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize Hassanat instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.


        .. versionadded:: 0.4.0

        """
        super(Hassanat, self).__init__(tokenizer=tokenizer, **kwargs)

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Hassanat distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Hassanat distance

        Examples
        --------
        >>> cmp = Hassanat()
        >>> cmp.dist_abs('cat', 'hat')
        2.0
        >>> cmp.dist_abs('Niall', 'Neil')
        3.5
        >>> cmp.dist_abs('aluminum', 'Catalan')
        7.166666666666667
        >>> cmp.dist_abs('ATCG', 'TAGC')
        5.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        total = 0.0
        for tok in self._total().keys():
            src_count = self._src_tokens[tok]
            tar_count = self._tar_tokens[tok]
            smaller = min(src_count, tar_count)
            if smaller >= 0:
                total += 1 - (1 + smaller) / (1 + max(src_count, tar_count))
            else:
                # Negative-count branch: shifting both counts by |smaller|
                # reduces the numerator to 1.
                total += 1 - 1 / (1 + max(src_count, tar_count) - smaller)
        return total

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Hassanat distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Hassanat distance

        Examples
        --------
        >>> cmp = Hassanat()
        >>> cmp.dist('cat', 'hat')
        0.3333333333333333
        >>> cmp.dist('Niall', 'Neil')
        0.3888888888888889
        >>> cmp.dist('aluminum', 'Catalan')
        0.4777777777777778
        >>> cmp.dist('ATCG', 'TAGC')
        0.5


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0
        # dist_abs tokenizes src & tar, populating self._total().
        absolute = self.dist_abs(src, tar)
        return absolute / len(self._total())
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_hawkins_dotson.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._hawkins_dotson.
Hawkins & Dotson similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['HawkinsDotson']
class HawkinsDotson(_TokenDistance):
    r"""Hawkins & Dotson similarity.

    For two sets X and Y and a population N, Hawkins & Dotson similarity
    :cite:`Hawkins:1973` is the mean of the occurrence agreement and
    non-occurrence agreement

        .. math::

            sim_{HawkinsDotson}(X, Y) =
            \frac{1}{2}\cdot\Big(
            \frac{|X \cap Y|}{|X \cup Y|}+
            \frac{|(N \setminus X) \setminus Y|}{|N \setminus (X \cap Y)|}
            \Big)

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{HawkinsDotson} =
            \frac{1}{2}\cdot\Big(\frac{a}{a+b+c}+\frac{d}{b+c+d}\Big)

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize HawkinsDotson instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super(HawkinsDotson, self).__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim(self, src: str, tar: str) -> float:
        """Return the Hawkins & Dotson similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Hawkins & Dotson similarity

        Examples
        --------
        >>> cmp = HawkinsDotson()
        >>> cmp.sim('cat', 'hat')
        0.6641091219096334
        >>> cmp.sim('Niall', 'Neil')
        0.606635407786303
        >>> cmp.sim('aluminum', 'Catalan')
        0.5216836734693877
        >>> cmp.sim('ATCG', 'TAGC')
        0.49362244897959184


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        self._tokenize(src, tar)

        both = self._intersection_card()
        src_only = self._src_only_card()
        tar_only = self._tar_only_card()
        neither = self._total_complement_card()

        # Occurrence agreement: a / (a+b+c); zero numerator short-circuits
        # the division (also avoiding 0/0 for disjoint empty sets).
        occurrence = (
            both / (both + src_only + tar_only) if both else 0.0
        )
        # Non-occurrence agreement: d / (b+c+d), guarded the same way.
        non_occurrence = (
            neither / (src_only + tar_only + neither) if neither else 0.0
        )
        return (occurrence + non_occurrence) / 2
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_hellinger.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._hellinger.
Hellinger distance
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Hellinger']
class Hellinger(_TokenDistance):
    r"""Hellinger distance.

    For two multisets X and Y drawn from an alphabet S, Hellinger distance
    :cite:`Hellinger:1909` is

        .. math::

            dist_{Hellinger}(X, Y) =
            \sqrt{2 \cdot \sum_{i \in S} (\sqrt{|X_i|} - \sqrt{|Y_i|})^2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self, tokenizer: Optional[_Tokenizer] = None, **kwargs: Any
    ) -> None:
        """Initialize Hellinger instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.


        .. versionadded:: 0.4.0

        """
        super(Hellinger, self).__init__(tokenizer=tokenizer, **kwargs)

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Hellinger distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Hellinger distance

        Examples
        --------
        >>> cmp = Hellinger()
        >>> cmp.dist_abs('cat', 'hat')
        2.8284271247461903
        >>> cmp.dist_abs('Niall', 'Neil')
        3.7416573867739413
        >>> cmp.dist_abs('aluminum', 'Catalan')
        5.477225575051661
        >>> cmp.dist_abs('ATCG', 'TAGC')
        4.47213595499958


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        sq_sum = 0
        for tok in self._total().keys():
            # Difference of the square roots of the token counts;
            # abs() guards against negative counts.
            delta = (
                abs(self._src_tokens[tok]) ** 0.5
                - abs(self._tar_tokens[tok]) ** 0.5
            )
            sq_sum += delta ** 2
        return (2 * sq_sum) ** 0.5

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Hellinger distance of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Hellinger distance

        Examples
        --------
        >>> cmp = Hellinger()
        >>> cmp.dist('cat', 'hat')
        0.8164965809277261
        >>> cmp.dist('Niall', 'Neil')
        0.881917103688197
        >>> cmp.dist('aluminum', 'Catalan')
        0.9128709291752769
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0
        raw = self.dist_abs(src, tar)
        # Normalize by the maximum attainable distance given these token
        # counts: every position taken at its larger count.
        norm_sq = 0
        for tok in self._total().keys():
            norm_sq += max(self._src_tokens[tok], self._tar_tokens[tok]) ** 2
        return raw / ((2 * norm_sq) ** 0.5)
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_henderson_heron.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._henderson_heron.
Henderson-Heron dissimilarity
"""
from math import factorial
from typing import Any
from ._token_distance import _TokenDistance
__all__ = ['HendersonHeron']
class HendersonHeron(_TokenDistance):
    r"""Henderson-Heron dissimilarity.

    For two sets X and Y and a population N, Henderson-Heron dissimilarity
    :cite:`Henderson:1977` is:

        .. math:

            sim_{Henderson-Heron}(X, Y) = \frac{|X|! |Y|! (|N| - |X|)!
            (|N|- |Y|)!}{|N|! |X \cap Y|! (|X| - |X \cap Y|)!
            (|Y| - |Y \cap X|)! (|N| - |X| - |Y| + |X \cap Y|)!}

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize HendersonHeron instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(HendersonHeron, self).__init__(**kwargs)

    def dist(self, src: str, tar: str) -> float:
        """Return the Henderson-Heron dissimilarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Henderson-Heron dissimilarity

        Examples
        --------
        >>> cmp = HendersonHeron()
        >>> cmp.dist('cat', 'hat')
        0.00011668873858680838
        >>> cmp.dist('Niall', 'Neil')
        0.00048123075776606097
        >>> cmp.dist('aluminum', 'Catalan')
        0.08534181060514882
        >>> cmp.dist('ATCG', 'TAGC')
        0.9684367974410505


        .. versionadded:: 0.4.1

        """
        self._tokenize(src, tar)

        inter = self._intersection_card()
        src_card = self._src_card()
        tar_card = self._tar_card()
        pop = self._population_unique_card()

        # Hypergeometric-style probability: both numerator and denominator
        # are exact integer products; a single final division yields the
        # float result.
        numerator = (
            factorial(src_card)
            * factorial(tar_card)
            * factorial(pop - src_card)
            * factorial(pop - tar_card)
        )
        denominator = (
            factorial(pop)
            * factorial(inter)
            * factorial(src_card - inter)
            * factorial(tar_card - inter)
            * factorial(pop - tar_card - src_card + inter)
        )
        return numerator / denominator
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_higuera_mico.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._higuera_mico.
The Higuera-Micó contextual normalized edit distance
"""
from typing import Any
from numpy import full as np_full
from ._distance import _Distance
__all__ = ['HigueraMico']
class HigueraMico(_Distance):
    """The Higuera-Micó contextual normalized edit distance.

    This is presented in :cite:`Higuera:2008`.

    This measure is not normalized to a particular range. Indeed, for a
    string of infinite length and a string of 0 length, the contextual
    normalized edit distance would be infinity. But so long as the relative
    difference in string lengths is not too great, the distance will generally
    remain below 1.0

    Notes
    -----
    The "normalized" version of this distance, implemented in the dist method,
    is merely the minimum of the distance and 1.0.

    .. versionadded:: 0.4.0
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize HigueraMico instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super(HigueraMico, self).__init__(**kwargs)

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Higuera-Micó distance between two strings.

        This is a straightforward implementation of Higuera & Micó pseudocode
        from :cite:`Higuera:2008`, ported to Numpy.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The Higuera-Micó distance between src & tar

        Examples
        --------
        >>> cmp = HigueraMico()
        >>> cmp.dist_abs('cat', 'hat')
        0.3333333333333333
        >>> cmp.dist_abs('Niall', 'Neil')
        0.5333333333333333
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.7916666666666667
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.6000000000000001


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 0.0
        # mx[i, j, k] holds the maximum number of insertions over edit
        # paths that transform src[:i] into tar[:j] using k edit
        # operations (matches are free); -inf marks unreachable states.
        mx = np_full(
            (len(src) + 1, len(tar) + 1, len(src) + len(tar) + 1),
            fill_value=float('-inf'),
            dtype=float,
        )
        # Base cases: deleting src[:i] takes i ops and 0 insertions;
        # building tar[:j] from the empty string takes j insertions.
        for i in range(1, len(src) + 1):
            mx[i, 0, i] = 0
        for j in range(len(tar) + 1):
            mx[0, j, j] = j
        for i in range(1, len(src) + 1):
            for j in range(1, len(tar) + 1):
                if src[i - 1] == tar[j - 1]:
                    # Match: carry over without consuming an operation.
                    for k in range(len(src) + len(tar) + 1):
                        mx[i, j, k] = mx[i - 1, j - 1, k]
                else:
                    # Substitution: one op, no new insertion.
                    for k in range(1, len(src) + len(tar) + 1):
                        mx[i, j, k] = mx[i - 1, j - 1, k - 1]
                # Deletion (no new insertion) and insertion (+1) both
                # consume one operation; keep the best insertion count.
                for k in range(1, len(src) + len(tar) + 1):
                    mx[i, j, k] = max(
                        mx[i - 1, j, k - 1],
                        mx[i, j - 1, k - 1] + 1,
                        mx[i, j, k],
                    )
        # For each reachable total operation count k, recover the edit-op
        # mix and compute its contextually weighted cost; keep the minimum.
        min_dist = float('inf')
        for k in range(len(src) + len(tar) + 1):
            if mx[len(src), len(tar), k] >= 0:
                n_i = int(mx[len(src), len(tar), k])  # insertions
                n_d = len(src) - len(tar) + n_i  # deletions
                n_s = k - (n_i + n_d)  # substitutions
                # Contextual weights per :cite:`Higuera:2008`: each
                # insertion/deletion is weighted by the reciprocal of the
                # (growing) string length; substitutions by 1/(|src|+n_i).
                loc_dist = 0.0
                for i in range(len(src) + 1, len(src) + n_i + 1):
                    loc_dist += 1 / i
                loc_dist += n_s / (len(src) + n_i)
                for i in range(len(tar) + 1, len(tar) + n_d + 1):
                    loc_dist += 1 / i
                if loc_dist < min_dist:
                    min_dist = loc_dist
        return min_dist

    def dist(self, src: str, tar: str) -> float:
        """Return the bounded Higuera-Micó distance between two strings.

        This is the distance bounded to the range [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The bounded Higuera-Micó distance between src & tar

        Examples
        --------
        >>> cmp = HigueraMico()
        >>> cmp.dist('cat', 'hat')
        0.3333333333333333
        >>> cmp.dist('Niall', 'Neil')
        0.5333333333333333
        >>> cmp.dist('aluminum', 'Catalan')
        0.7916666666666667
        >>> cmp.dist('ATCG', 'TAGC')
        0.6000000000000001


        .. versionadded:: 0.4.0

        """
        # Clamp at 1.0; the raw measure can exceed 1 for very unequal
        # string lengths (see class Notes).
        return min(1.0, self.dist_abs(src, tar))
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_horn_morisita.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._horn_morisita.
Horn-Morisita index of overlap
"""
from typing import Any
from ._token_distance import _TokenDistance
__all__ = ['HornMorisita']
class HornMorisita(_TokenDistance):
    r"""Horn-Morisita index of overlap.

    Horn-Morisita index of overlap :cite:`Horn:1966`, given two populations X
    and Y drawn from S species, is:

        .. math::

            sim_{Horn-Morisita}(X, Y) =
            C_{\lambda} = \frac{2\sum_{i=1}^S x_i y_i}
            {(\hat{\lambda}_x + \hat{\lambda}_y)XY}

    where

        .. math::

            X = \sum_{i=1}^S x_i ~~;~~ Y = \sum_{i=1}^S y_i

        .. math::

            \hat{\lambda}_x = \frac{\sum_{i=1}^S x_i^2}{X^2} ~~;~~
            \hat{\lambda}_y = \frac{\sum_{i=1}^S y_i^2}{Y^2}

    Observe that this is identical to Morisita similarity, except for the
    definition of the :math:`\lambda` values in the denominator.

    .. versionadded:: 0.4.1
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize HornMorisita instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.1

        """
        super(HornMorisita, self).__init__(**kwargs)

    def sim(self, src: str, tar: str) -> float:
        """Return the Horn-Morisita similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Horn-Morisita similarity

        Examples
        --------
        >>> cmp = HornMorisita()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.3636363636363636
        >>> cmp.sim('aluminum', 'Catalan')
        0.10650887573964497
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.1

        """
        self._tokenize(src, tar)
        shared = self._intersection()
        src_card = self._src_card()
        tar_card = self._tar_card()

        # lambda-hat values: sum of squared counts over squared totals;
        # the division is skipped for a zero sum (avoiding 0/0).
        src_lambda = sum(cnt * cnt for cnt in self._src_tokens.values())
        if src_lambda:
            src_lambda /= src_card * src_card
        tar_lambda = sum(cnt * cnt for cnt in self._tar_tokens.values())
        if tar_lambda:
            tar_lambda /= tar_card * tar_card

        # Numerator: twice the cross-product of counts over shared tokens.
        overlap = sum(
            self._src_tokens[symbol] * self._tar_tokens[symbol]
            for symbol in shared.keys()
        )
        score = 2.0 * overlap
        # Each denominator factor is applied only when nonzero so that
        # empty inputs yield 0 rather than a ZeroDivisionError.
        if src_card:
            score /= src_card
        if tar_card:
            score /= tar_card
        if src_lambda + tar_lambda:
            score /= src_lambda + tar_lambda
        return score
if __name__ == '__main__':
    # Run this module's doctest examples when executed directly.
    import doctest

    doctest.testmod()
================================================
FILE: abydos/distance/_hurlbert.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <https://www.gnu.org/licenses/>.
"""abydos.distance._hurlbert.
Hurlbert correlation
"""
from math import ceil, copysign, floor
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Hurlbert']
class Hurlbert(_TokenDistance):
r"""Hurlbert correlation.
In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
Hurlbert's coefficient of interspecific association :cite:`Hurlbert:1969`
is
.. math::
corr_{Hurlbert} =
\frac{ad-bc}{|ad-bc|} \sqrt{\frac{Obs_{\chi^2}-Min_{\chi^2}}
{Max_{\chi^2}-Min_{\chi^2}}}
Where:
.. math::
\begin{array}{lll}
Obs_{\chi^2} &= \frac{(ad-bc)^2n}{(a+b)(a+c)(b+d)(c+d)}
Max_{\chi^2} &= \frac{(a+b)(b+d)n}{(a+c)(c+d)} &\textrm{ when }
ad \geq bc
Max_{\chi^2} &= \frac{(a+b)(a+c)n}{(b+d)(c+d)} &\textrm{ when }
ad < bc \textrm{ and } a \leq d
Max_{\chi^2} &= \frac{(b+d)(c+d)n}{(a+b)(a+c)} &\textrm{ when }
ad < bc \textrm{ and } a > d
Min_{\chi^2} &= \frac{n^3 (\hat{a} - g(\hat{a}))^2}
{(a+b)(a+c)(c+d)(b+d)}
\textrm{where } \hat{a} &= \frac{(a+b)(a+c)}{n}
\textrm{and } g(\hat{a}) &= \lfloor\hat{a}\rfloor
&\textrm{ when } ad < bc,
\textrm{otherwise } g(\hat{a}) &= \lceil\hat{a}\rceil
\end{array}
.. versionadded:: 0.4.0
"""
def __init__(
    self,
    alphabet: Optional[
        Union[TCounter[str], Sequence[str], Set[str], int]
    ] = None,
    tokenizer: Optional[_Tokenizer] = None,
    intersection_type: str = 'crisp',
    **kwargs: Any
) -> None:
    """Initialize Hurlbert instance.

    Parameters
    ----------
    alphabet : Counter, collection, int, or None
        This represents the alphabet of possible tokens.
        See :ref:`alphabet <alphabet>` description in
        :py:class:`_TokenDistance` for details.
    tokenizer : _Tokenizer
        A tokenizer instance from the :py:mod:`abydos.tokenizer` package
    intersection_type : str
        Specifies the intersection type, and set type as a result:
        See :ref:`intersection_type <intersection_type>` description in
        :py:class:`_TokenDistance` for details.
    **kwargs
        Arbitrary keyword arguments

    Other Parameters
    ----------------
    qval : int
        The length of each q-gram. Using this parameter and tokenizer=None
        will cause the instance to use the QGram tokenizer with this
        q value.
    metric : _Distance
        A string distance measure class for use in the ``soft`` and
        ``fuzzy`` variants.
    threshold : float
        A threshold value, similarities above which are counted as
        members of the intersection for the ``fuzzy`` variant.


    .. versionadded:: 0.4.0

    """
    # All configuration is delegated to the _TokenDistance base class.
    super(Hurlbert, self).__init__(
        alphabet=alphabet,
        tokenizer=tokenizer,
        intersection_type=intersection_type,
        **kwargs
    )
def corr(self, src: str, tar: str) -> float:
    """Return the Hurlbert correlation of two strings.

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Hurlbert correlation, in the range [-1.0, 1.0]

    Examples
    --------
    >>> cmp = Hurlbert()
    >>> cmp.corr('cat', 'hat')
    0.497416003373807
    >>> cmp.corr('Niall', 'Neil')
    0.32899851514665707
    >>> cmp.corr('aluminum', 'Catalan')
    0.10144329225459262
    >>> cmp.corr('ATCG', 'TAGC')
    -1.0

    .. versionadded:: 0.4.0
    """
    # Trivial cases: identical strings correlate perfectly; an empty
    # string against a non-empty one is maximally anti-correlated.
    if src == tar:
        return 1.0
    if not src or not tar:
        return -1.0
    self._tokenize(src, tar)
    # 2x2 confusion-table cells: a = |X ∩ Y|, b = |X \ Y|, c = |Y \ X|,
    # d = cardinality of the complement of X ∪ Y; n is the total.
    a = self._intersection_card()
    b = self._src_only_card()
    c = self._tar_only_card()
    d = self._total_complement_card()
    n = a + b + c + d
    admbc = a * d - b * c
    # Product of the four marginals; each factor is clamped to >= 1.0
    # so empty marginals cannot cause division by zero.
    marginals_product = (
        max(1.0, a + b)
        * max(1.0, a + c)
        * max(1.0, b + d)
        * max(1.0, c + d)
    )
    # Observed chi-squared: (ad-bc)^2 * n / product of marginals
    # (Obs in the class docstring).
    obs_chisq = admbc * admbc * n / marginals_product
    # Maximum attainable chi-squared, by the three cases in the class
    # docstring (sign of ad-bc, then relative size of a and d).
    if a * d >= b * c:
        max_chisq = (
            (a + b) * (b + d) * n / (max(1.0, a + c) * max(1.0, c + d))
        )
    elif a <= d:
        max_chisq = (
            (a + b) * (a + c) * n / (max(1.0, b + d) * max(1.0, c + d))
        )
    else:
        max_chisq = (
            (b + d) * (c + d) * n / (max(1.0, a + b) * max(1.0, a + c))
        )
    # Minimum chi-squared: based on the expected intersection under
    # independence (a_hat), rounded up when ad < bc, down otherwise.
    a_hat = (a + b) * (a + c) / n
    g_a_hat = ceil(a_hat) if a * d < b * c else floor(a_hat)
    min_chisq = n ** 3 * (a_hat - g_a_hat) ** 2 / marginals_product
    num = obs_chisq - min_chisq
    if num:
        # Rescale the observed value into the [min, max] interval, take
        # the square root, and apply the sign of ad-bc.
        return copysign(abs(num / (max_chisq - min_chisq)) ** 0.5, admbc)
    return 0.0
def sim(self, src: str, tar: str) -> float:
    """Return the Hurlbert similarity of two strings.

    The similarity is the Hurlbert correlation rescaled from the range
    [-1, 1] to the range [0, 1].

    Parameters
    ----------
    src : str
        Source string (or QGrams/Counter objects) for comparison
    tar : str
        Target string (or QGrams/Counter objects) for comparison

    Returns
    -------
    float
        Hurlbert similarity

    Examples
    --------
    >>> cmp = Hurlbert()
    >>> cmp.sim('cat', 'hat')
    0.7487080016869034
    >>> cmp.sim('Niall', 'Neil')
    0.6644992575733285
    >>> cmp.sim('aluminum', 'Catalan')
    0.5507216461272963
    >>> cmp.sim('ATCG', 'TAGC')
    0.0

    .. versionadded:: 0.4.0
    """
    correlation = self.corr(src, tar)
    return (correlation + 1.0) / 2.0
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_ident.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._ident.
Identity similarity & distance
"""
from ._distance import _Distance
__all__ = ['Ident']
class Ident(_Distance):
    """Identity distance and similarity.

    .. versionadded:: 0.3.6
    """

    def sim(self, src: str, tar: str) -> float:
        """Return the identity similarity of two strings.

        Identity similarity is 1.0 if the two strings are identical,
        otherwise 0.0

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Identity similarity

        Examples
        --------
        >>> cmp = Ident()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('cat', 'cat')
        1.0

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        """
        # Only exact equality scores 1.0; any difference (including
        # case) scores 0.0.
        if src == tar:
            return 1.0
        return 0.0
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_inclusion.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._inclusion.
Bouchard & Pouyez's INClusion Programme
"""
from ._distance import _Distance
from ._levenshtein import Levenshtein
__all__ = ['Inclusion']
class Inclusion(_Distance):
    """Inclusion distance.

    The INC Programme, developed by :cite:`Bouchard:1980` designates two
    terms as being "included" when:

        - One name is shorter than the other
        - There are at least 3 common characters
        - There is at most one difference, disregarding unmatching
          prefixes and suffixes

    In addition to these rules, this implementation considers two terms
    as being "included" if they are identical.

    The return value, though a float, can only take one of two values:
    0.0, indicating inclusion, or 1.0, indicating non-inclusion.

    .. versionadded:: 0.4.1
    """

    # Shared Levenshtein instance, used only to produce alignments.
    _lev = Levenshtein()

    def dist(self, src: str, tar: str) -> float:
        """Return the INClusion Programme value of two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The INC Programme distance: 0.0 ("included") or 1.0 (not)

        Examples
        --------
        >>> cmp = Inclusion()
        >>> cmp.dist('cat', 'hat')
        1.0
        >>> cmp.dist('Niall', 'Neil')
        1.0
        >>> cmp.dist('aluminum', 'Catalan')
        1.0
        >>> cmp.dist('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.1
        """
        # Identical terms are treated as "included" by definition.
        if src == tar:
            return 0.0
        # Rule: one name must be shorter than the other.
        if len(src) == len(tar):
            return 1.0
        # Align the two words; '-' marks an alignment gap and diff counts
        # the total number of differing positions.
        diff, src, tar = self._lev.alignment(src, tar)
        src_tok = list(src)
        tar_tok = list(tar)
        # Strip unmatching prefixes (leading gaps on either side); these
        # do not count toward the one-difference limit.
        while src_tok and src_tok[0] == '-':
            src_tok.pop(0)
            tar_tok.pop(0)
            diff -= 1
        while tar_tok and tar_tok[0] == '-':
            src_tok.pop(0)
            tar_tok.pop(0)
            diff -= 1
        # Strip unmatching suffixes (trailing gaps) likewise.
        while src_tok and src_tok[-1] == '-':
            src_tok.pop()
            tar_tok.pop()
            diff -= 1
        while tar_tok and tar_tok[-1] == '-':
            src_tok.pop()
            tar_tok.pop()
            diff -= 1
        # Rule: at most one remaining (internal) difference.
        if diff > 1:
            return 1.0
        # Rule: at least 3 common characters.
        if len(src_tok) - diff < 3:
            return 1.0
        return 0.0
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_indel.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._indel.
Indel distance
"""
from typing import Any
from ._levenshtein import Levenshtein
__all__ = ['Indel']
class Indel(Levenshtein):
    """Indel distance.

    This is equivalent to Levenshtein distance, when only inserts and deletes
    are possible.

    .. versionadded:: 0.3.6
    """

    def __init__(self, **kwargs: Any) -> None:
        """Initialize Indel instance.

        Parameters
        ----------
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0
        """
        # Make substitution and transposition infinitely costly, so that
        # only insertions and deletions (cost 1 each) remain available.
        indel_only_costs = (1, 1, float('inf'), float('inf'))
        super(Indel, self).__init__(
            mode='lev', cost=indel_only_costs, **kwargs
        )

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized indel distance between two strings.

        This is equivalent to normalized Levenshtein distance, when only
        inserts and deletes are possible.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized indel distance

        Examples
        --------
        >>> cmp = Indel()
        >>> round(cmp.dist('cat', 'hat'), 12)
        0.333333333333
        >>> round(cmp.dist('Niall', 'Neil'), 12)
        0.333333333333
        >>> round(cmp.dist('Colin', 'Cuilen'), 12)
        0.454545454545
        >>> cmp.dist('ATCG', 'TAGC')
        0.5

        .. versionadded:: 0.3.6
        """
        # Identical strings (including two empty strings) are at distance
        # 0; the guard also prevents division by zero when both are empty.
        if src != tar:
            return self.dist_abs(src, tar) / (len(src) + len(tar))
        return 0.0
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_isg.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._isg.
Bouchard & Pouyez's Indice de Similitude-Guth (ISG)
"""
from typing import Any
from ._distance import _Distance
__all__ = ['ISG']
class ISG(_Distance):
    """Indice de Similitude-Guth (ISG) similarity.

    This is an implementation of Bouchard & Pouyez's Indice de Similitude-Guth
    (ISG) :cite:`Bouchard:1980`. At its heart, ISG is Jaccard similarity, but
    limits on token matching are added according to part of Guth's matching
    criteria :cite:`Guth:1976`.

    :cite:`Bouchard:1980` is limited in its implementation details. Based on
    the examples given in the paper, it appears that only the first 4 of Guth's
    rules are considered (a letter in the first string must match a letter in
    the second string appearing in the same position, an adjacent position, or
    two positions ahead). It also appears that the distance in the paper is
    the greater of the distance from string 1 to string 2 and the distance
    from string 2 to string 1.

    These qualities can be specified as parameters. At initialization, specify
    ``full_guth=True`` to apply all of Guth's rules and ``symmetric=False`` to
    calculate only the distance from string 1 to string 2.

    .. versionadded:: 0.4.1
    """

    def __init__(
        self, full_guth: bool = False, symmetric: bool = True, **kwargs: Any
    ) -> None:
        """Initialize ISG instance.

        Parameters
        ----------
        full_guth : bool
            Whether to apply all of Guth's matching rules
        symmetric : bool
            Whether to calculate the symmetric distance
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.1
        """
        super(ISG, self).__init__(**kwargs)
        self._full_guth = full_guth
        self._symmetric = symmetric

    def _isg_i(self, src: str, tar: str) -> float:
        """Return an individual ISG similarity (not symmetric) for src to tar.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The ISG similarity: a Jaccard-style ratio of matched
            characters over matched plus unmatched characters

        .. versionadded:: 0.4.1
        """

        def _char_at(name: str, pos: int) -> str:
            # Bounds-safe lookup: '' signals "no character at this position".
            if pos >= len(name):
                return ''
            return name[pos]

        matches = 0
        for pos in range(len(src)):
            # Guth rules 1-4: src[pos] matches if it occurs in tar at the
            # same position, one position earlier, or up to two later.
            s = _char_at(src, pos)
            if s and s in set(tar[max(0, pos - 1) : pos + 3]):
                matches += 1
                continue
            if self._full_guth:
                # Remaining Guth rules, applied only when full_guth=True.
                # Mirror of rules 1-4: tar[pos] occurring near pos in src.
                t = _char_at(tar, pos)
                if t and t in set(src[max(0, pos - 1) : pos + 3]):
                    matches += 1
                    continue
                # The next characters of both strings are equal.
                s = _char_at(src, pos + 1)
                t = _char_at(tar, pos + 1)
                if s and t and s == t:
                    matches += 1
                    continue
                # The characters two ahead in both strings are equal.
                s = _char_at(src, pos + 2)
                t = _char_at(tar, pos + 2)
                if s and t and s == t:
                    matches += 1
                    continue
        # Jaccard-style normalization: matches over the "union" size.
        return matches / (len(src) + len(tar) - matches)

    def sim(self, src: str, tar: str) -> float:
        """Return the Indice de Similitude-Guth (ISG) similarity of two words.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            The ISG similarity

        Examples
        --------
        >>> cmp = ISG()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.5
        >>> cmp.sim('aluminum', 'Catalan')
        0.15384615384615385
        >>> cmp.sim('ATCG', 'TAGC')
        1.0

        .. versionadded:: 0.4.1
        """
        if src == tar:
            return 1.0
        # Canonicalize so the shorter string is always src. The symmetric
        # max of both directions is computed only for equal-length inputs;
        # NOTE(review): presumably the swap already canonicalizes the
        # unequal-length case -- confirm against :cite:`Bouchard:1980`.
        if len(src) > len(tar):
            src, tar = tar, src
        elif self._symmetric and len(src) == len(tar):
            return max(self._isg_i(src, tar), self._isg_i(tar, src))
        return self._isg_i(src, tar)
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_iterative_substring.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._iterative_substring.
Iterative-SubString (I-Sub) correlation
"""
from typing import Any
from ._distance import _Distance
__all__ = ['IterativeSubString']
class IterativeSubString(_Distance):
    r"""Iterative-SubString correlation.

    Iterative-SubString (I-Sub) correlation :cite:`Stoilos:2005`

    This is a straightforward port of the primary author's Java implementation:
    http://www.image.ece.ntua.gr/~gstoil/software/I_Sub.java

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        hamacher: float = 0.6,
        normalize_strings: bool = False,
        **kwargs: Any
    ) -> None:
        """Initialize IterativeSubString instance.

        Parameters
        ----------
        hamacher : float
            The constant factor for the Hamacher product
        normalize_strings : bool
            Normalize the strings by removing the characters in '._ ' and
            lower casing
        **kwargs
            Arbitrary keyword arguments

        .. versionadded:: 0.4.0
        """
        super(IterativeSubString, self).__init__(**kwargs)
        self._normalize_strings = normalize_strings
        self._hamacher = hamacher

    def corr(self, src: str, tar: str) -> float:
        """Return the Iterative-SubString correlation of two strings.

        The correlation is commonality minus dissimilarity plus a
        Winkler-style common-prefix bonus.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Iterative-SubString correlation

        Examples
        --------
        >>> cmp = IterativeSubString()
        >>> cmp.corr('cat', 'hat')
        -1.0
        >>> cmp.corr('Niall', 'Neil')
        -0.9
        >>> cmp.corr('aluminum', 'Catalan')
        -1.0
        >>> cmp.corr('ATCG', 'TAGC')
        -1.0

        .. versionadded:: 0.4.0
        """
        # Keep the raw inputs: the Winkler improvement is computed on the
        # original (pre-normalization) strings.
        input_src = src
        input_tar = tar

        def _winkler_improvement(
            src: str, tar: str, commonality: float
        ) -> float:
            # i ends at the first mismatching position; mirrors the Java
            # original, including its reuse of the loop variable's final
            # value after the loop.
            for i in range(min(len(src), len(tar))):
                if src[i] != tar[i]:
                    break
            # Winkler-style bonus: at most 4 prefix characters count.
            return min(4.0, i) * 0.1 * (1.0 - commonality)

        if self._normalize_strings:
            src = src.lower()
            tar = tar.lower()
            for ch in '._ ':
                src = src.replace(ch, '')
                tar = tar.replace(ch, '')
        src_len = len(src)
        tar_len = len(tar)
        if src_len == 0 and tar_len == 0:
            return 1.0
        if src_len == 0 or tar_len == 0:
            return -1.0
        # Repeatedly find the longest common substring, remove it from
        # both strings, and continue until no substring longer than 2
        # remains in common.
        common = 0
        best = 2
        while len(src) > 0 and len(tar) > 0 and best != 0:
            best = 0  # length of the longest common substring this pass
            ls = len(src)
            lt = len(tar)
            start_src = 0
            end_src = 0
            start_tar = 0
            end_tar = 0
            i = 0
            while i < ls and ls - i > best:
                j = 0
                while lt - j > best:
                    k = i
                    # Scan tar for the next occurrence of src[i].
                    while j < lt and src[k] != tar[j]:
                        j += 1
                    if j != lt:
                        # Extend the match as far as it goes; p remembers
                        # where the match started in tar.
                        p = j
                        j += 1
                        k += 1
                        while j < lt and k < ls and src[k] == tar[j]:
                            j += 1
                            k += 1
                        if k - i > best:
                            # Record the new longest common substring.
                            best = k - i
                            start_src = i
                            end_src = k
                            start_tar = p
                            end_tar = j
                i += 1
            # Excise the best match from both strings.
            src = src[:start_src] + src[end_src:]
            tar = tar[:start_tar] + tar[end_tar:]
            if best > 2:
                common += best
            else:
                # Matches of length <= 2 are ignored and end the loop.
                best = 0
        commonality = 2.0 * common / (src_len + tar_len)
        winkler_improvement = _winkler_improvement(
            input_src, input_tar, commonality
        )
        # Unmatched fraction of each string, combined with the Hamacher
        # parametric triangular norm.
        unmatched_src = max(src_len - common, 0) / src_len
        unmatched_tar = max(tar_len - common, 0) / tar_len
        unmatched_prod = unmatched_src * unmatched_tar
        dissimilarity = unmatched_prod / (
            self._hamacher
            + (1 - self._hamacher)
            * (unmatched_src + unmatched_tar - unmatched_prod)
        )
        return commonality - dissimilarity + winkler_improvement

    def sim(self, src: str, tar: str) -> float:
        """Return the Iterative-SubString similarity of two strings.

        The correlation, rescaled from the range [-1, 1] to [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Iterative-SubString similarity

        Examples
        --------
        >>> cmp = IterativeSubString()
        >>> cmp.sim('cat', 'hat')
        0.0
        >>> cmp.sim('Niall', 'Neil')
        0.04999999999999999
        >>> cmp.sim('aluminum', 'Catalan')
        0.0
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        return (self.corr(src, tar) + 1.0) / 2.0
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_jaccard.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._jaccard.
Jaccard similarity coefficient, distance, & Tanimoto coefficient
"""
from math import log2
from typing import Any, Optional
from ._tversky import Tversky
from ..tokenizer import _Tokenizer
__all__ = ['Jaccard']
class Jaccard(Tversky):
    r"""Jaccard similarity.

    For two sets X and Y, the Jaccard similarity coefficient
    :cite:`Jaccard:1901,Ruzicka:1958` is

        .. math::

            sim_{Jaccard}(X, Y) = \frac{|X \cap Y|}{|X \cup Y|}

    This is identical to the Tanimoto similarity coefficient
    :cite:`Tanimoto:1958` and the Tversky index :cite:`Tversky:1977` for
    :math:`\alpha = \beta = 1`.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Jaccard} = \frac{a}{a+b+c}

    Notes
    -----
    The multiset variant is termed Ellenberg similarity :cite:`Ellenberg:1956`.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Jaccard instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        # Jaccard is the Tversky index with alpha = beta = 1 and no bias.
        kwargs.update(
            alpha=1,
            beta=1,
            bias=None,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
        )
        super(Jaccard, self).__init__(**kwargs)

    def sim(self, src: str, tar: str) -> float:
        r"""Return the Jaccard similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Jaccard similarity

        Examples
        --------
        >>> cmp = Jaccard()
        >>> cmp.sim('cat', 'hat')
        0.3333333333333333
        >>> cmp.sim('Niall', 'Neil')
        0.2222222222222222
        >>> cmp.sim('aluminum', 'Catalan')
        0.0625
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        """
        # Delegate to the Tversky computation, parameterized in __init__.
        return Tversky.sim(self, src, tar)

    def tanimoto_coeff(self, src: str, tar: str) -> float:
        """Return the Tanimoto distance between two strings.

        Tanimoto distance :cite:`Tanimoto:1958` is
        :math:`-log_{2} sim_{Tanimoto}(X, Y)`.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Tanimoto distance

        Examples
        --------
        >>> cmp = Jaccard()
        >>> cmp.tanimoto_coeff('cat', 'hat')
        -1.5849625007211563
        >>> cmp.tanimoto_coeff('Niall', 'Neil')
        -2.1699250014423126
        >>> cmp.tanimoto_coeff('aluminum', 'Catalan')
        -4.0
        >>> cmp.tanimoto_coeff('ATCG', 'TAGC')
        -inf

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        """
        coeff = self.sim(src, tar)
        # log2 of zero is undefined: disjoint sets map to -infinity.
        return log2(coeff) if coeff != 0 else float('-inf')
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_jaccard_nm.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._jaccard_nm.
Jaccard-NM similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['JaccardNM']
class JaccardNM(_TokenDistance):
    r"""Jaccard-NM similarity.

    For two sets X and Y and a population N, Jaccard-NM similarity
    :cite:`Naseem:2011` is

        .. math::

            sim_{JaccardNM}(X, Y) =
            \frac{|X \cap Y|}
            {|N| + |X \cap Y| + |X \setminus Y| + |Y \setminus X|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{JaccardNM} = \frac{a}{2(a+b+c)+d}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize JaccardNM instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0
        """
        kwargs.update(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
        )
        super(JaccardNM, self).__init__(**kwargs)

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Jaccard-NM similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Jaccard-NM similarity (in the range [0, 0.5])

        Examples
        --------
        >>> cmp = JaccardNM()
        >>> cmp.sim_score('cat', 'hat')
        0.002531645569620253
        >>> cmp.sim_score('Niall', 'Neil')
        0.0025220680958385876
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.0012484394506866417
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # An empty string has no tokens in common with anything.
        if not src or not tar:
            return 0.0
        self._tokenize(src, tar)
        # a / (a + b + c + n), where n = a + b + c + d is the population
        # total -- equivalently a / (2(a+b+c) + d) per the class docstring.
        shared = self._intersection_card()
        denominator = (
            shared
            + self._src_only_card()
            + self._tar_only_card()
            + self._population_unique_card()
        )
        return shared / denominator

    def sim(self, src: str, tar: str) -> float:
        """Return the Jaccard-NM similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Jaccard-NM similarity

        Examples
        --------
        >>> cmp = JaccardNM()
        >>> cmp.sim('cat', 'hat')
        0.005063291139240506
        >>> cmp.sim('Niall', 'Neil')
        0.005044136191677175
        >>> cmp.sim('aluminum', 'Catalan')
        0.0024968789013732834
        >>> cmp.sim('ATCG', 'TAGC')
        0.0

        .. versionadded:: 0.4.0
        """
        # sim_score peaks at 0.5 (when b = c = d = 0, a/(a+n) = 1/2), so
        # doubling normalizes the range to [0, 1].
        return self.sim_score(src, tar) * 2
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_jaro_winkler.py
================================================
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._jaro_winkler.
The distance._JaroWinkler module implements distance metrics based on
:cite:`Jaro:1989` and subsequent works:
- Jaro distance
- Jaro-Winkler distance
"""
from typing import Any
from ._distance import _Distance
from ..tokenizer import QGrams
__all__ = ['JaroWinkler']
class JaroWinkler(_Distance):
    """Jaro-Winkler distance.

    Jaro(-Winkler) distance is a string edit distance initially proposed by
    Jaro and extended by Winkler :cite:`Jaro:1989,Winkler:1990`.

    This is Python based on the C code for strcmp95:
    http://web.archive.org/web/20110629121242/http://www.census.gov/geo/msb/stand/strcmp.c
    :cite:`Winkler:1994`. The above file is a US Government publication and,
    accordingly, in the public domain.

    .. versionadded:: 0.3.6
    """

    def __init__(
        self,
        qval: int = 1,
        mode: str = 'winkler',
        long_strings: bool = False,
        boost_threshold: float = 0.7,
        scaling_factor: float = 0.1,
        **kwargs: Any
    ) -> None:
        """Initialize JaroWinkler instance.

        Parameters
        ----------
        qval : int
            The length of each q-gram (defaults to 1: character-wise matching)
        mode : str
            Indicates which variant of this distance metric to compute:

                - ``winkler`` -- computes the Jaro-Winkler distance (default)
                  which increases the score for matches near the start of the
                  word
                - ``jaro`` -- computes the Jaro distance

        long_strings : bool
            Set to True to "Increase the probability of a match when the number
            of matched characters is large. This option allows for a little
            more tolerance when the strings are large. It is not an appropriate
            test when comparing fixed length fields such as phone and social
            security numbers." (Used in 'winkler' mode only.)
        boost_threshold : float
            A value between 0 and 1, below which the Winkler boost is not
            applied (defaults to 0.7). (Used in 'winkler' mode only.)
        scaling_factor : float
            A value between 0 and 0.25, indicating by how much to boost scores
            for matching prefixes (defaults to 0.1). (Used in 'winkler' mode
            only.)

        .. versionadded:: 0.4.0
        """
        super(JaroWinkler, self).__init__(**kwargs)
        # Range validation of boost_threshold/scaling_factor happens in
        # sim(), not here, so construction itself never raises.
        self._qval = qval
        self._mode = mode
        self._long_strings = long_strings
        self._boost_threshold = boost_threshold
        self._scaling_factor = scaling_factor

    def sim(self, src: str, tar: str) -> float:
        """Return the Jaro or Jaro-Winkler similarity of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Jaro or Jaro-Winkler similarity

        Raises
        ------
        ValueError
            Unsupported boost_threshold assignment; boost_threshold must be
            between 0 and 1.
        ValueError
            Unsupported scaling_factor assignment; scaling_factor must be
            between 0 and 0.25.'

        Examples
        --------
        >>> cmp = JaroWinkler()
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.805
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.833333333333

        >>> cmp = JaroWinkler(mode='jaro')
        >>> round(cmp.sim('cat', 'hat'), 12)
        0.777777777778
        >>> round(cmp.sim('Niall', 'Neil'), 12)
        0.783333333333
        >>> round(cmp.sim('aluminum', 'Catalan'), 12)
        0.60119047619
        >>> round(cmp.sim('ATCG', 'TAGC'), 12)
        0.833333333333

        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        """
        # Validate Winkler-mode parameters before any computation.
        if self._mode == 'winkler':
            if self._boost_threshold > 1 or self._boost_threshold < 0:
                raise ValueError(
                    'Unsupported boost_threshold assignment; '
                    + 'boost_threshold must be between 0 and 1.'
                )
            if self._scaling_factor > 0.25 or self._scaling_factor < 0:
                raise ValueError(
                    'Unsupported scaling_factor assignment; '
                    + 'scaling_factor must be between 0 and 0.25.'
                )
        if src == tar:
            return 1.0
        # Tokenize both strings into q-grams (characters when qval=1).
        tokenizer = QGrams(self._qval)
        tokenizer.tokenize(src.strip())
        src_list = tokenizer.get_list()
        tokenizer.tokenize(tar.strip())
        tar_list = tokenizer.get_list()
        lens = len(src_list)
        lent = len(tar_list)
        # If either string is blank - return - added in Version 2
        if lens == 0 or lent == 0:
            return 0.0
        if lens > lent:
            search_range = lens
            minv = lent
        else:
            search_range = lent
            minv = lens
        # Zero out the flags
        src_flag = [0] * search_range
        tar_flag = [0] * search_range
        # Matching window: half the longer length, minus one (per strcmp95).
        search_range = max(0, search_range // 2 - 1)
        # Looking only within the search range,
        # count and flag the matched pairs.
        num_com = 0
        yl1 = lent - 1
        for i in range(lens):
            low_lim = (i - search_range) if (i >= search_range) else 0
            hi_lim = (i + search_range) if ((i + search_range) <= yl1) else yl1
            for j in range(low_lim, hi_lim + 1):
                if (tar_flag[j] == 0) and (tar_list[j] == src_list[i]):
                    tar_flag[j] = 1
                    src_flag[i] = 1
                    num_com += 1
                    break
        # If no characters in common - return
        if num_com == 0:
            return 0.0
        # Count the number of transpositions
        k = n_trans = 0
        for i in range(lens):
            if src_flag[i] != 0:
                j = 0
                for j in range(k, lent):  # pragma: no branch
                    if tar_flag[j] != 0:
                        k = j + 1
                        break
                if src_list[i] != tar_list[j]:
                    n_trans += 1
        # Each transposition was counted twice (once per string).
        n_trans //= 2
        # Main weight computation for Jaro distance
        weight = (
            num_com / lens + num_com / lent + (num_com - n_trans) / num_com
        )
        weight /= 3.0
        # Continue to boost the weight if the strings are similar
        # This is the Winkler portion of Jaro-Winkler distance
        if self._mode == 'winkler' and weight > self._boost_threshold:
            # Adjust for having up to the first 4 characters in common
            j = 4 if (minv >= 4) else minv
            i = 0
            while (i < j) and (src_list[i] == tar_list[i]):
                i += 1
            weight += i * self._scaling_factor * (1.0 - weight)
            # Optionally adjust for long strings.
            # After agreeing beginning chars, at least two more must agree and
            # the agreeing characters must be > .5 of remaining characters.
            if (
                self._long_strings
                and (minv > 4)
                and (num_com > i + 1)
                and (2 * num_com >= minv + i)
            ):
                weight += (1.0 - weight) * (
                    (num_com - i - 1) / (lens + lent - i * 2 + 2)
                )
        return weight
# When executed as a script, verify the docstring examples above.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_jensen_shannon.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._jensen_shannon.
Jensen-Shannon divergence
"""
from math import log
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['JensenShannon']
class JensenShannon(_TokenDistance):
    r"""Jensen-Shannon divergence.

    The Jensen-Shannon divergence :cite:`Dagan:1999` of two multi-sets X
    and Y is

        .. math::

            \begin{array}{rl}
            dist_{JS}(X, Y) &= log 2 + \frac{1}{2} \sum_{i \in X \cap Y}
            h(p(X_i) + p(Y_i)) - h(p(X_i)) - h(p(Y_i))

            h(x) &= -x log x

            p(X_i \in X) &= \frac{|X_i|}{|X|}
            \end{array}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize JensenShannon instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments


        .. versionadded:: 0.4.0

        """
        super().__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def dist_abs(self, src: str, tar: str) -> float:
        """Return the Jensen-Shannon divergence of two strings.

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Jensen-Shannon divergence

        Examples
        --------
        >>> cmp = JensenShannon()
        >>> cmp.dist_abs('cat', 'hat')
        0.3465735902799726
        >>> cmp.dist_abs('Niall', 'Neil')
        0.44051045978517045
        >>> cmp.dist_abs('aluminum', 'Catalan')
        0.6115216713968132
        >>> cmp.dist_abs('ATCG', 'TAGC')
        0.6931471805599453


        .. versionadded:: 0.4.0

        """
        # Identical strings diverge by exactly 0.
        if src == tar:
            return 0.0
        self._tokenize(src, tar)

        def _h(x: float) -> float:
            """Return the entropy term h(x) = -x log x (0 when x == 0)."""
            return -(x * log(x)) if x else 0.0

        total_src = sum(self._src_tokens.values())
        total_tar = sum(self._tar_tokens.values())

        # Start at log 2 and accumulate entropy differences over the
        # tokens shared by both strings.
        divergence = log(2)
        for token in self._intersection():
            prob_src = self._src_tokens[token] / total_src
            prob_tar = self._tar_tokens[token] / total_tar
            divergence += (
                _h(prob_src + prob_tar) - _h(prob_src) - _h(prob_tar)
            ) / 2
        return divergence

    def dist(self, src: str, tar: str) -> float:
        """Return the normalized Jensen-Shannon distance of two strings.

        The absolute divergence is bounded by log 2, so dividing by log 2
        normalizes the result to [0, 1].

        Parameters
        ----------
        src : str
            Source string for comparison
        tar : str
            Target string for comparison

        Returns
        -------
        float
            Normalized Jensen-Shannon distance

        Examples
        --------
        >>> cmp = JensenShannon()
        >>> cmp.dist('cat', 'hat')
        0.49999999999999994
        >>> cmp.dist('Niall', 'Neil')
        0.6355222557917826
        >>> cmp.dist('aluminum', 'Catalan')
        0.8822392827203127
        >>> cmp.dist('ATCG', 'TAGC')
        1.0


        .. versionadded:: 0.4.0

        """
        return self.dist_abs(src, tar) / log(2)
# Self-test: run this module's doctest examples when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_johnson.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._johnson.
Johnson similarity
"""
from typing import Any, Optional
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['Johnson']
class Johnson(_TokenDistance):
    r"""Johnson similarity.

    For two sets X and Y, the Johnson
    similarity :cite:`Johnson:1967` is

        .. math::

            sim_{Johnson}(X, Y) =
            \frac{|X \cap Y|}{|X|} + \frac{|X \cap Y|}{|Y|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{Johnson} =
            \frac{a}{a+b}+\frac{a}{a+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize Johnson instance.

        Parameters
        ----------
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            tokenizer=tokenizer, intersection_type=intersection_type, **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Johnson similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Johnson similarity

        Examples
        --------
        >>> cmp = Johnson()
        >>> cmp.sim_score('cat', 'hat')
        1.0
        >>> cmp.sim_score('Niall', 'Neil')
        0.7333333333333334
        >>> cmp.sim_score('aluminum', 'Catalan')
        0.2361111111111111
        >>> cmp.sim_score('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        # Identical strings score the measure's maximum of 2.0.
        if src == tar:
            return 2.0
        self._tokenize(src, tar)

        src_card = self._src_card()
        tar_card = self._tar_card()
        # Either side empty makes a ratio undefined; score 0 instead.
        if not (src_card and tar_card):
            return 0.0

        overlap = self._intersection_card()
        return overlap / src_card + overlap / tar_card

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Johnson similarity of two strings.

        The raw score lies in [0, 2], so halving maps it onto [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Johnson similarity

        Examples
        --------
        >>> cmp = Johnson()
        >>> cmp.sim('cat', 'hat')
        0.5
        >>> cmp.sim('Niall', 'Neil')
        0.3666666666666667
        >>> cmp.sim('aluminum', 'Catalan')
        0.11805555555555555
        >>> cmp.sim('ATCG', 'TAGC')
        0.0


        .. versionadded:: 0.4.0

        """
        return self.sim_score(src, tar) / 2
# Self-test: run this module's doctest examples when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_kendall_tau.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._kendall_tau.
Kendall's Tau correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KendallTau']
class KendallTau(_TokenDistance):
    r"""Kendall's Tau correlation.

    For two sets X and Y and a population N, Kendall's Tau correlation
    :cite:`Kendall:1938` is

        .. math::

            corr_{KendallTau}(X, Y) =
            \frac{2 \cdot (|X \cap Y| + |(N \setminus X) \setminus Y| -
            |X \triangle Y|)}{|N| \cdot (|N|-1)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{KendallTau} =
            \frac{2 \cdot (a+d-b-c)}{n \cdot (n-1)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize KendallTau instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Kendall's Tau correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kendall's Tau correlation

        Examples
        --------
        >>> cmp = KendallTau()
        >>> cmp.corr('cat', 'hat')
        0.0025282143508744493
        >>> cmp.corr('Niall', 'Neil')
        0.00250866630176975
        >>> cmp.corr('aluminum', 'Catalan')
        0.0024535291823735866
        >>> cmp.corr('ATCG', 'TAGC')
        0.0024891182526650506

        Notes
        -----
        This correlation is not necessarily bounded to [-1.0, 1.0], but will
        typically be within these bounds for real data.


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        both = self._intersection_card()
        only_src = self._src_only_card()
        only_tar = self._tar_only_card()
        neither = self._total_complement_card()
        population = self._population_unique_card()

        # Concordant minus discordant counts.
        numerator = both + neither - only_src - only_tar
        if not numerator:
            return 0.0
        # max(..., 1) guards against a population of size 1.
        return 2 * numerator / (population * max(population - 1, 1))

    def sim(self, src: str, tar: str) -> float:
        """Return the Kendall's Tau similarity of two strings.

        The Tau correlation is first clamped to the range [-1.0, 1.0] before
        being converted to a similarity value to ensure that the similarity
        is in the range [0.0, 1.0].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kendall's Tau similarity

        Examples
        --------
        >>> cmp = KendallTau()
        >>> cmp.sim('cat', 'hat')
        0.5012641071754372
        >>> cmp.sim('Niall', 'Neil')
        0.5012543331508849
        >>> cmp.sim('aluminum', 'Catalan')
        0.5012267645911868
        >>> cmp.sim('ATCG', 'TAGC')
        0.5012445591263325


        .. versionadded:: 0.4.0

        """
        clamped = min(1.0, max(-1.0, self.corr(src, tar)))
        return (1.0 + clamped) / 2.0
# Self-test: run this module's doctest examples when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_kent_foster_i.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._kent_foster_i.
Kent & Foster I similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KentFosterI']
class KentFosterI(_TokenDistance):
    r"""Kent & Foster I similarity.

    For two sets X and Y and a population N, Kent & Foster I similarity
    :cite:`Kent:1977`, :math:`K_{occ}`, is

        .. math::

            sim_{KentFosterI}(X, Y) =
            \frac{|X \cap Y| - \frac{|X|\cdot|Y|}{|X \cup Y|}}
            {|X \cap Y| - \frac{|X|\cdot|Y|}{|X \cup Y|} +
            |X \setminus Y| + |Y \setminus X|}

    Kent & Foster derived this from Cohen's :math:`\kappa` by "subtracting
    appropriate chance agreement correction figures from the numerators and
    denominators" to arrive at an occurrence reliability measure.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{KentFosterI} =
            \frac{a-\frac{(a+b)(a+c)}{a+b+c}}{a-\frac{(a+b)(a+c)}{a+b+c}+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize KentFosterI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Kent & Foster I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kent & Foster I similarity

        Examples
        --------
        >>> cmp = KentFosterI()
        >>> cmp.sim_score('cat', 'hat')
        -0.19999999999999996
        >>> cmp.sim_score('Niall', 'Neil')
        -0.23529411764705888
        >>> cmp.sim_score('aluminum', 'Catalan')
        -0.30434782608695654
        >>> cmp.sim_score('ATCG', 'TAGC')
        -0.3333333333333333


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()

        # Chance-agreement correction term; when it is zero the union is
        # empty or one set is empty, so no correction applies.
        chance = (a + b) * (a + c)
        if chance:
            corrected = a - chance / (a + b + c)
        else:
            corrected = a
        if not corrected:
            return 0.0
        return corrected / (corrected + b + c)

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Kent & Foster I similarity of two strings.

        The raw score lies in [-1, 0], so adding 1 maps it onto [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Kent & Foster I similarity

        Examples
        --------
        >>> cmp = KentFosterI()
        >>> cmp.sim('cat', 'hat')
        0.8
        >>> cmp.sim('Niall', 'Neil')
        0.7647058823529411
        >>> cmp.sim('aluminum', 'Catalan')
        0.6956521739130435
        >>> cmp.sim('ATCG', 'TAGC')
        0.6666666666666667


        .. versionadded:: 0.4.0

        """
        return 1.0 + self.sim_score(src, tar)
# Self-test: run this module's doctest examples when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_kent_foster_ii.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._kent_foster_ii.
Kent & Foster II similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KentFosterII']
class KentFosterII(_TokenDistance):
    r"""Kent & Foster II similarity.

    For two sets X and Y and a population N, Kent & Foster II similarity
    :cite:`Kent:1977`, :math:`K_{nonocc}`, is

        .. math::

            sim_{KentFosterII}(X, Y) =
            \frac{|(N \setminus X) \setminus Y| -
            \frac{|X \setminus Y|\cdot|Y \setminus X|}
            {|N \setminus (X \cap Y)|}}
            {|(N \setminus X) \setminus Y| -
            \frac{|X \setminus Y|\cdot|Y \setminus X|}
            {|N \setminus (X \cap Y)|} +
            |X \setminus Y| + |Y \setminus X|}

    Kent & Foster derived this from Cohen's :math:`\kappa` by "subtracting
    appropriate chance agreement correction figures from the numerators and
    denominators" to arrive at an non-occurrence reliability measure.

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{KentFosterII} =
            \frac{d-\frac{(b+d)(c+d)}{b+c+d}}{d-\frac{(b+d)(c+d)}{b+c+d}+b+c}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize KentFosterII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Kent & Foster II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kent & Foster II similarity

        Examples
        --------
        >>> cmp = KentFosterII()
        >>> cmp.sim_score('cat', 'hat')
        -0.0012804097311239404
        >>> cmp.sim_score('Niall', 'Neil')
        -0.002196997436837158
        >>> cmp.sim_score('aluminum', 'Catalan')
        -0.004784688995214218
        >>> cmp.sim_score('ATCG', 'TAGC')
        -0.0031989763275758767


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Chance-agreement correction term for non-occurrences; zero means
        # there is nothing to correct for.
        chance = (b + d) * (c + d)
        if chance:
            corrected = d - chance / (b + c + d)
        else:
            corrected = d
        if not corrected:
            return 0.0
        return corrected / (corrected + b + c)

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Kent & Foster II similarity of two strings.

        The raw score lies in [-1, 0], so adding 1 maps it onto [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Kent & Foster II similarity

        Examples
        --------
        >>> cmp = KentFosterII()
        >>> cmp.sim('cat', 'hat')
        0.998719590268876
        >>> cmp.sim('Niall', 'Neil')
        0.9978030025631628
        >>> cmp.sim('aluminum', 'Catalan')
        0.9952153110047858
        >>> cmp.sim('ATCG', 'TAGC')
        0.9968010236724241


        .. versionadded:: 0.4.0

        """
        return 1.0 + self.sim_score(src, tar)
# Self-test: run this module's doctest examples when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_koppen_i.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._koppen_i.
Köppen I correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KoppenI']
class KoppenI(_TokenDistance):
    r"""Köppen I correlation.

    For two sets X and Y and an alphabet N, provided that :math:`|X| = |Y|`,
    Köppen I correlation :cite:`Koppen:1870,Goodman:1959` is

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{|X| \cdot |N \setminus X| - |X \setminus Y|}
            {|X| \cdot |N \setminus X|}

    To support cases where :math:`|X| \neq |Y|`, this class implements a slight
    variation, while still providing the expected results when
    :math:`|X| = |Y|`:

        .. math::

            corr_{KoppenI}(X, Y) =
            \frac{\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}-
            \frac{|X \triangle Y|}{2}}
            {\frac{|X|+|Y|}{2} \cdot
            \frac{|N \setminus X|+|N \setminus Y|}{2}}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{KoppenI} =
            \frac{\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}-
            \frac{b+c}{2}}
            {\frac{2a+b+c}{2} \cdot \frac{2d+b+c}{2}}

    Notes
    -----
    In the usual case all of the above values should be proportional to the
    total number of samples n. I.e., a, b, c, d, & n should all be divided by
    n prior to calculating the coefficient. This class's default normalizer
    is, accordingly, 'proportional'.

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        normalizer: str = 'proportional',
        **kwargs: Any
    ) -> None:
        """Initialize KoppenI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        normalizer : str
            Specifies the normalization type. See :ref:`normalizer
            <normalizer>` description in :py:class:`_TokenDistance`
            for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            normalizer=normalizer,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Köppen I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I correlation

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.corr('cat', 'hat')
        0.49615384615384617
        >>> cmp.corr('Niall', 'Neil')
        0.3575056927658083
        >>> cmp.corr('aluminum', 'Catalan')
        0.1068520131813188
        >>> cmp.corr('ATCG', 'TAGC')
        -0.006418485237483896


        .. versionadded:: 0.4.0

        """
        if src == tar:
            return 1.0
        self._tokenize(src, tar)

        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # Product of mean set size (2a+b+c)/2 and mean complement size
        # (2d+b+c)/2; serves as the denominator of the coefficient.
        mean_size_prod = (2 * a + b + c) * (2 * d + b + c) / 4
        numerator = mean_size_prod - (b + c) / 2
        if not numerator:
            return 0.0
        return numerator / mean_size_prod

    def sim(self, src: str, tar: str) -> float:
        """Return the Köppen I similarity of two strings.

        The correlation in [-1, 1] is shifted and halved into [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen I similarity

        Examples
        --------
        >>> cmp = KoppenI()
        >>> cmp.sim('cat', 'hat')
        0.7480769230769231
        >>> cmp.sim('Niall', 'Neil')
        0.6787528463829041
        >>> cmp.sim('aluminum', 'Catalan')
        0.5534260065906594
        >>> cmp.sim('ATCG', 'TAGC')
        0.49679075738125805


        .. versionadded:: 0.4.0

        """
        return (1.0 + self.corr(src, tar)) / 2.0
# Self-test: run this module's doctest examples when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_koppen_ii.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._koppen_ii.
Köppen II similarity
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KoppenII']
class KoppenII(_TokenDistance):
    r"""Köppen II similarity.

    For two sets X and Y, Köppen II similarity
    :cite:`Koppen:1870,Goodman:1959` is

        .. math::

            sim_{KoppenII}(X, Y) =
            |X \cap Y| + \frac{|X \setminus Y| + |Y \setminus X|}{2}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            sim_{KoppenII} =
            a + \frac{b+c}{2}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize KoppenII instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.


        .. versionadded:: 0.4.0

        """
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def sim_score(self, src: str, tar: str) -> float:
        """Return the Köppen II similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Köppen II similarity

        Examples
        --------
        >>> cmp = KoppenII()
        >>> cmp.sim_score('cat', 'hat')
        4.0
        >>> cmp.sim_score('Niall', 'Neil')
        5.5
        >>> cmp.sim_score('aluminum', 'Catalan')
        8.5
        >>> cmp.sim_score('ATCG', 'TAGC')
        5.0


        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        shared = self._intersection_card()
        src_extra = self._src_only_card()
        tar_extra = self._tar_only_card()
        # Shared tokens count fully; unshared tokens count half.
        return shared + (src_extra + tar_extra) / 2

    def sim(self, src: str, tar: str) -> float:
        """Return the normalized Köppen II similarity of two strings.

        The raw score is divided by the union cardinality to map it
        onto [0, 1].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Normalized Köppen II similarity

        Examples
        --------
        >>> cmp = KoppenII()
        >>> cmp.sim('cat', 'hat')
        0.6666666666666666
        >>> cmp.sim('Niall', 'Neil')
        0.6111111111111112
        >>> cmp.sim('aluminum', 'Catalan')
        0.53125
        >>> cmp.sim('ATCG', 'TAGC')
        0.5


        .. versionadded:: 0.4.0

        """
        # Equal strings (including both empty) short-circuit to avoid a
        # zero-cardinality union.
        if src == tar:
            return 1.0
        return self.sim_score(src, tar) / self._union_card()
# Self-test: run this module's doctest examples when executed directly.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_kuder_richardson.py
================================================
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._kuder_richardson.
Kuder & Richardson correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KuderRichardson']
class KuderRichardson(_TokenDistance):
    r"""Kuder & Richardson correlation.

    For two sets X and Y and a population N, Kuder & Richardson similarity
    :cite:`Kuder:1937,Cronbach:1951` is

        .. math::

            corr_{KuderRichardson}(X, Y) =
            \frac{4(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}
            {|X| \cdot |N \setminus X| +
            |Y| \cdot |N \setminus Y| +
            2(|X \cap Y| \cdot |(N \setminus X) \setminus Y| -
            |X \setminus Y| \cdot |Y \setminus X|)}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{KuderRichardson} =
            \frac{4(ad-bc)}{(a+b)(c+d) + (a+c)(b+d) + 2(ad-bc)}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize KuderRichardson instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        # All configuration is handled by the _TokenDistance base class.
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Kuder & Richardson correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kuder & Richardson correlation

        Examples
        --------
        >>> cmp = KuderRichardson()
        >>> cmp.corr('cat', 'hat')
        0.6643835616438356
        >>> cmp.corr('Niall', 'Neil')
        0.5285677463699631
        >>> cmp.corr('aluminum', 'Catalan')
        0.19499521400246136
        >>> cmp.corr('ATCG', 'TAGC')
        -0.012919896640826873

        .. versionadded:: 0.4.0

        """
        # Identical strings are perfectly correlated by definition.
        if src == tar:
            return 1.0

        self._tokenize(src, tar)

        # Cardinalities of the 2x2 confusion table: a+b+c+d = n.
        a = self._intersection_card()
        b = self._src_only_card()
        c = self._tar_only_card()
        d = self._total_complement_card()

        # ad-bc appears in both the numerator and the denominator.
        cross = a * d - b * c
        if not cross:
            # A zero numerator short-circuits to 0, even in the
            # degenerate case where the denominator would also be zero.
            return 0.0

        denominator = (a + b) * (c + d) + (a + c) * (b + d) + 2 * cross
        if not denominator:
            # The measure is unbounded in the negative; a vanishing
            # denominator with a nonzero numerator is flagged as -inf.
            return float('-inf')
        return (4 * cross) / denominator

    def sim(self, src: str, tar: str) -> float:
        """Return the Kuder & Richardson similarity of two strings.

        Since Kuder & Richardson correlation is unbounded in the negative,
        this measure is first clamped to [-1.0, 1.0].

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kuder & Richardson similarity

        Examples
        --------
        >>> cmp = KuderRichardson()
        >>> cmp.sim('cat', 'hat')
        0.8321917808219178
        >>> cmp.sim('Niall', 'Neil')
        0.7642838731849815
        >>> cmp.sim('aluminum', 'Catalan')
        0.5974976070012307
        >>> cmp.sim('ATCG', 'TAGC')
        0.4935400516795866

        .. versionadded:: 0.4.0

        """
        # Clamp below at -1.0 (corr may reach -inf), then map the
        # [-1, 1] correlation linearly onto [0, 1].
        clamped = max(-1.0, self.corr(src, tar))
        return (1.0 + clamped) / 2.0
if __name__ == '__main__':
    # When run as a script, execute the doctest examples embedded in
    # this module's docstrings as a quick self-test.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_kuhns_i.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
"""abydos.distance._kuhns_i.
Kuhns I correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KuhnsI']
class KuhnsI(_TokenDistance):
    r"""Kuhns I correlation.

    For two sets X and Y and a population N, Kuhns I correlation
    :cite:`Kuhns:1965`, the excess of separation over its independence value
    (S), is

        .. math::

            corr_{KuhnsI}(X, Y) =
            \frac{2\delta(X, Y)}{|N|}

    where

        .. math::

            \delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|}

    In :ref:`2x2 confusion table terms <confusion_table>`, where a+b+c+d=n,
    this is

        .. math::

            corr_{KuhnsI} =
            \frac{2\delta(a+b, a+c)}{n}

    where

        .. math::

            \delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n}

    .. versionadded:: 0.4.0
    """

    def __init__(
        self,
        alphabet: Optional[
            Union[TCounter[str], Sequence[str], Set[str], int]
        ] = None,
        tokenizer: Optional[_Tokenizer] = None,
        intersection_type: str = 'crisp',
        **kwargs: Any
    ) -> None:
        """Initialize KuhnsI instance.

        Parameters
        ----------
        alphabet : Counter, collection, int, or None
            This represents the alphabet of possible tokens.
            See :ref:`alphabet <alphabet>` description in
            :py:class:`_TokenDistance` for details.
        tokenizer : _Tokenizer
            A tokenizer instance from the :py:mod:`abydos.tokenizer` package
        intersection_type : str
            Specifies the intersection type, and set type as a result:
            See :ref:`intersection_type <intersection_type>` description in
            :py:class:`_TokenDistance` for details.
        **kwargs
            Arbitrary keyword arguments

        Other Parameters
        ----------------
        qval : int
            The length of each q-gram. Using this parameter and tokenizer=None
            will cause the instance to use the QGram tokenizer with this
            q value.
        metric : _Distance
            A string distance measure class for use in the ``soft`` and
            ``fuzzy`` variants.
        threshold : float
            A threshold value, similarities above which are counted as
            members of the intersection for the ``fuzzy`` variant.

        .. versionadded:: 0.4.0

        """
        # All configuration is handled by the _TokenDistance base class.
        super().__init__(
            alphabet=alphabet,
            tokenizer=tokenizer,
            intersection_type=intersection_type,
            **kwargs
        )

    def corr(self, src: str, tar: str) -> float:
        """Return the Kuhns I correlation of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kuhns I correlation

        Examples
        --------
        >>> cmp = KuhnsI()
        >>> cmp.corr('cat', 'hat')
        0.005049979175343606
        >>> cmp.corr('Niall', 'Neil')
        0.005004425239483548
        >>> cmp.corr('aluminum', 'Catalan')
        0.0023140898210880765
        >>> cmp.corr('ATCG', 'TAGC')
        -8.134631403581842e-05

        .. versionadded:: 0.4.0

        """
        self._tokenize(src, tar)

        a = self._intersection_card()
        n = self._population_unique_card()
        # Product of the two marginals, |X| * |Y| = (a+b)(a+c).
        marginals = (a + self._src_only_card()) * (a + self._tar_only_card())

        # delta is the intersection's excess over its expected size
        # under independence; with a zero marginal product, delta is
        # simply a (avoiding the needless division).
        delta = a if not marginals else a - marginals / n

        return 2 * delta / n if delta else 0.0

    def sim(self, src: str, tar: str) -> float:
        """Return the Kuhns I similarity of two strings.

        Parameters
        ----------
        src : str
            Source string (or QGrams/Counter objects) for comparison
        tar : str
            Target string (or QGrams/Counter objects) for comparison

        Returns
        -------
        float
            Kuhns I similarity

        Examples
        --------
        >>> cmp = KuhnsI()
        >>> cmp.sim('cat', 'hat')
        0.5050499791753436
        >>> cmp.sim('Niall', 'Neil')
        0.5050044252394835
        >>> cmp.sim('aluminum', 'Catalan')
        0.502314089821088
        >>> cmp.sim('ATCG', 'TAGC')
        0.49991865368596416

        .. versionadded:: 0.4.0

        """
        # Shift the correlation so that independence maps to 0.5.
        return self.corr(src, tar) + 0.5
if __name__ == '__main__':
    # When run as a script, execute the doctest examples embedded in
    # this module's docstrings as a quick self-test.
    import doctest
    doctest.testmod()
================================================
FILE: abydos/distance/_kuhns_ii.py
================================================
# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see .
"""abydos.distance._kuhns_ii.
Kuhns II correlation
"""
from typing import Any, Counter as TCounter, Optional, Sequence, Set, Union
from ._token_distance import _TokenDistance
from ..tokenizer import _Tokenizer
__all__ = ['KuhnsII']
class KuhnsII(_TokenDistance):
r"""Kuhns II correlation.
For two sets X and Y and a population N, Kuhns II correlation
:cite:`Kuhns:1965`, the excess of rectangular distance over its
independence value (R), is
.. math::
corr_{KuhnsII}(X, Y) =
\frac{\delta(X, Y)}{max(|X|, |Y|)}
where
.. math::
\delta(X, Y) = |X \cap Y| - \frac{|X| \cdot |Y|}{|N|}
In :ref:`2x2 confusion table terms `, where a+b+c+d=n,
this is
.. math::
corr_{KuhnsII} =
\frac{\delta(a+b, a+c)}{max(a+b, a+c)}
where
.. math::
\delta(a+b, a+c) = a - \frac{(a+b)(a+c)}{n}
.. versionadded:: 0.4.0
"""
def __init__(
self,
alphabet: Optional[
Union[TCounter[str], Sequence[str], Set[str], int]
] = None,
tokenizer: Optional[_Tokenizer] = None,
intersection_type: str = 'crisp',
**kwargs: Any
) -> None:
"""Initialize KuhnsII instance.
Parameters
----------
alphabet : Counter, collection, int, or None
This represents the alphabet of possible tokens.
See :ref:`alphabet ` description in
:py:class:`_TokenDistance` for details.
tokenizer : _Tokenizer
A tokenizer instance from the :py:mod:`abydos.tokenizer` package
intersection_type : str
Specifies the intersection type, and set type as a result:
See :ref:`intersection_type