Repository: neocl/jamdict Branch: main Commit: 85c66c190649 Files: 55 Total size: 453.7 KB Directory structure: gitextract_xlbja41h/ ├── .gitignore ├── .gitmodules ├── LICENSE ├── MANIFEST.in ├── README.md ├── TODO.md ├── _config.yml ├── data/ │ └── README.md ├── docs/ │ ├── Makefile │ ├── api.rst │ ├── conf.py │ ├── contributing.rst │ ├── index.rst │ ├── install.rst │ ├── make.bat │ ├── recipes.rst │ ├── requirements.txt │ ├── tutorials.rst │ └── updates.rst ├── jamdict/ │ ├── __init__.py │ ├── __main__.py │ ├── __version__.py │ ├── config.py │ ├── data/ │ │ ├── config_template.json │ │ ├── setup_jmdict.sql │ │ ├── setup_jmnedict.sql │ │ └── setup_kanjidic2.sql │ ├── jmdict.py │ ├── jmdict_sqlite.py │ ├── jmnedict_sqlite.py │ ├── kanjidic2.py │ ├── kanjidic2_sqlite.py │ ├── krad.py │ ├── tools.py │ └── util.py ├── jamdict_demo.py ├── jamdol-flask.py ├── jmd ├── logging.json ├── release.sh ├── requirements.txt ├── run ├── setup.py ├── test/ │ ├── __init__.py │ ├── data/ │ │ ├── JMdict_mini.xml │ │ ├── jamdict.json │ │ ├── jmendict_mini.xml │ │ └── kanjidic2_mini.xml │ ├── logging.json │ ├── test_jamdict.py │ ├── test_jmdict_sqlite.py │ ├── test_jmnedict.py │ ├── test_kanjidic2_sqlite.py │ └── test_krad.py └── test.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea/ test/data/test.db *.py~ *.sh~ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # IPython Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # dotenv .env # virtualenv venv/ ENV/ # Spyder project settings .spyderproject # Rope project settings .ropeproject ================================================ FILE: .gitmodules ================================================ ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2016 Le Tuan Anh Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: MANIFEST.in ================================================ include README.rst include CHANGES.md include LICENSE include requirements*.txt recursive-include jamdict/data/ *.sql recursive-include jamdict/data/ *.json recursive-include jamdict/data/ *.gz ================================================ FILE: README.md ================================================ # Jamdict [Jamdict](https://github.com/neocl/jamdict) is a Python 3 library for manipulating Jim Breen's JMdict, KanjiDic2, JMnedict and kanji-radical mappings. [![ReadTheDocs Badge](https://readthedocs.org/projects/jamdict/badge/?version=latest&style=plastic)](https://jamdict.readthedocs.io/) **Documentation:** https://jamdict.readthedocs.io/ # Main features * Support querying different Japanese language resources - Japanese-English dictionary JMDict - Kanji dictionary KanjiDic2 - Kanji-radical and radical-kanji maps KRADFILE/RADKFILE - Japanese Proper Names Dictionary (JMnedict) * Fast look up (dictionaries are stored in SQLite databases) * Command-line lookup tool [(Example)](#command-line-tools) [Contributors](#contributors) are welcome! 🙇. If you want to help, please see [Contributing](https://jamdict.readthedocs.io/en/latest/contributing.html) page. # Try Jamdict out Jamdict is used in [Jamdict-web](https://jamdict.herokuapp.com/) - a web-based free and open-source Japanese reading assistant software. 
Please try out the demo instance online at: https://jamdict.herokuapp.com/ There also is a demo [Jamdict virtual machine](https://replit.com/@tuananhle/jamdict-demo) online for trying out Jamdict Python code on Repl.it: https://replit.com/@tuananhle/jamdict-demo # Installation Jamdict & Jamdict database are both available on [PyPI](https://pypi.org/project/jamdict/) and can be installed using pip ```bash pip install --upgrade jamdict jamdict-data ``` # Sample jamdict Python code ```python from jamdict import Jamdict jam = Jamdict() # use wildcard matching to find anything starts with 食べ and ends with る result = jam.lookup('食べ%る') # print all word entries for entry in result.entries: print(entry) # [id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. a salary)/to live off/to subsist on # [id#1358300] たべすぎる (食べ過ぎる) : to overeat ((Ichidan verb|transitive verb)) # [id#1852290] たべつける (食べ付ける) : to be used to eating ((Ichidan verb|transitive verb)) # [id#2145280] たべはじめる (食べ始める) : to start eating ((Ichidan verb)) # [id#2449430] たべかける (食べ掛ける) : to start eating ((Ichidan verb)) # [id#2671010] たべなれる (食べ慣れる) : to be used to eating/to become used to eating/to be accustomed to eating/to acquire a taste for ((Ichidan verb)) # [id#2765050] たべられる (食べられる) : 1. to be able to eat ((Ichidan verb|intransitive verb)) 2. 
to be edible/to be good to eat ((pre-noun adjectival (rentaishi))) # [id#2795790] たべくらべる (食べ比べる) : to taste and compare several dishes (or foods) of the same type ((Ichidan verb|transitive verb)) # [id#2807470] たべあわせる (食べ合わせる) : to eat together (various foods) ((Ichidan verb)) # print all related characters for c in result.chars: print(repr(c)) # 食:9:eat,food # 喰:12:eat,drink,receive (a blow),(kokuji) # 過:12:overdo,exceed,go beyond,error # 付:5:adhere,attach,refer to,append # 始:8:commence,begin # 掛:11:hang,suspend,depend,arrive at,tax,pour # 慣:14:accustomed,get used to,become experienced # 比:4:compare,race,ratio,Philippines # 合:6:fit,suit,join,0.1 ``` ## Command line tools To make sure that jamdict is configured properly, try to look up a word using command line ```bash python3 -m jamdict lookup 言語学 ======================================== Found entries ======================================== Entry: 1264430 | Kj: 言語学 | Kn: げんごがく -------------------- 1. linguistics ((noun (common) (futsuumeishi))) ======================================== Found characters ======================================== Char: 言 | Strokes: 7 -------------------- Readings: yan2, eon, 언, Ngôn, Ngân, ゲン, ゴン, い.う, こと Meanings: say, word Char: 語 | Strokes: 14 -------------------- Readings: yu3, yu4, eo, 어, Ngữ, Ngứ, ゴ, かた.る, かた.らう Meanings: word, speech, language Char: 学 | Strokes: 8 -------------------- Readings: xue2, hag, 학, Học, ガク, まな.ぶ Meanings: study, learning, science No name was found. ``` ## Using KRAD/RADK mapping Jamdict has built-in support for KRAD/RADK (i.e. kanji-radical and radical-kanji mapping). The terminology of radicals/components used by Jamdict can be different from else where. - A radical in Jamdict is a principal component, each character has only one radical. - A character may be decomposed into several writing components. By default jamdict provides two maps: - jam.krad is a Python dict that maps characters to list of components. 
- jam.radk is a Python dict that maps each available component to a list of characters. ```python # Find all writing components (often called "radicals") of the character 雲 print(jam.krad['雲']) # ['一', '雨', '二', '厶'] # Find all characters with the component 鼎 chars = jam.radk['鼎'] print(chars) # {'鼏', '鼒', '鼐', '鼎', '鼑'} # look up the characters info result = jam.lookup(''.join(chars)) for c in result.chars: print(c, c.meanings()) # 鼏 ['cover of tripod cauldron'] # 鼒 ['large tripod cauldron with small'] # 鼐 ['incense tripod'] # 鼎 ['three legged kettle'] # 鼑 [] ``` ## Finding name entities ```python # Find all names with 鈴木 inside result = jam.lookup('%鈴木%') for name in result.names: print(name) # [id#5025685] キューティーすずき (キューティー鈴木) : Kyu-ti- Suzuki (1969.10-) (full name of a particular person) # [id#5064867] パパイヤすずき (パパイヤ鈴木) : Papaiya Suzuki (full name of a particular person) # [id#5089076] ラジカルすずき (ラジカル鈴木) : Rajikaru Suzuki (full name of a particular person) # [id#5259356] きつねざきすずきひなた (狐崎鈴木日向) : Kitsunezakisuzukihinata (place name) # [id#5379158] こすずき (小鈴木) : Kosuzuki (family or surname) # [id#5398812] かみすずき (上鈴木) : Kamisuzuki (family or surname) # [id#5465787] かわすずき (川鈴木) : Kawasuzuki (family or surname) # [id#5499409] おおすずき (大鈴木) : Oosuzuki (family or surname) # [id#5711308] すすき (鈴木) : Susuki (family or surname) # ... ``` ## Exact matching Use exact matching for faster search. Find the word 花火 by idseq (1194580) ```python >>> result = jam.lookup('id#1194580') >>> print(result.entries[0]) [id#1194580] はなび (花火) : fireworks ((noun (common) (futsuumeishi))) ``` Find an exact name 花火 by idseq (5170462) ```python >>> result = jam.lookup('id#5170462') >>> print(result.names[0]) [id#5170462] はなび (花火) : Hanabi (female given name or forename) ``` See `jamdict_demo.py` and `jamdict/tools.py` for more information. 
# Useful links * JMdict: [http://edrdg.org/jmdict/edict_doc.html](http://edrdg.org/jmdict/edict_doc.html) * kanjidic2: [https://www.edrdg.org/wiki/index.php/KANJIDIC_Project](https://www.edrdg.org/wiki/index.php/KANJIDIC_Project) * JMnedict: [https://www.edrdg.org/enamdict/enamdict_doc.html](https://www.edrdg.org/enamdict/enamdict_doc.html) * KRADFILE: [http://www.edrdg.org/krad/kradinf.html](http://www.edrdg.org/krad/kradinf.html) # Contributors - [Le Tuan Anh](https://github.com/letuananh) (Maintainer) - [alt-romes](https://github.com/alt-romes) - [Matteo Fumagalli](https://github.com/matteofumagalli1275) - [Reem Alghamdi](https://github.com/reem-codes) - [Techno-coder](https://github.com/Techno-coder) ================================================ FILE: TODO.md ================================================ ================================================ FILE: _config.yml ================================================ theme: jekyll-theme-minimal ================================================ FILE: data/README.md ================================================ Copy dictionary files (JMdict_e.xml, kanjidic2.xml, kradfile, etc.) here ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line, and also # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) serve: cd _build/dirhtml && python -m http.server 7001 .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
%: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/api.rst ================================================ .. _api_index: jamdict APIs ============ An overview of jamdict modules. .. warning:: 👉 ⚠️ THIS SECTION IS STILL UNDER CONSTRUCTION ⚠️ Help is much needed. .. module:: jamdict .. autoclass:: jamdict.util.Jamdict :members: :member-order: groupwise :exclude-members: get_ne, has_jmne, import_data, jmnedict .. autoclass:: jamdict.util.LookupResult :members: :member-order: groupwise .. autoclass:: jamdict.util.IterLookupResult :members: :member-order: groupwise .. module:: jamdict.jmdict .. autoclass:: JMDEntry :members: .. module:: jamdict.kanjidic2 .. autoclass:: Character :members: .. automodule:: jamdict.krad ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # import os import sys sys.path.insert(0, os.path.abspath('../')) # -- Project information ----------------------------------------------------- project = 'jamdict' copyright = '2021, Le Tuan Anh' author = 'Le Tuan Anh' # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'sphinx.ext.doctest'] # -- Highlight code block ----------------- pygments_style = 'sphinx' # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'bizstyle' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] ================================================ FILE: docs/contributing.rst ================================================ .. _contributing: Contributing ============ There are many ways to contribute to the Jamdict project. The one that Jamdict development team are focusing on at the moment are: - Fixing :ref:`existing bugs ` - Improving query functions - Improving :ref:`documentation ` - Keeping jamdict database up to date If you have some suggestions or bug reports, please share on `jamdict issues tracker `_. .. _contrib_bugfix: Fixing bugs ----------- If you found a bug please report at https://github.com/neocl/jamdict/issues When it is possible, please also share how to reproduce the bugs and a snapshot of jamdict info to help with the bug finding process. .. code:: bash python3 -m jamdict info Pull requests are welcome. .. _contrib_docs: Updating Documentation ---------------------- 1. Fork `jamdict `_ repository to your own Github account. #. Clone `jamdict` repository to your local machine. .. 
code:: bash git clone https://github.com//jamdict #. Create a virtual environment (optional, but highly recommended) .. code:: bash # if you use virtualenvwrapper mkvirtualenv jamdev workon jamdev # if you use Python venv python3 -m venv .env . .env/bin/activate python3 -m pip install --upgrade pip wheel Sphinx #. Build the docs .. code:: bash cd jamdict/docs # compile the docs make dirhtml # serve the docs using Python3 built-in development server # Note: this requires Python >= 3.7 to support --directory python3 -m http.server 7000 --directory _build/dirhtml # if you use earlier Python 3, you may use cd _build/dirhtml python3 -m http.server 7000 #. Now the docs should be ready to view at http://localhost:7000 . You can visit that URL on your browser to view the docs. #. More information: - Sphinx tutorial: https://sphinx-tutorial.readthedocs.io/start/ - Using `virtualenv`: https://virtualenvwrapper.readthedocs.io/en/latest/install.html - Using `venv`: https://docs.python.org/3/library/venv.html .. _contrib_dev: Development ----------- Development contributions are welcome. Setting up development environment for Jamdict should be similar to :ref:`contrib_docs`. Please contact the development team if you need more information: https://github.com/neocl/jamdict/issues ================================================ FILE: docs/index.rst ================================================ Jamdict's documentation! ======================== `Jamdict `_ is a Python 3 library for manipulating Jim Breen's JMdict, KanjiDic2, JMnedict and kanji-radical mappings. Welcome ------- Are you new to this documentation? Here are some useful pages: - Want to try out Jamdict package? Try `Jamdict online demo `_ - Want some useful code samples? See :ref:`recipes`. - Want to look deeper into the package? See :ref:`api_index`. - If you want to help developing Jamdict, please visit :ref:`contributing` page. 
Main features ------------- - Support querying different Japanese language resources - Japanese-English dictionary JMDict - Kanji dictionary KanjiDic2 - Kanji-radical and radical-kanji maps KRADFILE/RADKFILE - Japanese Proper Names Dictionary (JMnedict) - Fast look up (dictionaries are stored in SQLite databases) - Command-line lookup tool :ref:`(Example) ` .. Hide this for now - jamdol (jamdol-flask) - a Python/Flask server that provides Jamdict lookup via REST API (experimental state) :ref:`Contributors ` are welcome! 🙇. If you want to help developing Jamdict, please visit :ref:`contributing` page. Installation ------------ Jamdict and `jamdict-data `_ are both `available on PyPI `_ and can be installed using pip. For more information please see :ref:`installpage` page. .. code:: bash pip install jamdict jamdict-data Also, there is an online demo Jamdict virtual machine to try out on Repl.it https://replit.com/@tuananhle/jamdict-demo Sample jamdict Python code -------------------------- Looking up words >>> from jamdict import Jamdict >>> jam = Jamdict() >>> result = jam.lookup('はな') >>> for word in result.entries: ... print(word) ... [id#1194500] はな (花) : 1. flower/blossom/bloom/petal ((noun (common) (futsuumeishi))) 2. cherry blossom 3. beauty 4. blooming (esp. of cherry blossoms) 5. ikebana 6. Japanese playing cards 7. (the) best [id#1486720] はな (鼻) : nose ((noun (common) (futsuumeishi))) [id#1581610] はし (端) : 1. end (e.g. of street)/tip/point/edge/margin ((noun (common) (futsuumeishi))) 2. beginning/start/first 3. odds and ends/scrap/odd bit/least [id#1634180] はな (洟) : snivel/nasal mucus/snot ((noun (common) (futsuumeishi))) Looking up kanji characters >>> for c in result.chars: ... print(repr(c)) ... 花:7:flower 華:10:splendor,flower,petal,shine,luster,ostentatious,showy,gay,gorgeous 鼻:14:nose,snout 端:14:edge,origin,end,point,border,verge,cape 洟:9:tear,nasal discharge Looking up named entities >>> result = jam.lookup('ディズニー%') >>> for name in result.names: ... 
print(name) ... [id#5053163] ディズニー : Disney (family or surname/company name) [id#5741091] ディズニーランド : Disneyland (place name) See :ref:`recipes` for more code samples. .. _commandline: Command line tools ------------------ Jamdict can be used from the command line. .. code:: bash python3 -m jamdict lookup 言語学 ======================================== Found entries ======================================== Entry: 1264430 | Kj: 言語学 | Kn: げんごがく -------------------- 1. linguistics ((noun (common) (futsuumeishi))) ======================================== Found characters ======================================== Char: 言 | Strokes: 7 -------------------- Readings: yan2, eon, 언, Ngôn, Ngân, ゲン, ゴン, い.う, こと Meanings: say, word Char: 語 | Strokes: 14 -------------------- Readings: yu3, yu4, eo, 어, Ngữ, Ngứ, ゴ, かた.る, かた.らう Meanings: word, speech, language Char: 学 | Strokes: 8 -------------------- Readings: xue2, hag, 학, Học, ガク, まな.ぶ Meanings: study, learning, science No name was found. To show help you may use .. code:: bash python3 -m jamdict --help Documentation ------------- .. toctree:: :maxdepth: 2 install tutorials recipes api contributing updates Other info ========== Release Notes ------------- Release notes is available :ref:`here `. .. 
_contributors: Contributors ------------ - `Le Tuan Anh `__ (Maintainer) - `alt-romes `__ - `Matteo Fumagalli `__ - `Reem Alghamdi `__ - `Techno-coder `__ Useful links ------------ - jamdict on PyPI: https://pypi.org/project/jamdict/ - jamdict source code: https://github.com/neocl/jamdict/ - Documentation: https://jamdict.readthedocs.io/ - Dictionaries - JMdict: http://edrdg.org/jmdict/edict_doc.html - kanjidic2: https://www.edrdg.org/wiki/index.php/KANJIDIC_Project - JMnedict: https://www.edrdg.org/enamdict/enamdict_doc.html - KRADFILE: http://www.edrdg.org/krad/kradinf.html Indices and tables ------------------ * :ref:`genindex` * :ref:`modindex` * :ref:`search` ================================================ FILE: docs/install.rst ================================================ .. _installpage: Installation ============= jamdict and jamdict dictionary data are both available on PyPI and can be installed using `pip`. .. code-block:: bash pip install --user jamdict jamdict-data # pip script sometimes doesn't work properly # so you may want to try this instead python3 -m pip install jamdict jamdict-data .. note:: When you use :code:`pip install` in a virtual environment, especially the ones created via :code:`python3 -m venv`, wheel support can be missing. :code:`jamdict-data` relies on wheel/pip to extract xz-compressed database and this may cause a problem. If you encounter any error, please make sure that wheel is available .. code-block:: bash # list all available packages in pip pip list # ensure wheel support in pip pip install -U wheel You may need to uninstall :code:`jamdict-data` before reinstalling it. .. code-block:: bash pip uninstall jamdict-data Download database file manually ------------------------------- This should not be useful anymore from version 0.1a8 with the release of the `jamdict_data `_ package on PyPI. If for some reason you want to download and install jamdict database by yourself, here are the steps: 1. 
Download the official, pre-compiled jamdict database (``jamdict-0.1a7.tar.xz``) from Google Drive https://drive.google.com/drive/u/1/folders/1z4zF9ImZlNeTZZplflvvnpZfJp3WVLPk 2. Extract and copy ``jamdict.db`` to the jamdict data folder (defaults to ``~/.jamdict/data/jamdict.db``) 3. To know where to copy data files you can use `python3 -m jamdict info` command via a terminal: .. code:: bash python3 -m jamdict info # Jamdict 0.1a8 # Python library for manipulating Jim Breen's JMdict, KanjiDic2, KRADFILE and JMnedict # # Basic configuration # ------------------------------------------------------------ # JAMDICT_HOME : ~/local/jamdict # jamdict_data availability: False # Config file location : /home/tuananh/.jamdict/config.json # # Custom Data files # ------------------------------------------------------------ # Jamdict DB location: ~/local/jamdict/data/jamdict.db - [OK] # JMDict XML file : ~/local/jamdict/data/JMdict_e.gz - [OK] # KanjiDic2 XML file : ~/local/jamdict/data/kanjidic2.xml.gz - [OK] # JMnedict XML file : ~/local/jamdict/data/JMnedict.xml.gz - [OK] # # Others # ------------------------------------------------------------ # lxml availability: False Build database file from source ------------------------------- Normal users who just want to look up the dictionaries do not have to do this. If you are a developer and want to build jamdict database from source, copy the dictionary source files to jamdict data folder. The original XML files can be downloaded either from the official website https://www.edrdg.org/ or from `this jamdict Google Drive folder `_. To find out where to copy the files or whether they are recognised by jamdict, you may use the command `python3 -m jamdict info` as in the section above. You should make sure that all files under the section `Custom data files` are all marked [OK]. After that you should be able to build the database with the command: .. 
code:: bash python3 -m jamdict import Note on XML parser: jamdict will use `lxml` instead of Python 3 default `xml` when it is available. ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=_build if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% :end popd ================================================ FILE: docs/recipes.rst ================================================ .. _recipes: Common Recipes ============== .. contents:: :local: :depth: 2 .. warning:: 👉 ⚠️ THIS SECTION IS STILL UNDER CONSTRUCTION ⚠️ All code here assumes that you have created a Jamdict object named :samp:`jam`, like this >>> from jamdict import Jamdict >>> jam = Jamdict() High-performance tuning ----------------------- When you need to do a lot of queries on the database, it is possible to load the whole database into memory to boost query performance (this will take about 400 MB of RAM) by using the :class:`memory_mode ` keyword argument, like this: >>> from jamdict import Jamdict >>> jam = Jamdict(memory_mode=True) The first query will be extremely slow (it may take about a minute for the whole database to be loaded into memory) but subsequent queries will be much faster. 
Iteration search ---------------- Sometimes people want to look through a set of search results only once and determine which items to keep and then discard the rest. In these cases :func:`lookup_iter <jamdict.util.Jamdict.lookup_iter>` should be used. This function returns an :class:`IterLookupResult <jamdict.util.IterLookupResult>` object immediately after being called. Users may loop through ``result.entries``, ``result.chars``, and ``result.names`` exactly once for each set to find the items that they want. Users will have to store the desired word entries, characters, and names by themselves since they are discarded after yield. >>> res = jam.lookup_iter("花見") >>> for word in res.entries: ... print(word) # do something with the word >>> for c in res.chars: ... print(c) >>> for name in res.names: ... print(name) Part-of-speeches and named-entity types --------------------------------------- Use :func:`Jamdict.all_pos <jamdict.util.Jamdict.all_pos>` to list all available part-of-speeches and :func:`Jamdict.all_ne_type <jamdict.util.Jamdict.all_ne_type>` to list named-entity types: >>> for pos in jam.all_pos(): ... print(pos) # pos is a string >>> for ne_type in jam.all_ne_type(): ... print(ne_type) # ne_type is a string To filter words by part-of-speech use the keyword argument ``pos`` in :func:`lookup() <jamdict.util.Jamdict.lookup>` or :func:`lookup_iter() <jamdict.util.Jamdict.lookup_iter>` functions. For example to look for all "かえる" that are nouns use: >>> result = jam.lookup("かえる", pos=["noun (common) (futsuumeishi)"]) To search for all named-entities that are "surname" use: >>> result = jam.lookup("surname") Kanjis and radical/components (KRAD/RADK mappings) -------------------------------------------------- Jamdict has built-in support for KRAD/RADK (i.e. kanji-radical and radical-kanji mapping). The terminology of radicals/components used by Jamdict can be different from elsewhere. - A radical in Jamdict is a principal component, each character has only one radical. - A character may be decomposed into several writing components. By default jamdict provides two maps: - jam.krad is a Python dict that maps characters to a list of components. 
- jam.radk is a Python dict that maps each available components to a list of characters. .. code:: python # Find all writing components (often called "radicals") of the character 雲 print(jam.krad['雲']) # ['一', '雨', '二', '厶'] # Find all characters with the component 鼎 chars = jam.radk['鼎'] print(chars) # {'鼏', '鼒', '鼐', '鼎', '鼑'} # look up the characters info result = jam.lookup(''.join(chars)) for c in result.chars: print(c, c.meanings()) # 鼏 ['cover of tripod cauldron'] # 鼒 ['large tripod cauldron with small'] # 鼐 ['incense tripod'] # 鼎 ['three legged kettle'] # 鼑 [] Finding name entities --------------------- .. code:: bash # Find all names that contain the string 鈴木 result = jam.lookup('%鈴木%') for name in result.names: print(name) # [id#5025685] キューティーすずき (キューティー鈴木) : Kyu-ti- Suzuki (1969.10-) (full name of a particular person) # [id#5064867] パパイヤすずき (パパイヤ鈴木) : Papaiya Suzuki (full name of a particular person) # [id#5089076] ラジカルすずき (ラジカル鈴木) : Rajikaru Suzuki (full name of a particular person) # [id#5259356] きつねざきすずきひなた (狐崎鈴木日向) : Kitsunezakisuzukihinata (place name) # [id#5379158] こすずき (小鈴木) : Kosuzuki (family or surname) # [id#5398812] かみすずき (上鈴木) : Kamisuzuki (family or surname) # [id#5465787] かわすずき (川鈴木) : Kawasuzuki (family or surname) # [id#5499409] おおすずき (大鈴木) : Oosuzuki (family or surname) # [id#5711308] すすき (鈴木) : Susuki (family or surname) # ... Exact matching -------------- Use exact matching for faster search .. code:: python # Find an entry (word, name entity) by idseq result = jam.lookup('id#5711308') print(result.names[0]) # [id#5711308] すすき (鈴木) : Susuki (family or surname) result = jam.lookup('id#1467640') print(result.entries[0]) # ねこ (猫) : 1. cat 2. shamisen 3. geisha 4. wheelbarrow 5. clay bed-warmer 6. bottom/submissive partner of a homosexual relationship # use exact matching to increase searching speed (thanks to @reem-codes) result = jam.lookup('猫') for entry in result.entries: print(entry) # [id#1467640] ねこ (猫) : 1. 
cat ((noun (common) (futsuumeishi))) 2. shamisen 3. geisha 4. wheelbarrow 5. clay bed-warmer 6. bottom/submissive partner of a homosexual relationship # [id#2698030] ねこま (猫) : cat ((noun (common) (futsuumeishi))) Low-level data queries ---------------------- It’s possible to access the dictionary data by querying the database directly using lower level APIs. However these are prone to future changes so please keep that in mind. When you create a Jamdict object, you have direct access to the underlying databases, via these properties .. code:: python from jamdict import Jamdict jam = Jamdict() >>> jam.jmdict # jamdict.JMDictSQLite object for accessing word dictionary >>> jam.kd2 # jamdict.KanjiDic2SQLite object, for accessing kanji dictionary >>> jam.jmnedict # jamdict.JMNEDictSQLite object, for accessing named-entities dictionary You can perform database queries on each of these databases by obtaining a database cursor with the ``ctx()`` function (i.e. database query context). For example the following code lists all existing part-of-speeches in the database. .. code:: python # returns a list of sqlite3.Row objects pos_rows = jam.jmdict.ctx().select("SELECT DISTINCT text FROM pos") # access columns in each query row by name all_pos = [x['text'] for x in pos_rows] # sort all POS all_pos.sort() for pos in all_pos: print(pos) For more information, please see `Jamdict database schema `_. Say we want to get all irregular suru verbs, we can start with finding all Sense IDs with pos = ``suru verb - irregular``, and then find all the Entry idseq connected to those Senses. Words (and also named entities) can be retrieved directly using their ``idseq``. Each word may have many Senses (meaning) and each Sense may have different pos. :: # Entry (idseq) --(has many)--> Sense --(has many)--> pos .. 
note:: Tips: Since we hit the database so many times (to find the IDs, to retrieve each word, etc.), we should also consider reusing the database connection via a database context for better performance (``with jam.jmdict.ctx() as ctx:`` and ``ctx=ctx`` in the code below). Here is the sample code: .. code:: python # find all idseq of lexical entry (i.e. words) that have at least 1 sense with pos = suru verb - irregular with jam.jmdict.ctx() as ctx: # query all word's idseqs rows = ctx.select( query="SELECT DISTINCT idseq FROM Sense WHERE ID IN (SELECT sid FROM pos WHERE text = ?) LIMIT 10000", params=("suru verb - irregular",)) for row in rows: # reuse database connection with ctx=ctx for better performance word = jam.jmdict.get_entry(idseq=row['idseq'], ctx=ctx) print(word) ================================================ FILE: docs/requirements.txt ================================================ jamdict Sphinx ================================================ FILE: docs/tutorials.rst ================================================ Tutorials ========= Getting started --------------- Just install the ``jamdict`` and ``jamdict_data`` packages via pip and you are ready to go. .. code:: python from jamdict import Jamdict jam = Jamdict() The most useful function is :func:`jamdict.util.Jamdict.lookup`. For example: .. code:: python # use wildcard matching to find any word, or Kanji character, or name # that starts with 食べ and ends with る result = jam.lookup('食べ%る') To access the result object you may use: .. code:: python # print all word entries for entry in result.entries: print(entry) # [id#1358280] たべる (食べる) : 1. to eat ((Ichidan verb|transitive verb)) 2. to live on (e.g. 
a salary)/to live off/to subsist on # [id#1358300] たべすぎる (食べ過ぎる) : to overeat ((Ichidan verb|transitive verb)) # [id#1852290] たべつける (食べ付ける) : to be used to eating ((Ichidan verb|transitive verb)) # [id#2145280] たべはじめる (食べ始める) : to start eating ((Ichidan verb)) # [id#2449430] たべかける (食べ掛ける) : to start eating ((Ichidan verb)) # [id#2671010] たべなれる (食べ慣れる) : to be used to eating/to become used to eating/to be accustomed to eating/to acquire a taste for ((Ichidan verb)) # [id#2765050] たべられる (食べられる) : 1. to be able to eat ((Ichidan verb|intransitive verb)) 2. to be edible/to be good to eat ((pre-noun adjectival (rentaishi))) # [id#2795790] たべくらべる (食べ比べる) : to taste and compare several dishes (or foods) of the same type ((Ichidan verb|transitive verb)) # [id#2807470] たべあわせる (食べ合わせる) : to eat together (various foods) ((Ichidan verb)) # print all related characters for c in result.chars: print(repr(c)) # 食:9:eat,food # 喰:12:eat,drink,receive (a blow),(kokuji) # 過:12:overdo,exceed,go beyond,error # 付:5:adhere,attach,refer to,append # 始:8:commence,begin # 掛:11:hang,suspend,depend,arrive at,tax,pour # 慣:14:accustomed,get used to,become experienced # 比:4:compare,race,ratio,Philippines # 合:6:fit,suit,join,0.1 ================================================ FILE: docs/updates.rst ================================================ .. 
_updates: Jamdict Changelog ================= jamdict 0.1a11 -------------- - 2021-05-25 - Added ``lookup_iter()`` for iteration search - Added ``pos`` filter for filtering words by part-of-speeches - Added ``all_pos()`` and ``all_ne_type()`` to Jamdict to list part-of-speeches and named-entity types - Better version checking in ``__version__.py`` - Improved documentation - 2021-05-29 - (.post1) Sorted kanji readings to have on & kun readings listed first - (.post1) Add ``on_readings``, ``kun_readings``, and ``other_readings`` filter to ``kanjidic2.RMGroup`` jamdict 0.1a10 -------------- - 2021-05-19 - Added ``memory_mode`` keyword to load database into memory before querying to boost up performance - Improved import performance by using puchikarui's ``buckmode`` - Tested with both puchikarui 0.1.* and 0.2.* jamdict 0.1a9 ------------- - 2021-04-19 - Fix data audit query - Enhanced ``Jamdict()`` constructor. ``Jamdict('/path/to/jamdict.db')`` works properly. - Code quality review - Automated documentation build via `readthedocs.org `__ jamdict 0.1a8 ------------- - 2021-04-15 - Make ``lxml`` optional - Data package can be installed via PyPI with ``jamdict_data`` package - Make configuration file optional as data files can be installed via PyPI. jamdict 0.1a7 ------------- - 2020-05-31 - Added Japanese Proper Names Dictionary (JMnedict) support - Included built-in KRADFILE/RADKFile support - Improved command line tools (json, compact mode, etc.) Older versions -------------- - 2017-08-18 - Support KanjiDic2 (XML/SQLite formats) - 2016-11-09 - Release first version to Github ================================================ FILE: jamdict/__init__.py ================================================ # -*- coding: utf-8 -*- ''' Python library for manipulating Jim Breen's JMdict and KanjiDic2 Latest version can be found at https://github.com/neocl/jamdict This package uses the [EDICT][1] and [KANJIDIC][2] dictionary files. 
These files are the property of the [Electronic Dictionary Research and Development Group][3], and are used in conformance with the Group's [licence][4]. [1]: http://www.csse.monash.edu.au/~jwb/edict.html [2]: http://www.csse.monash.edu.au/~jwb/kanjidic.html [3]: http://www.edrdg.org/ [4]: http://www.edrdg.org/edrdg/licence.html References: JMDict website: http://www.csse.monash.edu.au/~jwb/edict.html @author: Le Tuan Anh @license: MIT ''' # Copyright (c) 2016, Le Tuan Anh # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ######################################################################## from . 
# -*- coding: utf-8 -*-

# Package metadata and version information for jamdict
__author__ = "Le Tuan Anh"
__maintainer__ = "Le Tuan Anh"
__email__ = "tuananh.ke@gmail.com"
__copyright__ = "Copyright (c) 2016, Le Tuan Anh"
__license__ = "MIT License"
__credits__ = []
__url__ = "https://github.com/neocl/jamdict"
__description__ = "Python library for using Japanese dictionaries and resources (Jim Breen's JMdict, KanjiDic2, KRADFILE, JMnedict)"

# ------------------------------------------------------------------------------
# Version configuration (enforcing PEP 440)
# ------------------------------------------------------------------------------
# __version_tuple__ is (major, minor, micro[, build[, post]])
__status__ = "3 - Alpha"
__version_tuple__ = (0, 1, 0, 11, 2)
__version_status__ = ''  # a specific value ('rc', 'dev', etc.) or leave blank to be auto-filled
# ------------------------------------------------------------------------------

# Release-status marker derived from the development status when none is given explicitly
__status_map__ = {
    '3 - Alpha': 'a',
    '4 - Beta': 'b',
    '5 - Production/Stable': '',
    '6 - Mature': '',
}
__version_status__ = __version_status__ or __status_map__[__status__]

# Only 3, 4 or 5 components are accepted
if len(__version_tuple__) not in (3, 4, 5):
    raise ValueError("Invalid version information")

# 4th component -> build number, 5th component -> ".postN" suffix
__version_build__ = ''.join(
    template.format(number)
    for template, number in zip(("{}", ".post{}"), __version_tuple__[3:])
)

# A zero micro version is omitted from the main version string
__version_main__ = '.'.join(
    map(str, __version_tuple__[:2 if __version_tuple__[2] == 0 else 3])
)

__version__ = __version_main__ + __version_status__ + __version_build__
__version_long__ = "{} - {} {}".format(
    __version_main__, __status__.split('-')[1].strip(), __version_build__
)
def _ensure_config(config_path='~/.jamdict/config.json', mkdir=True):
    ''' Create a default configuration file at ``config_path`` when none exists.

    The content of the new file is copied from the bundled template
    (``CONFIG_TEMPLATE``). An existing file is left untouched.

    :param config_path: Location of the configuration file (``~`` is expanded)
    :param mkdir: Create the folder that holds the configuration file if needed
    '''
    _path = Path(os.path.expanduser(config_path))
    # auto create config dir
    if mkdir:
        # parents=True: nested locations such as ~/.config/jamdict/config.json
        # must work even when an intermediate folder does not exist yet
        _path.parent.mkdir(parents=True, exist_ok=True)
    if not _path.exists():
        _logger = logging.getLogger(__name__)
        default_config = read_file(CONFIG_TEMPLATE)
        _logger.warning(f"Jamdict configuration file could not be found. A new configuration file will be generated at {_path}")
        _logger.debug(f"Default config: {default_config}")
        write_file(_path, default_config)
def home_dir():
    ''' Locate the JAMDICT_HOME folder.

    When the environment variable JAMDICT_HOME points to an existing directory
    (e.g. export JAMDICT_HOME=/home/user/jamdict) that directory wins over the
    value configured in the jamdict JSON config file.
    '''
    settings = read_config()
    # [2020-06-01] Allow JAMDICT_HOME to be overridden by environment variables
    env_value = os.environ.get('JAMDICT_HOME')
    if env_value is not None:
        candidate = os.path.abspath(os.path.expanduser(env_value))
        if os.path.isdir(candidate):
            logging.getLogger(__name__).debug("JAMDICT_HOME: {}".format(candidate))
            return candidate
    # fall back to the configured value (or the module default)
    return settings.get('JAMDICT_HOME', __jamdict_home)
"{JAMDICT_DATA}/kanjidic2.xml.gz", "KRADFILE": "{JAMDICT_DATA}/kradfile-u.gz" } ================================================ FILE: jamdict/data/setup_jmdict.sql ================================================ /* Add meta info */ CREATE TABLE IF NOT EXISTS meta ( key TEXT PRIMARY KEY NOT NULL, value TEXT NOT NULL ); ------------------------------------------------------------------------------------- -- JMDict ------------------------------------------------------------------------------------- CREATE TABLE Entry ( idseq INTEGER NOT NULL UNIQUE ); -- Entry's links (EntryInfo) CREATE TABLE Link ( ID INTEGER PRIMARY KEY ,idseq INTEGER ,tag TEXT ,desc TEXT ,uri TEXT ,FOREIGN KEY (idseq) REFERENCES Entry(idseq) ); -- Entry's bibinfo (EntryInfo) CREATE TABLE Bib ( ID INTEGER PRIMARY KEY ,idseq INTEGER ,tag TEXT ,text TEXT ,FOREIGN KEY (idseq) REFERENCES Entry(idseq) ); -- Entry's etym (EntryInfo) CREATE TABLE Etym ( idseq INTEGER ,text TEXT ,FOREIGN KEY (idseq) REFERENCES Entry(idseq) ); -- Entry's audit (EntryInfo) CREATE TABLE Audit ( idseq INTEGER ,upd_date TEXT ,upd_detl TEXT ,FOREIGN KEY (idseq) REFERENCES Entry(idseq) ); ------------------------------------------------------------------------------------- -- Kanji reading(s) of an entry ------------------------------------------------------------------------------------- CREATE TABLE Kanji ( ID INTEGER PRIMARY KEY ,idseq INTEGER ,text TEXT ,FOREIGN KEY (idseq) REFERENCES Entry(idseq) ); -- Kanji's info CREATE TABLE KJI ( kid INTEGER ,text TEXT ,FOREIGN KEY (kid) REFERENCES Kanji(id) ); -- Kanji priority CREATE TABLE KJP ( kid INTEGER ,text TEXT ,FOREIGN KEY (kid) REFERENCES Kanji(id) ); ------------------------------------------------------------------------------------- -- Kana reading(s) of an entry ------------------------------------------------------------------------------------- CREATE TABLE Kana ( ID INTEGER PRIMARY KEY ,idseq INTEGER ,text TEXT ,nokanji BOOLEAN ,FOREIGN KEY (idseq) REFERENCES 
-- Index Bib rows by the entry they belong to.
-- The index must target the Bib table: Link(idseq) is already covered by Link_idseq.
CREATE INDEX Bib_idseq ON Bib(idseq);
INDEX Audit_idseq ON Audit(idseq); CREATE INDEX Kanji_idseq ON Kanji(idseq); CREATE INDEX Kanji_text ON Kanji(text); CREATE INDEX KJI_kid ON KJI(kid); CREATE INDEX KJP_kid ON KJP(kid); CREATE INDEX Kana_idseq ON Kana(idseq); CREATE INDEX Kana_text ON Kana(text); CREATE INDEX KNR_kid ON KNR(kid); CREATE INDEX KNR_text ON KNR(text); CREATE INDEX KNI_kid ON KNI(kid); CREATE INDEX KNI_text ON KNI(text); CREATE INDEX KNP_kid ON KNP(kid); CREATE INDEX KNP_text ON KNP(text); CREATE INDEX Sense_idseq ON Sense(idseq); CREATE INDEX stagk_sid ON stagk(sid); CREATE INDEX stagk_text ON stagk(text); CREATE INDEX stagr_sid ON stagr(sid); CREATE INDEX stagr_text ON stagr(text); CREATE INDEX pos_sid ON pos(sid); CREATE INDEX pos_text ON pos(text); CREATE INDEX xref_sid ON xref(sid); CREATE INDEX xref_text ON xref(text); CREATE INDEX antonym_sid ON antonym(sid); CREATE INDEX antonym_text ON antonym(text); CREATE INDEX field_sid ON field(sid); CREATE INDEX field_text ON field(text); CREATE INDEX misc_sid ON misc(sid); CREATE INDEX misc_text ON misc(text); CREATE INDEX SenseInfo_sid ON SenseInfo(sid); CREATE INDEX SenseInfo_text ON SenseInfo(text); CREATE INDEX SenseSource_sid ON SenseSource(sid); CREATE INDEX SenseSource_text ON SenseSource(text); CREATE INDEX dialect_sid ON dialect(sid); CREATE INDEX dialect_text ON dialect(text); CREATE INDEX SenseGloss_sid ON SenseGloss(sid); CREATE INDEX SenseGloss_lang ON SenseGloss(lang); CREATE INDEX SenseGloss_gend ON SenseGloss(gend); CREATE INDEX SenseGloss_text ON SenseGloss(text); ================================================ FILE: jamdict/data/setup_jmnedict.sql ================================================ /* Add meta info */ CREATE TABLE IF NOT EXISTS meta ( key TEXT PRIMARY KEY NOT NULL, value TEXT NOT NULL ); ------------------------------------------------------------------------------------- -- JMDict ------------------------------------------------------------------------------------- CREATE TABLE NEEntry ( idseq INTEGER NOT 
-------------------------------------------------------------------------------------
-- Kanji reading(s) of a named-entity entry
-- (idseq refers to NEEntry, the named-entity entry table declared in this script)
-------------------------------------------------------------------------------------
CREATE TABLE NEKanji (
       ID INTEGER PRIMARY KEY
       ,idseq INTEGER
       ,text TEXT
       ,FOREIGN KEY (idseq) REFERENCES NEEntry(idseq)
);

-------------------------------------------------------------------------------------
-- Kana reading(s) of a named-entity entry
-------------------------------------------------------------------------------------
CREATE TABLE NEKana (
       ID INTEGER PRIMARY KEY
       ,idseq INTEGER
       ,text TEXT
       ,nokanji BOOLEAN
       ,FOREIGN KEY (idseq) REFERENCES NEEntry(idseq)
);

-------------------------------------------------------------------------------------
-- Translations (senses) of a named-entity entry
-------------------------------------------------------------------------------------
CREATE TABLE NETranslation (
       ID INTEGER PRIMARY KEY
       ,idseq INTEGER
       ,FOREIGN KEY (idseq) REFERENCES NEEntry(idseq)
);
-- Dictionary references of a character.
-- The value column is declared as plain TEXT (no stray token after the type name).
CREATE TABLE dic_ref (
       cid INTEGER
       ,dr_type TEXT
       ,value TEXT
       ,m_vol TEXT
       ,m_page TEXT
       ,FOREIGN KEY (cid) REFERENCES character(ID)
);
class JMDEntry(object):
    ''' Represents a dictionary Word entry.

    Entries consist of kanji elements, reading elements,
    general information and sense elements. Each entry must have at
    least one reading element and one sense element. Others are optional.

    XML DTD: entry (ent_seq, k_ele*, r_ele+, info?, sense+)

    ``len(entry)`` is the number of senses and ``entry[i]`` is the i-th sense.
    '''

    def __init__(self, idseq=''):
        # A unique numeric sequence number for each entry
        self.idseq = idseq   # ent_seq
        self.kanji_forms: List[KanjiForm] = []  # k_ele*
        self.kana_forms: List[KanaForm] = []  # r_ele+ => KanaForm[]
        self.info: EntryInfo = None  # info? => EntryInfo (None until set_info() is called)
        self.senses: List[Sense] = []  # sense+

    def __len__(self):
        ''' Number of senses of this entry '''
        return len(self.senses)

    def __getitem__(self, idx):
        ''' Get a sense by its position '''
        return self.senses[idx]

    def set_info(self, info):
        ''' Attach the entry info object; warn when one has already been set '''
        if self.info:
            # use the module's named logger: the module-level logging.warning()
            # would implicitly configure the root logger of the host application
            logging.getLogger(__name__).warning("WARNING: multiple info tag")
        self.info = info

    def text(self, compact=True, separator=' ', no_id=False):
        ''' Format this entry as a single string.

        The output has the form ``[id#idseq] kana (kanji) : senses`` in which only
        the first kana form and the first kanji form are shown.
        Multiple senses are numbered from 1; a single sense is not numbered.

        :param compact: When True the ``[id#...]`` prefix is omitted; the flag is
                        also passed on to each sense's ``text()``
        :param separator: String used to join the parts
        :param no_id: Omit the ``[id#...]`` prefix even when compact is False
        '''
        parts = []
        if not compact and not no_id:
            parts.append('[id#%s]' % self.idseq)
        if self.kana_forms:
            parts.append(self.kana_forms[0].text)
        if self.kanji_forms:
            parts.append("({})".format(self.kanji_forms[0].text))
        if self.senses:
            parts.append(':')
            if len(self.senses) == 1:
                parts.append(self.senses[0].text(compact=compact))
            else:
                parts.extend('{i}. {s}'.format(i=number, s=sense.text(compact=compact))
                             for number, sense in enumerate(self.senses, start=1))
        return separator.join(parts)

    def __repr__(self):
        return self.text(compact=True)

    def __str__(self):
        return self.text(compact=False)

    def to_json(self):
        ''' Deprecated alias of to_dict() '''
        warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.",
                      DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        ''' Convert this entry into a dict; the key ``info`` is only present when info is set '''
        ed = {'idseq': self.idseq,
              'kanji': [x.to_dict() for x in self.kanji_forms],
              'kana': [x.to_dict() for x in self.kana_forms],
              'senses': [x.to_dict() for x in self.senses]}
        if self.info:
            ed['info'] = self.info.to_dict()
        return ed
Where there are multiple kanji elements within an entry, they will be orthographical variants of the same word, either using variations in okurigana, or alternative and equivalent kanji. Common "mis-spellings" may be included, provided they are associated with appropriate information fields. Synonyms are not included; they may be indicated in the cross-reference field associated with the sense element. DTD text --- a kanji written form of an entry, string info --- coded information field, a list of strings pri --- relative priority of the entry, a list of strings ''' def __init__(self, text=''): '''This element will contain a word or short phrase in Japanese which is written using at least one non-kana character (usually kanji, but can be other characters). The valid characters are kanji, kana, related characters such as chouon and kurikaeshi, and in exceptional cases, letters from other alphabets. ''' self.text = text # '''This is a coded information field related specifically to the orthography of the keb, and will typically indicate some unusual aspect, such as okurigana irregularity.''' self.info = [] # * '''This and the equivalent re_pri field are provided to record information about the relative priority of the entry, and consist of codes indicating the word appears in various references which can be taken as an indication of the frequency with which the word is used. This field is intended for use either by applications which want to concentrate on entries of a particular priority, or to generate subset files. The current values in this field are: - news1/2: appears in the "wordfreq" file compiled by Alexandre Girardi from the Mainichi Shimbun. (See the Monash ftp archive for a copy.) Words in the first 12,000 in that file are marked "news1" and words in the second 12,000 are marked "news2". - ichi1/2: appears in the "Ichimango goi bunruishuu", Senmon Kyouiku Publishing, Tokyo, 1998. 
class KanaForm(object):
    ''' The reading element typically contains the valid readings
    of the word(s) in the kanji element using modern kanadzukai.
    Where there are multiple reading elements, they will typically be
    alternative readings of the kanji element. In the absence of a
    kanji element, i.e. in the case of a word or phrase written
    entirely in kana, these elements will define the entry.

    text    --- a kana written form of an entry, string
    nokanji --- True means this entry cannot be regarded as a true reading of the kanji, boolean
    restr   --- use to restrict the reading to a subset of the available kanji forms, list of string
    info    --- coded information field, a list of strings
    pri     --- relative priority of the entry, a list of strings
    '''

    def __init__(self, text='', nokanji=False):
        ''' Create a reading (kana) form.

        :param text: The reading, written in kana
        :param nokanji: True when the reading is not a true reading of the kanji
        '''
        # reb: this element content is restricted to kana and related
        # characters such as chouon and kurikaeshi. Kana usage will be
        # consistent between the keb and reb elements; e.g. if the keb
        # contains katakana, so too will the reb.
        self.text = text
        # This element, which will usually have a null value, indicates
        # that the reb, while associated with the keb, cannot be regarded
        # as a true reading of the kanji. It is typically used for words
        # such as foreign place names, gairaigo which can be in kanji or
        # katakana, etc.
        self.nokanji = nokanji
        # This element is used to indicate when the reading only applies
        # to a subset of the keb elements in the entry. In its absence, all
        # readings apply to all kanji elements. The contents of this element
        # must exactly match those of one of the keb elements.
        self.restr = []
        # General coded information pertaining to the specific reading.
        # Typically it will be used to indicate some unusual aspect of
        # the reading.
        self.info = []
        # Relative priority codes; same meaning as the priority codes of a kanji form.
        self.pri = []

    def set_text(self, text):
        ''' Set the reading text; warn when a text has already been set '''
        if self.text:
            # a KanaForm is a reading element (r_ele), not a kanji element (k_ele)
            logging.getLogger(__name__).warning("WARNING: duplicated text for r_ele")
        self.text = text

    def to_json(self):
        ''' Deprecated alias of to_dict() '''
        warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.",
                      DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        ''' Convert this form into a dict; empty restr/info/pri lists are left out '''
        knd = {'text': self.text,
               'nokanji': self.nokanji}
        if self.restr:
            knd['restr'] = self.restr
        if self.info:
            knd['info'] = self.info
        if self.pri:
            knd['pri'] = self.pri
        return knd

    def __repr__(self):
        return str(self)

    def __str__(self):
        return self.text
The bib_txt field may be used for brief (local) descriptions. ''' def __init__(self, tag='', text=''): self.tag: str = tag self.text: str = text def set_tag(self, tag): if self.tag: logging.warning("WARNING: duplicate tag in bibinfo") self.tag = tag def set_text(self, text): if self.text: logging.warning("WARNING: duplicate text in bibinfo") self.text = text def to_json(self): warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", DeprecationWarning, stacklevel=2) return self.to_dict() def to_dict(self): return {'tag': self.tag, 'text': self.text} class Audit(object): '''The audit element will contain the date and other information about updates to the entry. Can be used to record the source of the material. ''' def __init__(self, upd_date, upd_detl): self.upd_date = upd_date # self.upd_detl = upd_detl # def to_json(self): warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", DeprecationWarning, stacklevel=2) return self.to_dict() def to_dict(self): return {'upd_date': self.upd_date, 'upd_detl': self.upd_detl} class Sense(object): '''The sense element will record the translational equivalent of the Japanese word, plus other related information. Where there are several distinctly different meanings of the word, multiple sense elements will be employed. ''' def __init__(self): '''These elements, if present, indicate that the sense is restricted to the lexeme represented by the keb and/or reb.''' self.stagk = [] # self.stagr = [] # '''Part-of-speech information about the entry/sense. Should use appropriate entity codes. In general where there are multiple senses in an entry, the part-of-speech of an earlier sense will apply to later senses unless there is a new part-of-speech indicated.''' self.pos = [] # '''This element is used to indicate a cross-reference to another entry with a similar or related meaning or sense. 
The content of this element is typically a keb or reb element in another entry. In some cases a keb will be followed by a reb and/or a sense number to provide a precise target for the cross-reference. Where this happens, a JIS "centre-dot" (0x2126) is placed between the components of the cross-reference. ''' self.xref = [] # xref '''This element is used to indicate another entry which is an antonym of the current entry/sense. The content of this element must exactly match that of a keb or reb element in another entry.''' self.antonym = [] # '''Information about the field of application of the entry/sense. When absent, general application is implied. Entity coding for specific fields of application.''' self.field = [] # '''This element is used for other relevant information about the entry/sense. As with part-of-speech, information will usually apply to several senses.''' self.misc = [] # '''The sense-information elements provided for additional information to be recorded about a sense. Typical usage would be to indicate such things as level of currency of a sense, the regional variations, etc.''' self.info = [] # self.lsource: List[LSource] = [] # '''For words specifically associated with regional dialects in Japanese, the entity code for that dialect, e.g. ksb for Kansaiben.''' self.dialect = [] # self.gloss: List[SenseGloss] = [] # '''The example elements provide for pairs of short Japanese and target-language phrases or sentences which exemplify the usage of the Japanese head-word and the target-language gloss. Words in example fields would typically not be indexed by a dictionary application.''' # It seems that this field is not used anymore! 
self.examples = [] # def __repr__(self): return str(self) def __str__(self): return self.text(compact=False) def text(self, compact=True): tmp = [str(x) for x in self.gloss] if not compact and self.pos: return '{gloss} ({pos})'.format(gloss='/'.join(tmp), pos=('(%s)' % '|'.join(self.pos))) else: return '/'.join(tmp) def to_json(self): warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", DeprecationWarning, stacklevel=2) return self.to_dict() def to_dict(self): sd = {} if self.stagk: sd['stagk'] = self.stagk if self.stagr: sd['stagr'] = self.stagr if self.pos: sd['pos'] = self.pos if self.xref: sd['xref'] = self.xref if self.antonym: sd['antonym'] = self.antonym if self.field: sd['field'] = self.field if self.misc: sd['misc'] = self.misc if self.info: sd['SenseInfo'] = self.info if self.lsource: sd['SenseSource'] = [x.to_dict() for x in self.lsource] if self.dialect: sd['dialect'] = self.dialect if self.gloss: sd['SenseGloss'] = [x.to_dict() for x in self.gloss] return sd class Translation(Sense): ''' The trans element will record the translational equivalent of the Japanese name, plus other related information. (JMendict) ''' def __init__(self): super().__init__() self.name_type = [] # mapped to name_type* self.xref = [] # mapped to xref self.gloss = [] # mapped to trans_det def name_type_human(self): return [JMENDICT_TYPE_MAP[x] if x in JMENDICT_TYPE_MAP else x for x in self.name_type] def text(self, compact=True): tmp = [str(x) for x in self.gloss] types = "/".join(self.name_type) if compact else "/".join(self.name_type_human()) return '{gloss} ({types})'.format(gloss='/'.join(tmp), types=types) def to_json(self): warnings.warn("to_json() is deprecated and will be removed in the next major release. 
Use to_dict() instead.", DeprecationWarning, stacklevel=2) return self.to_dict() def to_dict(self): sd = super().to_dict() sd['name_type'] = self.name_type return sd class SenseGloss(object): '''Within each sense will be one or more "glosses", i.e. target-language words or phrases which are equivalents to the Japanese word. This element would normally be present, however it may be omitted in entries which are purely for a cross-reference. DTD: The xml:lang attribute defines the target language of the gloss. It will be coded using the three-letter language code from the ISO 639 standard. When absent, the value "eng" (i.e. English) is the default value. The g_gend attribute defines the gender of the gloss (typically a noun in the target language. When absent, the gender is either not relevant or has yet to be provided. These elements highlight particular target-language words which are strongly associated with the Japanese word. The purpose is to establish a set of target-language words which can effectively be used as head-words in a reverse target-language/Japanese relationship.''' def __init__(self, lang, gend, text): self.lang = lang self.gend = gend self.text = text def __repr__(self): return str(self) def __str__(self): tmp = [self.text] if self.lang and self.lang != 'eng': # lang = eng is trivial tmp.append('(lang:%s)' % self.lang) if self.gend: tmp.append('(gend:%s)' % self.gend) return ' '.join(tmp) def to_json(self): warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", DeprecationWarning, stacklevel=2) return self.to_dict() def to_dict(self): gd = {} if self.lang: gd['lang'] = self.lang if self.gend: gd['gend'] = self.gend if self.text: gd['text'] = self.text return gd class LSource: '''This element records the information about the source language(s) of a loan-word/gairaigo. If the source language is other than English, the language is indicated by the xml:lang attribute. 
The element value (if any) is the source word or phrase. The xml:lang attribute defines the language(s) from which a loanword is drawn. It will be coded using the three-letter language code from the ISO 639-2 standard. When absent, the value "eng" (i.e. English) is the default value. The bibliographic (B) codes are used. The ls_type attribute indicates whether the lsource element fully or partially describes the source word or phrase of the loanword. If absent, it will have the implied value of "full". Otherwise it will contain "part". The ls_wasei attribute indicates that the Japanese word has been constructed from words in the source language, and not from an actual phrase in that language. Most commonly used to indicate "waseieigo".''' def __init__(self, lang, lstype, wasei, text): self.lang = lang self.lstype = lstype self.wasei = wasei self.text = text def to_json(self): warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.", DeprecationWarning, stacklevel=2) return self.to_dict() def to_dict(self): return {'lang': self.lang, 'lstype': self.lstype, 'wasei': self.wasei, 'text': self.text} JMENDICT_TYPES = (("surname", "family or surname"), ("place", "place name"), ("unclass", "unclassified name"), ("company", "company name"), ("product", "product name"), ("work", "work of art, literature, music, etc. 
name"), ("masc", "male given name or forename"), ("fem", "female given name or forename"), ("person", "full name of a particular person"), ("given", "given name or forename, gender not specified"), ("station", "railway station"), ("organization", "organization name"), ("ok", "old or irregular kana form")) JMENDICT_TYPE_MAP = dict(JMENDICT_TYPES) JMENDICT_TYPE_MAP_DECODE = {v: k for k, v in JMENDICT_TYPES} class Meta(object): def __init__(self, key='', value=''): self.key = key self.value = value def __repr__(self): return "{{{}: {}}}".format(self.key, self.value) def __str__(self): return repr(self) class JMDictXMLParser(object): '''JMDict XML parser ''' def __init__(self): pass def parse_file(self, jmdict_file_path): ''' Parse JMDict_e.xml file and return a list of JMDEntry objects ''' actual_path = os.path.abspath(os.path.expanduser(jmdict_file_path)) logger.debug('Loading data from file: {}'.format(actual_path)) with chio.open(actual_path, mode='rb') as jmfile: tree = etree.iterparse(jmfile) entries = [] for event, element in tree: if event == 'end' and element.tag == 'entry': entries.append(self.parse_entry_tag(element)) # and then we can clear the element to save memory element.clear() return entries def parse_entry_tag(self, etag): '''Parse a lxml XML Node and generate a JMDEntry entry''' entry = JMDEntry() # parse ent_seq for child in etag: if child.tag == 'ent_seq': self.parse_ent_seq(child, entry) elif child.tag == 'k_ele': self.parse_k_ele(child, entry) elif child.tag == 'r_ele': self.parse_r_ele(child, entry) elif child.tag == 'info': self.parse_info(child, entry) elif child.tag == 'sense': self.parse_sense(child, entry) elif child.tag == 'trans': # JMendict support self.parse_ne_translation(child, entry) else: raise Exception("Invalid tag: %s" % child.tag) return entry def parse_ent_seq(self, seq_tag, entry): idseq = seq_tag.text if entry.idseq: raise Exception("WARNING: duplicated ent_seq tag") entry.idseq = idseq def get_single(self, tag_name, a_tag): 
children = a_tag.findall(tag_name) if len(children) == 0: return None elif len(children) > 1: raise Exception("There are %s %s tags in %s" % (len(children), tag_name, a_tag.tag)) else: return children[0] def parse_k_ele(self, k_ele, entry): kr = KanjiForm() for child in k_ele: if child.tag == 'keb': kr.set_text(child.text) elif child.tag == 'ke_inf': kr.info.append(child.text) elif child.tag == 'ke_pri': kr.pri.append(child.text) else: raise Exception("WARNING: invalid tag %s in k_ele" % child.tag) # parse kebs entry.kanji_forms.append(kr) return kr def parse_r_ele(self, r_ele, entry): kr = KanaForm() for child in r_ele: if child.tag == 'reb': kr.set_text(child.text) elif child.tag == 're_nokanji': kr.nokanji = True elif child.tag == 're_restr': kr.restr.append(child.text) elif child.tag == 're_inf': kr.info.append(child.text) elif child.tag == 're_pri': kr.pri.append(child.text) else: raise Exception("WARNING: invalid tag %s in r_ele" % child.tag) # parse kebs entry.kana_forms.append(kr) return kr def parse_info(self, info_tag, entry): einfo = EntryInfo() for child in info_tag: if child.tag == 'links': self.parse_link(child, einfo) elif child.tag == 'bibl': self.parse_bibinfo(child, einfo) elif child.tag == 'etym': einfo.etym.append(child.text) elif child.tag == 'audit': self.parse_audit(child, einfo) else: raise Exception("WARNING: invalid tag in info tag (child.tag = %s)" % child.tag) entry.set_info(einfo) return einfo def parse_link(self, link_tag, entry_info): tag = self.get_single('link_tag', link_tag).text desc = self.get_single('link_desc', link_tag).text uri = self.get_single('link_uri', link_tag).text link = Link(tag, desc, uri) entry_info.links.append(link) return link def parse_bibinfo(self, bib_tag, entry_info): bib = BibInfo() for child in bib_tag: if child.tag == 'bib_tag': bib.set_tag(child.text) elif child.tag == 'bib_txt': bib.set_text(child.text) else: raise Exception("WARNING: invalid tag in bibinfo (child.tag = %s)" % child.tag) 
entry_info.bibinfo.append(bib) return bib def parse_ne_translation(self, trans_tag, entry): translation = Translation() for child in trans_tag: if child.tag == 'name_type': _name_type = JMENDICT_TYPE_MAP_DECODE[child.text] if child.text in JMENDICT_TYPE_MAP_DECODE else child.text translation.name_type.append(_name_type) elif child.tag == 'trans_det': # add sensegloss lang = self.get_attrib(trans_tag, 'xml:lang', default_value='eng') gloss = SenseGloss(lang=lang, gend='', text=child.text) translation.gloss.append(gloss) elif child.tag == 'xref': translation.xref.append(child.text) else: raise Exception("Invalid tag: {} in JMendict/trans tag".format(child.tag)) entry.senses.append(translation) return translation def parse_sense(self, sense_tag, entry): sense = Sense() for child in sense_tag: if child.tag == 'stagk': sense.stagk.append(child.text) elif child.tag == 'stagr': sense.stagr.append(child.text) elif child.tag == 'pos': sense.pos.append(child.text) elif child.tag == 'xref': sense.xref.append(child.text) elif child.tag == 'ant': sense.antonym.append(child.text) elif child.tag == 'field': sense.field.append(child.text) elif child.tag == 'misc': sense.misc.append(child.text) elif child.tag == 's_inf': sense.info.append(child.text) elif child.tag == 'dial': sense.dialect.append(child.text) elif child.tag == 'example': sense.examples.append(child.text) elif child.tag == 'lsource': self.parse_lsource(child, sense) elif child.tag == 'gloss': self.parse_sensegloss(child, sense) else: raise Exception("WARNING: invalid tag in sense tag (child.tag = %s) content = %s" % (child.tag, etree.tostring(child))) entry.senses.append(sense) return sense def get_attrib(self, a_tag, attr_name, default_value=''): if attr_name == 'xml:lang': attr_name = '''{http://www.w3.org/XML/1998/namespace}lang''' if attr_name in a_tag.attrib: return a_tag.attrib[attr_name] else: return default_value def parse_sensegloss(self, gloss_tag, sense): lang = self.get_attrib(gloss_tag, 'xml:lang') gend 
= self.get_attrib(gloss_tag, 'g_gend') text = gloss_tag.text # TODO: pri tag? raw text? gloss = SenseGloss(lang, gend, text) sense.gloss.append(gloss) return gloss def parse_lsource(self, lsource_tag, sense): lang = self.get_attrib(lsource_tag, 'xml:lang') lstype = self.get_attrib(lsource_tag, 'ls_type') wasei = self.get_attrib(lsource_tag, 'ls_wasei') lsource = LSource(lang, lstype, wasei, lsource_tag.text) sense.lsource.append(lsource) return lsource ================================================ FILE: jamdict/jmdict_sqlite.py ================================================ # -*- coding: utf-8 -*- """ JMDict in SQLite format """ # This code is a part of jamdict library: https://github.com/neocl/jamdict # :copyright: (c) 2016 Le Tuan Anh # :license: MIT, see LICENSE for more details. import os import logging from puchikarui import Schema from . import __version__ as JAMDICT_VERSION, __url__ as JAMDICT_URL from .jmdict import Meta, JMDEntry, EntryInfo, Link, BibInfo, Audit, KanjiForm, KanaForm, Sense, SenseGloss, LSource # ------------------------------------------------------------------------------- # Configuration # ------------------------------------------------------------------------------- MY_FOLDER = os.path.dirname(os.path.abspath(__file__)) SCRIPT_FOLDER = os.path.join(MY_FOLDER, 'data') JMDICT_SETUP_FILE = os.path.join(SCRIPT_FOLDER, 'setup_jmdict.sql') JMDICT_VERSION = '1.08' JMDICT_URL = 'http://www.csse.monash.edu.au/~jwb/edict.html' SETUP_SCRIPT = '''INSERT INTO meta VALUES ('jmdict.version', '{jv}'); INSERT INTO meta VALUES ('jmdict.url', '{ju}'); INSERT INTO meta VALUES ('generator', 'jamdict'); INSERT INTO meta VALUES ('generator_version', '{gv}'); INSERT INTO meta VALUES ('generator_url', '{gu}');'''.format( jv=JMDICT_VERSION, ju=JMDICT_URL, gv=JAMDICT_VERSION, gu=JAMDICT_URL ) def getLogger(): return logging.getLogger(__name__) # ------------------------------------------------------------------------------- # Models # 
class JMDictSchema(Schema):
    '''Database schema for JMdict: setup scripts plus table declarations.'''

    KEY_JMD_VER = "jmdict.version"
    KEY_JMD_URL = "jmdict.url"

    def __init__(self, db_path, *args, **kwargs):
        super().__init__(db_path, *args, **kwargs)
        self.add_script(SETUP_SCRIPT)
        self.add_file(JMDICT_SETUP_FILE)
        # Meta
        self.add_table('meta', ['key', 'value'], proto=Meta).set_id('key')
        self.add_table('Entry', ['idseq'])
        self.add_table('Link', ['ID', 'idseq', 'tag', 'desc', 'uri'])
        self.add_table('Bib', ['ID', 'idseq', 'tag', 'text'])
        self.add_table('Etym', ['idseq', 'text'])
        self.add_table('Audit', ['idseq', 'upd_date', 'upd_detl'])
        # Kanji
        self.add_table('Kanji', ['ID', 'idseq', 'text'])
        self.add_table('KJI', ['kid', 'text'])
        self.add_table('KJP', ['kid', 'text'])
        # Kana
        self.add_table('Kana', ['ID', 'idseq', 'text', 'nokanji'])
        self.add_table('KNI', ['kid', 'text'])
        self.add_table('KNP', ['kid', 'text'])
        self.add_table('KNR', ['kid', 'text'])
        # Senses
        self.add_table('Sense', ['ID', 'idseq'])
        self.add_table('stagk', ['sid', 'text'])
        self.add_table('stagr', ['sid', 'text'])
        self.add_table('pos', ['sid', 'text'])
        self.add_table('xref', ['sid', 'text'])
        self.add_table('antonym', ['sid', 'text'])
        self.add_table('field', ['sid', 'text'])
        self.add_table('misc', ['sid', 'text'])
        self.add_table('SenseInfo', ['sid', 'text'])
        self.add_table('SenseSource', ['sid', 'text', 'lang', 'lstype', 'wasei'])
        self.add_table('dialect', ['sid', 'text'])
        self.add_table('SenseGloss', ['sid', 'lang', 'gend', 'text'])


class JMDictSQLite(JMDictSchema):
    '''Read/write access to JMdict entries stored in SQLite.'''

    def __init__(self, db_path, *args, **kwargs):
        super().__init__(db_path, *args, **kwargs)

    def update_jmd_meta(self, version, url, ctx=None):
        '''Insert or update the ``jmdict.version`` and ``jmdict.url`` meta rows.'''
        # create a default context if none was provided
        if ctx is None:
            # ctx is None on this path; it is forwarded to open() as-is
            with self.open(ctx) as new_ctx:
                return self.update_jmd_meta(version, url, ctx=new_ctx)
        # else (a context is provided)
        # version
        jv = ctx.meta.by_id(self.KEY_JMD_VER)
        if not jv:
            ctx.meta.insert(self.KEY_JMD_VER, version)
        else:
            jv.value = version
            ctx.meta.save(jv)
        # url
        ju = ctx.meta.by_id(self.KEY_JMD_URL)
        if not ju:
            # store the URL (not the version) under the URL key
            ctx.meta.insert(self.KEY_JMD_URL, url)
        else:
            ju.value = url
            ctx.meta.save(ju)

    def all_pos(self, ctx=None):
        '''Return the distinct part-of-speech texts stored in the ``pos`` table.'''
        if ctx is None:
            # open a temporary context and close it when done
            with self.ctx() as new_ctx:
                return self.all_pos(ctx=new_ctx)
        return [x['text'] for x in ctx.execute("SELECT DISTINCT text FROM pos")]

    def _build_search_query(self, query, pos=None):
        '''Build the WHERE fragments and parameters for an entry search.

        ``query`` may be ``id#<number>`` (search by idseq), a text containing
        ``_``, ``@`` or ``%`` (matched with LIKE), or a plain text (matched
        exactly) against kanji, kana and gloss texts. ``pos`` is an optional
        collection of part-of-speech texts to filter on.

        Returns a ``(where, params)`` pair; ``where`` is a list of SQL
        fragments and may be empty.
        '''
        where = []
        params = []
        if query.startswith('id#'):
            query_int = int(query[3:])
            if query_int >= 0:
                getLogger().debug("Searching by ID: {}".format(query_int))
                where.append("idseq = ?")
                params.append(query_int)
        elif query and query != "%":
            _is_wildcard_search = '_' in query or '@' in query or '%' in query
            if _is_wildcard_search:
                where.append("(idseq IN (SELECT idseq FROM Kanji WHERE text like ?) OR idseq IN (SELECT idseq FROM Kana WHERE text like ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text like ?))")
            else:
                where.append("(idseq IN (SELECT idseq FROM Kanji WHERE text == ?) OR idseq IN (SELECT idseq FROM Kana WHERE text == ?) OR idseq IN (SELECT idseq FROM sense JOIN sensegloss ON sense.ID == sensegloss.sid WHERE text == ?))")
            params += (query, query, query)
        if pos:
            if isinstance(pos, str):
                getLogger().warning("POS filter should be a collection, not a string")
                pos = [pos]
            # allow to search by POS
            slots = len(pos)
            if where:
                where.append("AND")
            where.append(f"idseq IN (SELECT idseq FROM Sense WHERE ID IN (SELECT sid FROM pos WHERE text IN ({','.join('?' * slots)})))")
            params += pos
        getLogger().debug(f"Search query: {where} -- Params: {params}")
        return where, params

    def search(self, query, ctx=None, pos=None, **kwargs):
        '''Return the list of entries matching ``query`` (and ``pos`` if given).'''
        # ensure context
        if ctx is None:
            with self.ctx() as new_ctx:
                return self.search(query, ctx=new_ctx, pos=pos)
        where, params = self._build_search_query(query, pos=pos)
        where.insert(0, 'SELECT idseq FROM Entry WHERE ')
        rows = ctx.conn.cursor().execute(' '.join(where), params)
        return [self.get_entry(idseq, ctx=ctx) for (idseq,) in rows]

    def search_iter(self, query, ctx=None, pos=None, **kwargs):
        '''Yield the entries matching ``query`` (and ``pos`` if given) one by one.'''
        # ensure context
        if ctx is None:
            # This is a generator, so results must be yielded (a returned
            # value would never reach the caller); the temporary context
            # stays open while the caller is iterating.
            with self.ctx() as new_ctx:
                yield from self.search_iter(query, ctx=new_ctx, pos=pos)
            return
        where, params = self._build_search_query(query, pos=pos)
        where.insert(0, 'SELECT idseq FROM Entry WHERE ')
        for (idseq,) in ctx.conn.cursor().execute(' '.join(where), params):
            yield self.get_entry(idseq, ctx=ctx)

    def get_entry(self, idseq, ctx=None):
        '''Load the entry ``idseq`` with its info, kanji forms, kana forms and
        senses.'''
        # ensure context
        if ctx is None:
            with self.ctx() as new_context:
                return self.get_entry(idseq, new_context)
        # else (a context is provided)
        # select entry & info
        entry = JMDEntry(idseq)
        # links, bibs, etym, audit ...
        dblinks = ctx.Link.select('idseq=?', (idseq,))
        dbbibs = ctx.Bib.select('idseq=?', (idseq,))
        dbetym = ctx.Etym.select('idseq=?', (idseq,))
        dbaudit = ctx.Audit.select('idseq=?', (idseq,))
        if dblinks or dbbibs or dbetym or dbaudit:
            entry.info = EntryInfo()
            for lnk in dblinks or ():
                entry.info.links.append(Link(lnk.tag, lnk.desc, lnk.uri))
            for bib in dbbibs or ():
                entry.info.bibinfo.append(BibInfo(bib.tag, bib.text))
            for etym in dbetym or ():
                # keep the text, mirroring what insert_entry() writes
                entry.info.etym.append(etym.text)
            for aud in dbaudit or ():
                entry.info.audit.append(Audit(aud.upd_date, aud.upd_detl))
        # select kanji
        for dbkj in ctx.Kanji.select('idseq=?', (idseq,)):
            kj = KanjiForm(dbkj.text)
            kj.info.extend(row.text for row in ctx.KJI.select('kid=?', (dbkj.ID,)))
            kj.pri.extend(row.text for row in ctx.KJP.select('kid=?', (dbkj.ID,)))
            entry.kanji_forms.append(kj)
        # select kana
        for dbkn in ctx.Kana.select('idseq=?', (idseq,)):
            kn = KanaForm(dbkn.text, dbkn.nokanji)
            kn.info.extend(row.text for row in ctx.KNI.select('kid=?', (dbkn.ID,)))
            kn.pri.extend(row.text for row in ctx.KNP.select('kid=?', (dbkn.ID,)))
            kn.restr.extend(row.text for row in ctx.KNR.select('kid=?', (dbkn.ID,)))
            entry.kana_forms.append(kn)
        # select senses
        for dbs in ctx.Sense.select('idseq=?', (idseq,)):
            sid = (dbs.ID,)
            s = Sense()
            s.stagk.extend(row.text for row in ctx.stagk.select('sid=?', sid))
            s.stagr.extend(row.text for row in ctx.stagr.select('sid=?', sid))
            s.pos.extend(row.text for row in ctx.pos.select('sid=?', sid))
            s.xref.extend(row.text for row in ctx.xref.select('sid=?', sid))
            s.antonym.extend(row.text for row in ctx.antonym.select('sid=?', sid))
            s.field.extend(row.text for row in ctx.field.select('sid=?', sid))
            s.misc.extend(row.text for row in ctx.misc.select('sid=?', sid))
            s.info.extend(row.text for row in ctx.SenseInfo.select('sid=?', sid))
            for ls in ctx.SenseSource.select('sid=?', sid):
                s.lsource.append(LSource(ls.lang, ls.lstype, ls.wasei, ls.text))
            s.dialect.extend(row.text for row in ctx.dialect.select('sid=?', sid))
            for g in ctx.SenseGloss.select('sid=?', sid):
                s.gloss.append(SenseGloss(g.lang, g.gend, g.text))
            entry.senses.append(s)
        return entry

    def insert_entries(self, entries, ctx=None):
        '''Insert every entry of ``entries`` using a single context.'''
        # ensure context
        if ctx is None:
            with self.ctx() as new_context:
                return self.insert_entries(entries, ctx=new_context)
        # else
        getLogger().debug("JMdict bulk insert {} entries".format(len(entries)))
        for entry in entries:
            self.insert_entry(entry, ctx)

    def insert_entry(self, entry, ctx=None):
        '''Insert one entry together with its info, kanji, kana and sense rows.'''
        # ensure context
        if ctx is None:
            with self.ctx() as new_ctx:
                return self.insert_entry(entry, ctx=new_ctx)
        # else (a context is provided)
        self.Entry.insert(entry.idseq, ctx=ctx)
        # insert info
        if entry.info:
            for lnk in entry.info.links:
                ctx.Link.insert(entry.idseq, lnk.tag, lnk.desc, lnk.uri)
            for bib in entry.info.bibinfo:
                ctx.Bib.insert(entry.idseq, bib.tag, bib.text)
            for e in entry.info.etym:
                ctx.Etym.insert(entry.idseq, e)
            for a in entry.info.audit:
                ctx.Audit.insert(entry.idseq, a.upd_date, a.upd_detl)
        # insert kanji
        for kj in entry.kanji_forms:
            kjid = ctx.Kanji.insert(entry.idseq, kj.text)
            for kji in kj.info:
                ctx.KJI.insert(kjid, kji)
            for kjp in kj.pri:
                ctx.KJP.insert(kjid, kjp)
        # insert kana
        for kn in entry.kana_forms:
            knid = ctx.Kana.insert(entry.idseq, kn.text, kn.nokanji)
            for kni in kn.info:
                ctx.KNI.insert(knid, kni)
            for knp in kn.pri:
                ctx.KNP.insert(knid, knp)
            for knr in kn.restr:
                ctx.KNR.insert(knid, knr)
        # insert senses
        for s in entry.senses:
            sid = ctx.Sense.insert(entry.idseq)
            for sk in s.stagk:
                ctx.stagk.insert(sid, sk)
            for sr in s.stagr:
                ctx.stagr.insert(sid, sr)
            for pos in s.pos:
                ctx.pos.insert(sid, pos)
            for xr in s.xref:
                ctx.xref.insert(sid, xr)
            for a in s.antonym:
                ctx.antonym.insert(sid, a)
            for f in s.field:
                ctx.field.insert(sid, f)
            for m in s.misc:
                ctx.misc.insert(sid, m)
            for i in s.info:
                ctx.SenseInfo.insert(sid, i)
            for ls in s.lsource:
                ctx.SenseSource.insert(sid, ls.text, ls.lang, ls.lstype, ls.wasei)
            for d in s.dialect:
                ctx.dialect.insert(sid, d)
            for g in s.gloss:
                ctx.SenseGloss.insert(sid, g.lang, g.gend, g.text)
class JMNEDictSchema(Schema):
    '''Database schema for JMnedict: setup scripts plus table declarations.'''

    def __init__(self, db_path, *args, **kwargs):
        super().__init__(db_path, *args, **kwargs)
        self.add_script(JMNEDICT_SETUP_SCRIPT)
        self.add_file(JMNEDICT_SETUP_FILE)
        # Meta
        self.add_table('meta', ['key', 'value'], proto=Meta).set_id('key')
        self.add_table('NEEntry', ['idseq'])
        # Kanji
        self.add_table('NEKanji', ['ID', 'idseq', 'text'])
        # Kana
        self.add_table('NEKana', ['ID', 'idseq', 'text', 'nokanji'])
        # Translation (~Sense of JMdict)
        self.add_table('NETranslation', ['ID', 'idseq'])
        self.add_table('NETransType', ['tid', 'text'])
        self.add_table('NETransXRef', ['tid', 'text'])
        self.add_table('NETransGloss', ['tid', 'lang', 'gend', 'text'])


class JMNEDictSQLite(JMNEDictSchema):
    '''Read/write access to JMnedict (named-entity) entries stored in SQLite.'''

    def __init__(self, db_path, *args, **kwargs):
        super().__init__(db_path, *args, **kwargs)

    def all_ne_type(self, ctx=None):
        '''Return the distinct name-type texts stored in ``NETransType``.'''
        if ctx is None:
            # open a temporary context and close it when done
            with self.ctx() as new_ctx:
                return self.all_ne_type(ctx=new_ctx)
        return [x['text'] for x in ctx.execute("SELECT DISTINCT text FROM NETransType")]

    def _build_ne_search_query(self, query):
        '''Build the WHERE clause and parameters for a named-entity search.

        ``id#<non-negative number>`` searches by idseq. Any other query
        (including a malformed ``id#...``) is matched against kanji, kana,
        gloss and name-type texts, using LIKE when it contains ``_``, ``@``
        or ``%`` and equality otherwise.

        Returns a ``(where, params)`` pair.
        '''
        if query.startswith('id#'):
            try:
                query_int = int(query[3:])
            except ValueError:
                # not a number: fall through to the text search below
                query_int = -1
            if query_int >= 0:
                where = "idseq = ?"
                params = [query_int]
                getLogger().debug(f"where={where} | params={params}")
                return where, params
        _is_wildcard_search = '_' in query or '@' in query or '%' in query
        if _is_wildcard_search:
            where = "idseq IN (SELECT idseq FROM NEKanji WHERE text like ?) OR idseq IN (SELECT idseq FROM NEKana WHERE text like ?) OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransGloss ON NETranslation.ID == NETransGloss.tid WHERE NETransGloss.text like ?) OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransType ON NETranslation.ID == NETransType.tid WHERE NETransType.text like ?)"
        else:
            where = "idseq IN (SELECT idseq FROM NEKanji WHERE text == ?) OR idseq IN (SELECT idseq FROM NEKana WHERE text == ?) OR idseq IN (SELECT idseq FROM NETranslation JOIN NETransGloss ON NETranslation.ID == NETransGloss.tid WHERE NETransGloss.text == ?) or idseq in (SELECT idseq FROM NETranslation JOIN NETransType ON NETranslation.ID == NETransType.tid WHERE NETransType.text == ?)"
        params = [query, query, query, query]
        getLogger().debug(f"where={where} | params={params}")
        return where, params

    def search_ne(self, query, ctx=None, **kwargs) -> Sequence[JMDEntry]:
        '''Return the list of named-entity entries matching ``query``.'''
        if ctx is None:
            with self.ctx() as new_ctx:
                return self.search_ne(query, ctx=new_ctx)
        where, params = self._build_ne_search_query(query)
        sql = 'SELECT idseq FROM NEEntry WHERE ' + where
        return [self.get_ne(idseq, ctx=ctx)
                for (idseq,) in ctx.conn.cursor().execute(sql, params)]

    def search_ne_iter(self, query, ctx=None, **kwargs):
        '''Yield the named-entity entries matching ``query`` one by one.'''
        if ctx is None:
            # This is a generator, so results must be yielded (a returned
            # list would never reach the caller); the temporary context stays
            # open while the caller is iterating.
            with self.ctx() as new_ctx:
                yield from self.search_ne_iter(query, ctx=new_ctx)
            return
        where, params = self._build_ne_search_query(query)
        sql = 'SELECT idseq FROM NEEntry WHERE ' + where
        for (idseq,) in ctx.conn.cursor().execute(sql, params):
            yield self.get_ne(idseq, ctx=ctx)

    def get_ne(self, idseq, ctx=None) -> JMDEntry:
        '''Load the named-entity entry ``idseq`` with its kanji forms, kana
        forms and translations.'''
        # ensure context
        if ctx is None:
            with self.ctx() as new_context:
                # recurse into get_ne itself so the NE* tables are read
                return self.get_ne(idseq, new_context)
        # else (a context is provided)
        entry = JMDEntry(idseq)
        # select kanji
        for dbkj in ctx.NEKanji.select('idseq=?', (idseq,)):
            entry.kanji_forms.append(KanjiForm(dbkj.text))
        # select kana
        for dbkn in ctx.NEKana.select('idseq=?', (idseq,)):
            entry.kana_forms.append(KanaForm(dbkn.text, dbkn.nokanji))
        # select translations
        for dbs in ctx.NETranslation.select('idseq=?', (idseq,)):
            tid = (dbs.ID,)
            s = Translation()
            s.name_type.extend(row.text for row in ctx.NETransType.select('tid=?', tid))
            s.xref.extend(row.text for row in ctx.NETransXRef.select('tid=?', tid))
            for g in ctx.NETransGloss.select('tid=?', tid):
                s.gloss.append(SenseGloss(g.lang, g.gend, g.text))
            entry.senses.append(s)
        return entry

    def insert_name_entities(self, entries, ctx=None):
        '''Insert every entry of ``entries`` using a single context.'''
        # ensure context
        if ctx is None:
            with self.ctx() as new_context:
                return self.insert_name_entities(entries, ctx=new_context)
        # else
        for entry in entries:
            self.insert_name_entity(entry, ctx)

    def insert_name_entity(self, entry, ctx=None):
        '''Insert one named-entity entry with its kanji, kana and translation
        rows.'''
        # ensure context
        if ctx is None:
            with self.ctx() as new_ctx:
                return self.insert_name_entity(entry, ctx=new_ctx)
        # else (a context is provided)
        self.NEEntry.insert(entry.idseq, ctx=ctx)
        # insert kanji
        for kj in entry.kanji_forms:
            ctx.NEKanji.insert(entry.idseq, kj.text)
        # insert kana
        for kn in entry.kana_forms:
            ctx.NEKana.insert(entry.idseq, kn.text, kn.nokanji)
        # insert translations
        for s in entry.senses:
            tid = ctx.NETranslation.insert(entry.idseq)
            for nt in s.name_type:
                ctx.NETransType.insert(tid, nt)
            for xr in s.xref:
                ctx.NETransXRef.insert(tid, xr)
            for g in s.gloss:
                ctx.NETransGloss.insert(tid, g.lang, g.gend, g.text)
def getLogger():
    """Return the logger dedicated to this module."""
    return logging.getLogger(__name__)


# Message shared by every deprecated ``to_json()`` alias in this module
_TO_JSON_DEPRECATED = "to_json() is deprecated and will be removed in the next major release. Use to_dict() instead."


# ------------------------------------------------------------------------------
# Models
# ------------------------------------------------------------------------------

class KanjiDic2(object):
    """Container for a parsed Kanjidic2 file: header fields plus characters."""

    def __init__(self, file_version, database_version, date_of_creation):
        """
        :param file_version: value of the header's ``file_version`` element
        :param database_version: value of the header's ``database_version`` element
        :param date_of_creation: value of the header's ``date_of_creation`` element
        """
        self.file_version = file_version
        self.database_version = database_version
        self.date_of_creation = date_of_creation
        self.characters = []

    def __len__(self):
        return len(self.characters)

    def __getitem__(self, idx):
        return self.characters[idx]


class Character(object):
    """ Represent a kanji character. """

    def __init__(self):
        self.ID = None
        self.literal = ''  # the character itself
        self.codepoints: List[CodePoint] = []
        self.radicals: List[Radical] = []
        self.__canon_radical = None  # cache for the ``radical`` property
        self.stroke_count = None  # first stroke_count in misc
        self.grade = None
        self.stroke_miscounts = []  # stroke_count[1:]
        self.variants: List[Variant] = []
        self.freq = None
        self.rad_names = []  # a list of strings
        self.jlpt = None
        self.dic_refs: List[DicRef] = []
        self.query_codes: List[QueryCode] = []
        self.rm_groups: List[RMGroup] = []  # reading_meaning groups
        self.nanoris = []  # a list of strings

    @property
    def text(self):
        """Alias of :attr:`literal`."""
        return self.literal

    def __repr__(self):
        meanings = self.meanings(english_only=True)
        return "{l}:{sc}:{meanings}".format(l=self.literal, sc=self.stroke_count, meanings=','.join(meanings))

    def __str__(self):
        return self.literal

    def meanings(self, english_only=False):
        ''' Accumulate all meanings as a list of string. Each string is a meaning (i.e. sense)

        :param english_only: keep only meanings whose ``m_lang`` is empty or
            ``'en'`` (the same rule the console dump applies)
        '''
        meanings = []
        for rm in self.rm_groups:
            for m in rm.meanings:
                if english_only and m.m_lang and m.m_lang != 'en':
                    continue
                meanings.append(m.value)
        return meanings

    @property
    def components(self):
        ''' Kanji writing components that compose this character '''
        if self.literal in krad.krad:
            return krad.krad[self.literal]
        else:
            return []

    @property
    def radical(self):
        """Kangxi radical object for the 'classical' radical of this character.

        The lookup result is cached; ``None`` is returned when the character
        has no radical of type ``classical``.
        """
        if self.__canon_radical is None:
            for rad in self.radicals:
                if rad.rad_type == 'classical':
                    self.__canon_radical = KangxiRadical.kangxi()[rad.value]
        return self.__canon_radical

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        """Return a dict view of this character; empty grade/jlpt become '' and empty freq becomes 0."""
        return {'literal': self.literal,
                'codepoints': [cp.to_dict() for cp in self.codepoints],
                'radicals': [r.to_dict() for r in self.radicals],
                'stroke_count': self.stroke_count,
                'grade': self.grade if self.grade else '',
                'stroke_miscounts': self.stroke_miscounts,
                'variants': [v.to_dict() for v in self.variants],
                'freq': self.freq if self.freq else 0,
                'rad_names': self.rad_names,
                'jlpt': self.jlpt if self.jlpt else '',
                'dic_refs': [r.to_dict() for r in self.dic_refs],
                'q_codes': [q.to_dict() for q in self.query_codes],
                'rm': [rm.to_dict() for rm in self.rm_groups],
                'nanoris': list(self.nanoris)}


class CodePoint(object):
    """A code point of a character: an encoding type and a value."""

    def __init__(self, cp_type='', value=''):
        self.cid = None
        self.cp_type = cp_type
        self.value = value

    def __repr__(self):
        # NOTE: this class stores its type in ``cp_type`` (there is no ``r_type``)
        if self.cp_type:
            return "({t}) {v}".format(t=self.cp_type, v=self.value)
        else:
            return self.value

    def __str__(self):
        return self.value

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        return {'type': self.cp_type, 'value': self.value}


class Radical(object):
    """A radical reference: a radical type and a value."""

    def __init__(self, rad_type='', value=''):
        self.cid = None
        self.rad_type = rad_type
        self.value = value

    def __repr__(self):
        if self.rad_type:
            return "({t}) {v}".format(t=self.rad_type, v=self.value)
        else:
            return self.value

    def __str__(self):
        return self.value

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        return {'type': self.rad_type, 'value': self.value}


class Variant(object):
    """A variant reference: a variant type and a value."""

    def __init__(self, var_type='', value=''):
        self.cid = None
        self.var_type = var_type
        self.value = value

    def __repr__(self):
        if self.var_type:
            return "({t}) {v}".format(t=self.var_type, v=self.value)
        else:
            return self.value

    def __str__(self):
        return self.value

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        return {'type': self.var_type, 'value': self.value}


class DicRef(object):
    """A dictionary reference, optionally with ``m_vol`` / ``m_page`` attributes."""

    def __init__(self, dr_type='', value='', m_vol='', m_page=''):
        self.cid = None
        self.dr_type = dr_type
        self.value = value
        self.m_vol = m_vol
        self.m_page = m_page

    def __repr__(self):
        if self.dr_type:
            return "({t}) {v}".format(t=self.dr_type, v=self.value)
        else:
            return self.value

    def __str__(self):
        return self.value

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        return {'type': self.dr_type,
                'value': self.value,
                "m_vol": self.m_vol,
                "m_page": self.m_page}


class QueryCode(object):
    """A query code, optionally flagged with a ``skip_misclass`` attribute."""

    def __init__(self, qc_type='', value='', skip_misclass=""):
        self.cid = None
        self.qc_type = qc_type
        self.value = value
        self.skip_misclass = skip_misclass

    def __repr__(self):
        if self.qc_type:
            return "({t}) {v}".format(t=self.qc_type, v=self.value)
        else:
            return self.value

    def __str__(self):
        return self.value

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        return {'type': self.qc_type,
                'value': self.value,
                "skip_misclass": self.skip_misclass}


class RMGroup(object):
    """A reading/meaning group: readings and the meanings that go with them."""

    def __init__(self, readings=None, meanings=None):
        self.ID = None
        self.cid = None
        self.readings: List[Reading] = readings if readings else []
        self.meanings: List[Meaning] = meanings if meanings else []

    def __repr__(self):
        return "R: {} | M: {}".format(
            ", ".join([r.value for r in self.readings]),
            ", ".join(m.value for m in self.meanings))

    def __str__(self):
        return repr(self)

    @property
    def on_readings(self):
        """Readings whose type is ``ja_on``."""
        return [r for r in self.readings if r.r_type == 'ja_on']

    @property
    def kun_readings(self):
        """Readings whose type is ``ja_kun``."""
        return [r for r in self.readings if r.r_type == 'ja_kun']

    @property
    def other_readings(self):
        """Readings that are neither ``ja_on`` nor ``ja_kun``."""
        return [r for r in self.readings if r.r_type not in ('ja_kun', 'ja_on')]

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        # stable sort: readings whose type starts with 'ja_' come first,
        # everything else keeps its relative order
        sorted_readings = sorted(self.readings, key=lambda x: x.r_type.startswith('ja_'), reverse=True)
        return {'readings': [r.to_dict() for r in sorted_readings],
                'meanings': [m.to_dict() for m in self.meanings]}


class Reading(object):
    """A reading with its type and optional ``on_type`` / ``r_status`` attributes."""

    def __init__(self, r_type='', value='', on_type="", r_status=""):
        self.gid = None
        self.r_type = r_type
        self.value = value
        self.on_type = on_type
        self.r_status = r_status

    def __repr__(self):
        if self.r_type:
            return "({t}) {v}".format(t=self.r_type, v=self.value)
        else:
            return self.value

    def __str__(self):
        return self.value

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        return {'type': self.r_type,
                'value': self.value,
                'on_type': self.on_type,
                'r_status': self.r_status}


class Meaning(object):
    """A meaning and the language code it is tagged with (empty when untagged)."""

    def __init__(self, value='', m_lang=''):
        self.gid = None
        self.m_lang = m_lang
        self.value = value

    def __repr__(self):
        if self.m_lang:
            return "({l}) {v}".format(l=self.m_lang, v=self.value)
        else:
            return self.value

    def __str__(self):
        return self.value

    def to_json(self):
        """Deprecated alias of :meth:`to_dict`."""
        warnings.warn(_TO_JSON_DEPRECATED, DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self):
        return {'m_lang': self.m_lang, 'value': self.value}


class Kanjidic2XMLParser(object):
    """ Kanjidic2 XML parser """

    def __init__(self):
        pass

    def get_attrib(self, a_tag, attr_name, default_value=''):
        """Read an attribute of ``a_tag``, falling back to ``default_value``.

        ``xml:lang`` is translated to its namespace-qualified name first.
        """
        if attr_name == 'xml:lang':
            attr_name = '''{http://www.w3.org/XML/1998/namespace}lang'''
        if attr_name in a_tag.attrib:
            return a_tag.attrib[attr_name]
        else:
            return default_value

    def parse_file(self, kd2_file_path):
        ''' Parse all characters from Kanjidic2 XML file

        :param kd2_file_path: path to the XML file (``~`` is expanded)
        :returns: a :class:`KanjiDic2`, or ``None`` when the file contains
            neither a header nor any character
        '''
        actual_path = os.path.abspath(os.path.expanduser(kd2_file_path))
        getLogger().debug('Loading data from file: {}'.format(actual_path))
        kd2 = None
        with chio.open(actual_path, mode='rb') as kd2file:
            for event, element in etree.iterparse(kd2file):
                if event != 'end':
                    continue
                if element.tag == 'header':
                    kd2 = self.parse_header(element)
                    element.clear()  # free the subtree once consumed
                elif element.tag == 'character':
                    if kd2 is None:
                        # a character arrived before any header: keep the
                        # data instead of failing on a missing container
                        getLogger().warning("Character found before header, version information is not available")
                        kd2 = KanjiDic2(None, None, None)
                    kd2.characters.append(self.parse_char(element))
                    element.clear()
        return kd2

    def parse_header(self, e):
        """Build a :class:`KanjiDic2` from the children of a ``header`` element."""
        fv = None
        dbv = None
        doc = None
        for child in e:
            if child.tag == 'file_version':
                fv = child.text
            elif child.tag == 'database_version':
                dbv = child.text
            elif child.tag == 'date_of_creation':
                doc = child.text
        return KanjiDic2(fv, dbv, doc)

    def parse_char(self, e):
        """Build a :class:`Character` from a ``character`` element."""
        char = Character()
        for child in e:
            if child.tag == 'literal':
                char.literal = child.text
            elif child.tag == 'codepoint':
                self.parse_codepoint(child, char)
            elif child.tag == 'radical':
                self.parse_radical(child, char)
            elif child.tag == 'misc':
                self.parse_misc(child, char)
            elif child.tag == 'dic_number':
                self.parse_dic_refs(child, char)
            elif child.tag == 'query_code':
                self.parse_query_code(child, char)
            elif child.tag == 'reading_meaning':
                self.parse_reading_meaning(child, char)
            else:
                getLogger().warning("Unknown tag in child: {}".format(child.tag))
        return char

    def parse_codepoint(self, e, char):
        """Append every ``cp_value`` child of ``e`` to ``char.codepoints``."""
        for child in e:
            if child.tag == 'cp_value':
                cp = CodePoint(self.get_attrib(child, 'cp_type'), child.text)
                char.codepoints.append(cp)
            else:
                getLogger().warning("Unknown tag: {}".format(child.tag))

    def parse_radical(self, e, char):
        """Append every ``rad_value`` child of ``e`` to ``char.radicals``."""
        for child in e:
            if child.tag == 'rad_value':
                rad = Radical(self.get_attrib(child, "rad_type"), child.text)
                char.radicals.append(rad)
            else:
                getLogger().warning("Unknown tag: {}".format(child.tag))

    def parse_misc(self, e, char):
        """Fill grade, stroke counts, variants, freq, radical names and jlpt from ``e``."""
        for child in e:
            # grade?, stroke_count+, variant*, freq?, rad_name*,jlpt?
            if child.tag == 'grade':
                char.grade = child.text
            elif child.tag == 'stroke_count':
                # the first count is the accepted one, later ones are miscounts
                if char.stroke_count is None:
                    char.stroke_count = int(child.text)
                else:
                    char.stroke_miscounts.append(int(child.text))
            elif child.tag == 'variant':
                v = Variant(self.get_attrib(child, "var_type"), child.text)
                char.variants.append(v)
            elif child.tag == 'freq':
                char.freq = child.text
            elif child.tag == 'rad_name':
                char.rad_names.append(child.text)
            elif child.tag == 'jlpt':
                char.jlpt = child.text
            else:
                getLogger().warning("Unknown tag: {}".format(child.tag))

    def parse_dic_refs(self, e, char):
        """Append every ``dic_ref`` child of ``e`` to ``char.dic_refs``."""
        for child in e:
            if child.tag == 'dic_ref':
                dr_type = self.get_attrib(child, "dr_type")
                m_vol = self.get_attrib(child, "m_vol")
                m_page = self.get_attrib(child, "m_page")
                dr = DicRef(dr_type, child.text, m_vol, m_page)
                char.dic_refs.append(dr)
            else:
                getLogger().warning("Unknown tag: {}".format(child.tag))

    def parse_query_code(self, e, char):
        """Append every ``q_code`` child of ``e`` to ``char.query_codes``."""
        for child in e:
            if child.tag == "q_code":
                qc_type = self.get_attrib(child, "qc_type")
                skip_misclass = self.get_attrib(child, "skip_misclass")
                char.query_codes.append(QueryCode(qc_type, child.text, skip_misclass))
            else:
                getLogger().warning("Unknown tag: {}".format(child.tag))

    def parse_reading_meaning(self, e, char):
        """Fill ``char.nanoris`` and ``char.rm_groups`` from a ``reading_meaning`` element."""
        for child in e:
            if child.tag == "nanori":
                char.nanoris.append(child.text)
            elif child.tag == "rmgroup":
                rmgroup = RMGroup()
                char.rm_groups.append(rmgroup)
                for grandchild in child:
                    if grandchild.tag == 'reading':
                        r_type = self.get_attrib(grandchild, "r_type")
                        on_type = self.get_attrib(grandchild, "on_type")
                        r_status = self.get_attrib(grandchild, "r_status")
                        r = Reading(r_type, grandchild.text, on_type, r_status)
                        rmgroup.readings.append(r)
                    elif grandchild.tag == 'meaning':
                        m = Meaning(grandchild.text, self.get_attrib(grandchild, "m_lang"))
                        rmgroup.meanings.append(m)
                    else:
                        getLogger().warning("Unknown tag: {}".format(grandchild.tag))
            else:
                getLogger().warning("Unknown tag: {}".format(child.tag))
def getLogger():
    """Return the logger dedicated to this module."""
    return logging.getLogger(__name__)


# ------------------------------------------------------------------------------
# Models
# ------------------------------------------------------------------------------

class KanjiDic2Schema(Schema):
    """Schema of the Kanjidic2 tables: setup scripts plus table declarations."""

    # keys of the ``meta`` rows that hold the header of the imported file
    KEY_FILE_VER = 'kanjidic2.file_version'
    KEY_DB_VER = 'kanjidic2.database_version'
    KEY_CREATED_DATE = 'kanjidic2.date_of_creation'

    def __init__(self, db_path, *args, **kwargs):
        super().__init__(db_path, *args, **kwargs)
        self.add_file(KANJIDIC2_SETUP_FILE)
        self.add_script(KANJIDIC2_SETUP_SCRIPT)
        # Meta
        self.add_table('meta', ['key', 'value'], proto=Meta).set_id('key')
        self.add_table('character', ['ID', 'literal', 'stroke_count', 'grade', 'freq', 'jlpt'],
                       proto=Character, alias="char").set_id('ID')
        self.add_table('codepoint', ['cid', 'cp_type', 'value'], proto=CodePoint)
        self.add_table('radical', ['cid', 'rad_type', 'value'], proto=Radical)
        self.add_table('stroke_miscount', ['cid', 'value'], alias="smc")
        self.add_table('variant', ['cid', 'var_type', 'value'], proto=Variant)
        self.add_table('rad_name', ['cid', 'value'])
        self.add_table('dic_ref', ['cid', 'dr_type', 'value', 'm_vol', 'm_page'], proto=DicRef)
        self.add_table('query_code', ['cid', 'qc_type', 'value', 'skip_misclass'], proto=QueryCode)
        self.add_table('nanori', ['cid', 'value'])
        self.add_table('rm_group', ['ID', 'cid'], proto=RMGroup, alias='rmg').set_id('ID')
        self.add_table('reading', ['gid', 'r_type', 'value', 'on_type', 'r_status'], proto=Reading)
        self.add_table('meaning', ['gid', 'value', 'm_lang'], proto=Meaning)


class KanjiDic2SQLite(KanjiDic2Schema):
    """Read/write access to Kanjidic2 characters stored in SQLite."""

    def __init__(self, db_path, *args, **kwargs):
        super().__init__(db_path, *args, **kwargs)

    def _set_meta(self, ctx, key, value):
        """Insert the ``meta`` row ``key`` or update its value when it exists."""
        record = ctx.meta.by_id(key)
        if not record:
            ctx.meta.insert(key, value)
        else:
            record.value = value
            ctx.meta.save(record)

    def update_kd2_meta(self, file_version, database_version, date_of_creation, ctx=None):
        """Store the header of the imported Kanjidic2 file in the ``meta`` table.

        :param file_version: saved under ``KEY_FILE_VER``
        :param database_version: saved under ``KEY_DB_VER``
        :param date_of_creation: saved under ``KEY_CREATED_DATE``
        :param ctx: an open execution context; a temporary one is opened when omitted
        """
        # ensure context
        if ctx is None:
            with self.ctx() as new_context:
                return self.update_kd2_meta(file_version, database_version, date_of_creation, new_context)
        # else
        self._set_meta(ctx, self.KEY_FILE_VER, file_version)
        self._set_meta(ctx, self.KEY_DB_VER, database_version)
        self._set_meta(ctx, self.KEY_CREATED_DATE, date_of_creation)

    def insert_chars(self, chars, ctx=None):
        """Insert several characters, sharing a single context."""
        # ensure context
        if ctx is None:
            with self.ctx() as ctx:
                return self.insert_chars(chars, ctx=ctx)
        # else
        for c in chars:
            self.insert_char(c, ctx=ctx)

    def insert_char(self, c, ctx=None):
        """Insert one character and all of its child records.

        The generated character id is written back to ``c.ID`` (and each
        reading/meaning group id to ``rmg.ID``) so child rows can refer to it.
        """
        # ensure context
        if ctx is None:
            with self.ctx() as ctx:
                return self.insert_char(c, ctx=ctx)
        # else
        c.ID = ctx.character.save(c)
        # save codepoints
        for cp in c.codepoints:
            cp.cid = c.ID
            ctx.codepoint.save(cp)
        # radicals
        for r in c.radicals:
            r.cid = c.ID
            ctx.radical.save(r)
        # stroke_miscount
        for smc in c.stroke_miscounts:
            ctx.smc.insert(c.ID, smc)
        # variants
        for v in c.variants:
            v.cid = c.ID
            ctx.variant.save(v)
        # radnames
        for rn in c.rad_names:
            ctx.rad_name.insert(c.ID, rn)
        # dic_refs
        for dr in c.dic_refs:
            dr.cid = c.ID
            ctx.dic_ref.save(dr)
        # query_codes
        for qc in c.query_codes:
            qc.cid = c.ID
            ctx.query_code.save(qc)
        # nanoris
        for n in c.nanoris:
            ctx.nanori.insert(c.ID, n)
        # reading groups
        for rmg in c.rm_groups:
            rmg.cid = c.ID
            rmg.ID = ctx.rmg.save(rmg)
            # save readings inside
            for r in rmg.readings:
                r.gid = rmg.ID
                ctx.reading.save(r)
            # save meanings inside
            for m in rmg.meanings:
                m.gid = rmg.ID
                ctx.meaning.save(m)

    def search_chars_iter(self, chars, ctx=None):
        """Yield the stored character for each literal in ``chars``, skipping unknown ones."""
        if ctx is None:
            # This function is a generator, so a plain ``return <value>``
            # would discard the results. Delegate with ``yield from``; the
            # temporary context stays open while the caller is iterating.
            with self.ctx() as ctx:
                yield from self.search_chars_iter(chars, ctx=ctx)
            return
        for c in chars:
            res = self.get_char(c, ctx=ctx)
            if res is not None:
                yield res

    def get_char(self, literal, ctx=None):
        """Return the fully-loaded character whose literal is ``literal``, or ``None``."""
        if ctx is None:
            with self.ctx() as ctx:
                return self.get_char(literal, ctx=ctx)
        # context was ensured
        c = ctx.char.select_single('literal=?', (literal,))
        if not c:
            getLogger().debug("character {} could not be found".format(literal))
            return None
        else:
            return self.char_by_id(c.ID, ctx)

    def char_by_id(self, cid, ctx=None):
        """Return the character with id ``cid`` and all its child records, or ``None``."""
        if ctx is None:
            with self.ctx() as ctx:
                return self.char_by_id(cid, ctx=ctx)
        # context was ensured
        c = ctx.char.by_id(cid)
        if c is None:
            getLogger().debug("character ID {} could not be found".format(cid))
            return None
        c.codepoints = ctx.codepoint.select('cid=?', (cid,))
        c.radicals = ctx.radical.select('cid=?', (cid,))
        for smc in ctx.smc.select('cid=?', (cid,)):
            c.stroke_miscounts.append(smc.value)
        c.variants = ctx.variant.select('cid=?', (cid,))
        for r in ctx.rad_name.select('cid=?', (cid,)):
            c.rad_names.append(r.value)
        c.dic_refs = ctx.dic_ref.select('cid=?', (cid,))
        c.query_codes = ctx.query_code.select('cid=?', (cid,))
        for n in ctx.nanori.select('cid=?', (cid,)):
            c.nanoris.append(n.value)
        c.rm_groups = ctx.rmg.select('cid=?', (cid,))
        for rmg in c.rm_groups:
            rmg.readings = ctx.reading.select('gid=?', (rmg.ID,))
            rmg.meanings = ctx.meaning.select('gid=?', (rmg.ID,))
        return c
class KRad:
    ''' This class contains mapping from radicals to kanjis (radk) and kanjis to radicals (krad)

    Both maps are loaded lazily from ``KRADFILE`` on first access.
    '''

    def __init__(self, **kwargs):
        """ Kanji-Radical mapping """
        self.__krad_map: Mapping = None
        self.__radk_map: Mapping = None
        self.__rads = {}
        self.lock = threading.Lock()

    @staticmethod
    def _parse_krad_lines(lines):
        """Turn kradfile lines into a ``(krad, radk)`` pair of maps.

        Lines starting with ``#`` and lines without a ``:`` are ignored.
        ``krad`` maps a character to its list of components; ``radk`` maps a
        component to the set of characters that contain it.
        """
        krad_map = {}
        radk_map = dd(set)
        for line in lines:
            if line.startswith("#"):
                continue
            parts = line.split(':', maxsplit=1)
            if len(parts) != 2:
                continue
            char_literal = parts[0].strip()
            rads = parts[1].split()
            krad_map[char_literal] = rads
            for rad in rads:
                radk_map[rad].add(char_literal)
        return krad_map, radk_map

    def _build_krad_map(self):
        """Load ``KRADFILE`` and publish both maps, once."""
        with self.lock:
            # another caller may have finished the build while we waited
            if self.__krad_map is not None and self.__radk_map is not None:
                return
            lines = chio.read_file(KRADFILE, mode='rt').splitlines()
            krad_map, radk_map = self._parse_krad_lines(lines)
            # Publish only fully-built maps: the properties below test
            # ``is None`` without holding the lock, so a half-filled map
            # must never become visible.
            self.__radk_map = radk_map
            self.__krad_map = krad_map

    @property
    def radk(self) -> Mapping:
        """Map from a component to the set of characters containing it."""
        if self.__radk_map is None:
            self._build_krad_map()
        return self.__radk_map

    @property
    def krad(self) -> Mapping:
        """Map from a character to the list of its components."""
        if self.__krad_map is None:
            self._build_krad_map()
        return self.__krad_map
def getLogger():
    """Return the logger dedicated to this module."""
    return logging.getLogger(__name__)


# -------------------------------------------------------------------------------
# Functions
# -------------------------------------------------------------------------------

def get_jam(cli, args):
    # Build a Jamdict object from the command-line arguments.
    # When neither a Kanjidic2 nor a JMnedict database is given explicitly,
    # the JMDict database path is reused for all three dictionaries.
    if not args.jdb:
        args.jdb = None
    if args.config:
        jamdict.config.read_config(args.config)
    if args.kd2 or args.jmne:
        cli.logger.warning("Jamdict database location: {}".format(args.jdb))
        cli.logger.warning("Kanjidic2 database location: {}".format(args.kd2))
        jmd = jamdict.Jamdict(db_file=args.jdb,
                              kd2_file=args.kd2,
                              jmd_xml_file=args.jmdxml,
                              kd2_xml_file=args.kd2xml,
                              jmnedict_file=args.jmne,
                              jmnedict_xml_file=args.jmnexml)
    else:
        cli.logger.debug("Using the same database for both JMDict and Kanjidic2")
        jmd = jamdict.Jamdict(db_file=args.jdb,
                              kd2_file=args.jdb,
                              jmnedict_file=args.jdb,
                              jmd_xml_file=args.jmdxml,
                              kd2_xml_file=args.kd2xml,
                              jmnedict_xml_file=args.jmnexml)
    if jmd.kd2 is None:
        cli.logger.warning("Kanjidic2 database could not be found")
    return jmd


def import_data(cli, args):
    '''Generate Jamdict SQLite database from XML data files'''
    import sys  # local import: only needed for the abort path below

    rp = TextReport()
    t = Timer(report=rp)
    show_info(cli, args)
    jam = get_jam(cli, args)
    if not jam.db_file:
        # nothing to import into: stop here instead of importing to "None"
        print("Database path is not available")
        return
    if os.path.isfile(jam.db_file):
        if not confirm("Database file exists. Do you want to overwite (This action cannot be undone! yes/no?) "):
            cli.logger.warning("Program aborted.")
            # sys.exit() instead of the site-provided exit(), which is not
            # guaranteed to exist (e.g. under ``python -S``)
            sys.exit()
        else:
            os.unlink(jam.db_file)
    # perform input
    print(f"Importing data to: {jam.db_file}")
    t.start("Creating Jamdict SQLite database. This process may take very long time ...")
    jam.import_data()
    t.stop()


def dump_result(results, report=None):
    # Print a lookup result (entries, characters, names) in the verbose
    # console layout; a fresh TextReport is used when no report is given.
    if report is None:
        report = TextReport()
    if results.entries:
        report.print("=" * 40)
        report.print("Found entries")
        report.print("=" * 40)
        for e in results.entries:
            kj = ', '.join([k.text for k in e.kanji_forms])
            kn = ', '.join([k.text for k in e.kana_forms])
            report.print("Entry: {} | Kj: {} | Kn: {}".format(e.idseq, kj, kn))
            report.print("-" * 20)
            for idx, s in enumerate(e.senses):
                report.print("{idx}. {s}".format(idx=idx + 1, s=s))
            report.print('')
    else:
        report.print("No dictionary entry was found.")
    if results.chars:
        report.print("=" * 40)
        report.print("Found characters")
        report.print("=" * 40)
        for c in results.chars:
            report.print("Char: {} | Strokes: {}".format(c, c.stroke_count))
            report.print("-" * 20)
            for rmg in c.rm_groups:
                report.print("Readings:", ", ".join([r.value for r in rmg.readings]))
                report.print("Meanings:", ", ".join([m.value for m in rmg.meanings if not m.m_lang or m.m_lang == 'en']))
            report.print('')
        report.print('')
    else:
        report.print("No character was found.")
    if results.names:
        report.print("=" * 40)
        report.print("Found name entities")
        report.print("=" * 40)
        for e in results.names:
            kj = ', '.join([k.text for k in e.kanji_forms])
            kn = ', '.join([k.text for k in e.kana_forms])
            report.print("Names: {} | Kj: {} | Kn: {}".format(e.idseq, kj, kn))
            report.print("-" * 20)
            for idx, s in enumerate(e.senses):
                report.print("{idx}. {s}".format(idx=idx + 1, s=s))
            report.print('')
    else:
        report.print("No name was found.")


def lookup(cli, args):
    '''Lookup words by kanji/kana'''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup(args.query, strict_lookup=args.strict)
        report = TextReport(args.output)
        if args.format == 'json':
            report.print(json.dumps(results.to_dict(),
                                    ensure_ascii=args.ensure_ascii,
                                    indent=args.indent if args.indent else None))
        else:
            if args.compact:
                report.print(results.text(separator='\n------\n', entry_sep='\n'))
            else:
                dump_result(results, report=report)
    else:
        getLogger().warning(f"Jamdict database is not available.\nThere are 3 ways to install data: \n 1) install jamdict_data via PyPI using `pip install jamdict_data` \n 2) download prebuilt dictionary database file from: {jamdict.__url__}, \n 3) or build your own database file from XML source files.")


def file_status(file_path):
    # '[OK]' when file_path names an existing file ('~' is expanded),
    # '[NOT FOUND]' otherwise (including an empty or missing path)
    if file_path:
        real_path = os.path.abspath(os.path.expanduser(file_path))
        if os.path.isfile(real_path):
            return '[OK]'
    return '[NOT FOUND]'


def hello_jamdict(cli, args):
    ''' Say hello and test if Jamdict is working '''
    jam = get_jam(cli, args)
    if jam.ready:
        results = jam.lookup("一期一会")
        dump_result(results, report=TextReport())
    else:
        getLogger().warning("Hello there, unfortunately jamdict data is not available. Please try to install using `pip install jamdict-data`")


def show_info(cli, args):
    ''' Show jamdict configuration (data folder, configuration file location, etc.) '''
    output = TextReport(args.output) if 'output' in args else TextReport()
    if args.config:
        jamdict.config.read_config(args.config)
    output.print("Jamdict " + jamdict.version_info.__version__)
    output.print(jamdict.version_info.__description__)
    jam = get_jam(cli, args)
    output.header("Basic configuration")
    jamdict_home = jamdict.config.home_dir()
    if not os.path.isdir(os.path.expanduser(jamdict_home)):
        jamdict_home += " [Missing]"
    else:
        jamdict_home += " [OK]"
    output.print(f"JAMDICT_HOME: {jamdict_home}")
    if jamdict.util._JAMDICT_DATA_AVAILABLE:
        import jamdict_data
        data_pkg = f"version {jamdict_data.__version__} [OK]"
    else:
        data_pkg = "Not installed"
    output.print(f"jamdict-data: {data_pkg}")
    if args.config:
        _config_path = args.config + " [Custom]"
        if not os.path.isfile(args.config):
            _config_path += " [Missing]"
    else:
        _config_path = jamdict.config._get_config_manager().locate_config()
    if not _config_path:
        _config_path = "Not available.\n Run `python3 -m jamdict config` to create configuration file if needed."
    output.print(f"Config file : {_config_path}")

    output.header("Data files")
    output.print(f"Jamdict DB location: {jam.db_file} - {file_status(jam.db_file)}")
    output.print(f"JMDict XML file : {jam.jmd_xml_file} - {file_status(jam.jmd_xml_file)}")
    output.print(f"KanjiDic2 XML file : {jam.kd2_xml_file} - {file_status(jam.kd2_xml_file)}")
    output.print(f"JMnedict XML file : {jam.jmnedict_xml_file} - {file_status(jam.jmnedict_xml_file)}")

    if jam.ready:
        output.header("Jamdict database metadata")
        try:
            for meta in jam.jmdict.meta.select():
                output.print(f"{meta.key}: {meta.value}")
        except Exception as e:
            # best effort: the info report must still complete
            print(e)
            output.print("Error happened while retrieving database meta data")
    output.header("Others")
    output.print(f"puchikarui: version {puchikarui_version}")
    output.print(f"chirptext : version {chirptext_version}")
    output.print(f"lxml : {jamdict.jmdict._LXML_AVAILABLE}")


def show_version(cli, args):
    ''' Show Jamdict version '''
    if args.verbose:
        print("Jamdict {v} - {d}".format(d=jamdict.version_info.__description__, v=jamdict.version_info.__version__))
    else:
        print("Jamdict {}".format(jamdict.version_info.__version__))


def config_jamdict(cli, args):
    ''' Create Jamdict configuration file '''
    if args.config:
        jamdict.config._ensure_config(args.config)
    else:
        jamdict.config._ensure_config()
    show_info(cli, args)


# -------------------------------------------------------------------------------
# Main
# -------------------------------------------------------------------------------

def add_data_config(parser):
    # Register the data-location options shared by every sub-command
    parser.add_argument('-c', '--config', help='Path to Jamdict config file (i.e. ~/.jamdict/config.json)', default=None)
    parser.add_argument('-J', '--jdb', help='Path to JMDict SQLite file', default=None)
    parser.add_argument('-j', '--jmdxml', help='Path to JMdict XML file', default=None)
    parser.add_argument('-k', '--kd2xml', help='Path to KanjiDic2 XML file', default=None)
    parser.add_argument('-e', '--jmnexml', help='Path to JMnedict XML file', default=None)
    parser.add_argument('-K', '--kd2', help='Path to KanjiDic2 SQLite file', default=None)
    parser.add_argument('-E', '--jmne', help='Path to JMnedict SQLite file', default=None)


def main():
    '''Main entry of jamtk
    '''
    app = CLIApp(desc='Jamdict command-line toolkit', logger=__name__, show_version=show_version)
    add_data_config(app.parser)

    # import task
    import_task = app.add_task('import', func=import_data)
    add_data_config(import_task)

    # show info
    info_task = app.add_task('info', func=show_info)
    info_task.add_argument('-o', '--output', help='Write information to a text file')
    add_data_config(info_task)

    # show version
    version_task = app.add_task('version', func=show_version)
    add_data_config(version_task)

    # create config file
    config_task = app.add_task('config', func=config_jamdict)
    add_data_config(config_task)

    # hello
    hello_task = app.add_task('hello', func=hello_jamdict)
    add_data_config(hello_task)

    # look up task
    lookup_task = app.add_task('lookup', func=lookup)
    lookup_task.add_argument('query', help='kanji/kana')
    lookup_task.add_argument('-f', '--format', help='json or text')
    lookup_task.add_argument('--compact', action='store_true')
    lookup_task.add_argument('-s', '--strict', action='store_true')
    lookup_task.add_argument('--ensure_ascii', help='Force JSON dumps to ASCII only', action='store_true')
    lookup_task.add_argument('--indent', help='JSON default indent', default=2, type=int)
    lookup_task.add_argument('-o', '--output', help='Path to a file to output lookup result, leave blank to write to console standard output')
    lookup_task.set_defaults(func=lookup)
    add_data_config(lookup_task)

    # run app
    app.run()
class LookupResult(object):
    """ Contain lookup results (words, Kanji characters, or named entities) from Jamdict.

    A typical jamdict lookup is like this:

    >>> jam = Jamdict()
    >>> result = jam.lookup('食べ%る')

    The command above returns a :any:`LookupResult` object which contains found words (:any:`entries`),
    kanji characters (:any:`chars`), and named entities (:any:`names`).

    :param entries: found word entries; a falsy value is stored as an empty list
    :param chars: found kanji characters (optional); a falsy value is stored as an empty list
    :param names: found named entities (optional); a falsy value is stored as an empty list
    """

    def __init__(self, entries, chars=None, names=None):
        # ``chars`` is optional, matching IterLookupResult's constructor
        self.__entries: "Sequence[JMDEntry]" = entries if entries else []
        self.__chars: "Sequence[Character]" = chars if chars else []
        self.__names: "Sequence[JMDEntry]" = names if names else []

    @property
    def entries(self) -> "Sequence[JMDEntry]":
        """ A list of words entries

        :returns: a list of :class:`JMDEntry` object
        :rtype: List[JMDEntry]
        """
        return self.__entries

    @entries.setter
    def entries(self, values: "Sequence[JMDEntry]"):
        self.__entries = values

    @property
    def chars(self) -> "Sequence[Character]":
        """ A list of found kanji characters

        :returns: a list of :class:`Character` object
        :rtype: Sequence[Character]
        """
        return self.__chars

    @chars.setter
    def chars(self, values: "Sequence[Character]"):
        self.__chars = values

    @property
    def names(self) -> "Sequence[JMDEntry]":
        """ A list of found named entities

        :returns: a list of :class:`JMDEntry` object
        :rtype: Sequence[JMDEntry]
        """
        return self.__names

    @names.setter
    def names(self, values: "Sequence[JMDEntry]"):
        self.__names = values

    def text(self, compact=True, entry_sep='。', separator=' | ', no_id=False, with_chars=True) -> str:
        """ Generate a text string that contains all found words, characters, and named entities.

        :param compact: Make the output string more compact (fewer info, fewer whitespaces, etc.)
        :param entry_sep: The text to separate entries (also placed right after each section header)
        :param separator: The text placed between the ``[Entries]``, ``[Chars]`` and ``[Names]`` sections
        :param no_id: Do not include jamdict's internal object IDs (for direct query via API)
        :param with_chars: Include characters information
        :returns: A formatted string ready for display
        """
        output = []
        if self.entries:
            entry_txts = ["#{}: {}".format(idx, e.text(compact=compact, separator=' ', no_id=no_id))
                          for idx, e in enumerate(self.entries, start=1)]
            output.append("[Entries]")
            output.append(entry_sep)
            output.append(entry_sep.join(entry_txts))
        elif not compact:
            output.append("No entries")
        if self.chars and with_chars:
            # compact mode shows str(char), verbose mode shows repr(char)
            render = str if compact else repr
            chars_txt = ', '.join(render(c) for c in self.chars)
            if output:
                output.append(separator)  # TODO: section separator?
            output.append("[Chars]")
            output.append(entry_sep)
            output.append(chars_txt)
        if self.names:
            name_txts = ["#{}: {}".format(idx, n.text(compact=compact, separator=' ', no_id=no_id))
                         for idx, n in enumerate(self.names, start=1)]
            if output:
                output.append(separator)
            output.append("[Names]")
            output.append(entry_sep)
            output.append(entry_sep.join(name_txts))
        return "".join(output) if output else "Found nothing"

    def __repr__(self):
        return self.text(compact=True)

    def __str__(self):
        return self.text(compact=False)

    def to_json(self):
        """ Deprecated alias of :meth:`to_dict`; emits a ``DeprecationWarning`` """
        warnings.warn("to_json() is deprecated and will be removed in the next major release. Use to_dict() instead.",
                      DeprecationWarning, stacklevel=2)
        return self.to_dict()

    def to_dict(self) -> dict:
        """ Convert the result into a dict with ``entries``, ``chars`` and ``names`` keys,
        each holding the ``to_dict()`` output of the contained objects
        """
        return {'entries': [e.to_dict() for e in self.entries],
                'chars': [c.to_dict() for c in self.chars],
                'names': [n.to_dict() for n in self.names]}


class IterLookupResult(object):
    """ Contain lookup results (words, Kanji characters, or named entities) from Jamdict.

    A typical jamdict lookup is like this:

    >>> res = jam.lookup_iter("花見")

    ``res`` is an :class:`IterLookupResult` object which contains iterators
    to scan through found words (``entries``), kanji characters (``chars``),
    and named entities (:any:`names`) one by one.

    >>> for word in res.entries:
    ...     print(word)  # do something with the word
    >>> for c in res.chars:
    ...     print(c)
    >>> for name in res.names:
    ...     print(name)
    """

    def __init__(self, entries, chars=None, names=None):
        # only None is replaced: an iterator must be stored as-is, without being consumed or truth-tested
        self.__entries = entries if entries is not None else []
        self.__chars = chars if chars is not None else []
        self.__names = names if names is not None else []

    @property
    def entries(self):
        """ Iterator for looping one by one through all found entries, can only be used once """
        return self.__entries

    @property
    def chars(self):
        """ Iterator for looping one by one through all found kanji characters, can only be used once """
        return self.__chars

    @property
    def names(self):
        """ Iterator for looping one by one through all found named entities, can only be used once """
        return self.__names
Jamdict >= 0.1a10 support ``memory_mode`` keyword argument for reading the whole database into memory before querying to boost up search speed. The database may take about a minute to load. Here is the sample code: >>> jam = Jamdict(memory_mode=True) When there is no suitable database available, Jamdict will try to use database from `jamdict-data `_ package by default. If there is a custom database available in configuration file, Jamdict will prioritise to use it over the ``jamdict-data`` package. """ def __init__(self, db_file=None, kd2_file=None, jmd_xml_file=None, kd2_xml_file=None, auto_config=True, auto_expand=True, reuse_ctx=True, jmnedict_file=None, jmnedict_xml_file=None, memory_mode=False, **kwargs): # data sources self.reuse_ctx = reuse_ctx self._db_sqlite = None self._kd2_sqlite = None self._jmne_sqlite = None self._jmd_xml = None self._kd2_xml = None self._jmne_xml = None self.__krad_map = None self.__jm_ctx = None # for reusing database context self.__memory_mode = memory_mode # file paths configuration self.auto_expand = auto_expand self.jmd_xml_file = jmd_xml_file if jmd_xml_file else config.get_file('JMDICT_XML') if auto_config else None self.kd2_xml_file = kd2_xml_file if kd2_xml_file else config.get_file('KD2_XML') if auto_config else None self.jmnedict_xml_file = jmnedict_xml_file if jmnedict_xml_file else config.get_file('JMNEDICT_XML') if auto_config else None if auto_expand: if self.jmd_xml_file: self.jmd_xml_file = os.path.expanduser(self.jmd_xml_file) if self.kd2_xml_file: self.kd2_xml_file = os.path.expanduser(self.kd2_xml_file) if self.jmnedict_xml_file: self.jmnedict_xml_file = os.path.expanduser(self.jmnedict_xml_file) self.db_file = db_file if db_file else config.get_file('JAMDICT_DB') if auto_config else None if not self.db_file or (self.db_file != ':memory:' and not os.path.isfile(self.db_file)): if _JAMDICT_DATA_AVAILABLE: self.db_file = jamdict_data.JAMDICT_DB_PATH elif self.jmd_xml_file and os.path.isfile(self.jmd_xml_file): 
getLogger().warning("JAMDICT_DB could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict import` first") self.kd2_file = kd2_file if kd2_file else self.db_file if auto_config else None if not self.kd2_file or (self.kd2_file != ':memory:' and not os.path.isfile(self.kd2_file)): if _JAMDICT_DATA_AVAILABLE: self.kd2_file = None # jamdict_data.JAMDICT_DB_PATH elif self.kd2_xml_file and os.path.isfile(self.kd2_xml_file): getLogger().warning("Kanjidic2 database could NOT be found. Searching will be extremely slow. Please run `python3 -m jamdict import` first") self.jmnedict_file = jmnedict_file if jmnedict_file else self.db_file if auto_config else None if not self.jmnedict_file or (self.jmnedict_file != ':memory:' and not os.path.isfile(self.jmnedict_file)): if _JAMDICT_DATA_AVAILABLE: self.jmnedict_file = None # jamdict_data.JAMDICT_DB_PATH elif self.jmnedict_xml_file and os.path.isfile(self.jmnedict_xml_file): getLogger().warning("JMNE database could NOT be found. Searching will be extremely slow. 
@property
def ready(self) -> bool:
    """ Check if Jamdict database is available

    :returns: ``True`` only when ``db_file`` is set, points to an existing file,
              and the ``jmdict`` accessor is not ``None``
    """
    db_path = self.db_file
    if not db_path:
        # nothing configured: os.path.isfile(None) would raise TypeError instead of answering "not ready"
        return False
    return os.path.isfile(db_path) and self.jmdict is not None
@property
def kd2(self):
    """ KanjiDic2 SQLite database access object, created on first access and cached in ``_kd2_sqlite``.

    When ``kd2_file`` is an existing file a dedicated ``KanjiDic2SQLite`` is opened on it.
    Otherwise, when ``kd2_file`` is unset or equal to ``db_file``, the shared ``jmdict`` accessor is reused.
    In any other case nothing is cached and ``None`` is returned.
    """
    if self._kd2_sqlite is None:
        if self.kd2_file is not None and os.path.isfile(self.kd2_file):
            # NOTE(review): threading.Lock() builds a brand-new lock on every call, so this ``with``
            # never blocks another thread -- a shared lock would be needed for real mutual exclusion
            with threading.Lock():
                if self.memory_mode and _MEMORY_MODE:
                    # memory mode was requested and the MemorySource import succeeded
                    data_source = MemorySource(self.kd2_file)
                else:
                    if self.memory_mode and not _MEMORY_MODE:
                        logging.getLogger(__name__).error("Memory mode could not be enabled because puchikarui version is too old. Fallback to normal file DB mode")
                    data_source = self.kd2_file
                self._kd2_sqlite = KanjiDic2SQLite(data_source, auto_expand_path=self.auto_expand)
        elif not self.kd2_file or self.kd2_file == self.db_file:
            # no separate KanjiDic2 database: fall back to the main Jamdict database
            self._kd2_sqlite = self.jmdict
    return self._kd2_sqlite
@property
def jmdict_xml(self):
    """ JMdict XML data, parsed from ``jmd_xml_file`` on first access and cached afterwards

    :returns: the cached ``JMDictXML`` object, or ``None`` when no XML file is configured
    """
    # test against None: the XML wrapper defines __len__, so an empty (but already parsed)
    # dictionary is falsy and would otherwise be re-parsed on every access
    if self._jmd_xml is None and self.jmd_xml_file:
        # NOTE(review): a new Lock per call cannot serialize concurrent callers
        with threading.Lock():
            getLogger().info("Loading JMDict from XML file at {}".format(self.jmd_xml_file))
            self._jmd_xml = JMDictXML.from_file(self.jmd_xml_file)
            getLogger().info("Loaded JMdict entries: {}".format(len(self._jmd_xml)))
    return self._jmd_xml

@property
def krad(self):
    """ Break a kanji down to writing components

    >>> jam = Jamdict()
    >>> print(jam.krad['雲'])
    ['一', '雨', '二', '厶']
    """
    if not self.__krad_map:
        with threading.Lock():
            self.__krad_map = KRad()
    return self.__krad_map.krad

@property
def radk(self):
    """ Find all kanji with a writing component

    >>> jam = Jamdict()
    >>> print(jam.radk['鼎'])
    {'鼏', '鼒', '鼐', '鼎', '鼑'}
    """
    if not self.__krad_map:
        with threading.Lock():
            self.__krad_map = KRad()
    return self.__krad_map.radk

@property
def kd2_xml(self):
    """ KanjiDic2 XML data, parsed from ``kd2_xml_file`` on first access and cached afterwards

    :returns: the cached ``KanjiDic2XML`` object, or ``None`` when no XML file is configured
    """
    # None check (not truthiness) so that an empty parsed file is not loaded again
    if self._kd2_xml is None and self.kd2_xml_file:
        with threading.Lock():
            getLogger().info("Loading KanjiDic2 from XML file at {}".format(self.kd2_xml_file))
            self._kd2_xml = KanjiDic2XML.from_file(self.kd2_xml_file)
            getLogger().info("Loaded KanjiDic2 entries: {}".format(len(self._kd2_xml)))
    return self._kd2_xml

@property
def jmne_xml(self):
    """ JMnedict XML data, parsed from ``jmnedict_xml_file`` on first access and cached afterwards

    :returns: the cached ``JMNEDictXML`` object, or ``None`` when no XML file is configured
    """
    # None check (not truthiness) so that an empty parsed file is not loaded again
    if self._jmne_xml is None and self.jmnedict_xml_file:
        with threading.Lock():
            getLogger().info("Loading JMnedict from XML file at {}".format(self.jmnedict_xml_file))
            self._jmne_xml = JMNEDictXML.from_file(self.jmnedict_xml_file)
            getLogger().info("Loaded JMnedict entries: {}".format(len(self._jmne_xml)))
    return self._jmne_xml

def has_kd2(self) -> bool:
    """ Check if any KanjiDic2 source (main DB, KanjiDic2 DB, or KanjiDic2 XML file) is configured """
    return self.db_file is not None or self.kd2_file is not None or self.kd2_xml_file is not None
def import_data(self):
    """ Import JMDict, KanjiDic2 and JMnedict data from XML to SQLite

    The main database file is created when it does not exist yet. Data whose target file equals
    ``db_file`` is written through one shared context, which is committed once at the end.
    Data with a different target database is written through a separate context of its own.
    """
    if self.db_file and not os.path.exists(self.db_file):
        Path(self.db_file).touch()
    ctx = self.__make_db_ctx()
    ctx.buckmode()
    ctx.auto_commit = False
    if self.jmdict and self.jmdict_xml:
        getLogger().info("Importing JMDict data")
        self.jmdict.insert_entries(self.jmdict_xml, ctx=ctx)
    # import KanjiDic2
    if self.kd2_xml is not None and os.path.isfile(self.kd2_xml_file):
        getLogger().info("Importing KanjiDic2 data")
        if self.jmdict is not None and self.kd2_file == self.db_file:
            self.jmdict.insert_chars(self.kd2_xml, ctx=ctx)
        elif self.kd2 is not None:
            getLogger().warning(f"Building Kanjidic2 DB using a different DB context {self.kd2_file} vs {self.db_file}")
            with self.kd2.ctx() as kd_ctx:
                self.kd2.insert_chars(self.kd2_xml, ctx=kd_ctx)
        else:
            getLogger().warning("Kanjidic2 DB path could not be found")
    else:
        # diagnostics go to the logger rather than being printed to stdout
        getLogger().debug("kd2_xml: %s", self.kd2_xml)
        getLogger().debug("kd2_xml_file: %s", self.kd2_xml_file)
        getLogger().warning("KanjiDic2 XML data is not available - skipped!")
    # import JMNEdict
    if self.jmne_xml is not None and os.path.isfile(self.jmnedict_xml_file):
        getLogger().info("Importing JMNEdict data")
        if self.jmdict is not None and self.jmnedict_file == self.db_file:
            self.jmnedict.insert_name_entities(self.jmne_xml, ctx=ctx)
        elif self.jmnedict is not None:
            # the attribute is ``jmnedict_file``; ``jmne_file`` does not exist and raised AttributeError here
            getLogger().warning(f"Building JMNEdict DB using a different DB context {self.jmnedict_file} vs {self.db_file}")
            with self.jmnedict.ctx() as ne_ctx:
                self.jmnedict.insert_name_entities(self.jmne_xml, ctx=ne_ctx)
        else:
            getLogger().warning("JMNE DB path could not be found")
    else:
        getLogger().warning("JMNEdict XML data is not available - skipped!")
    # buckmode_off is looked up dynamically because the context object may not provide it
    _buckmode_off = getattr(ctx, "buckmode_off", None)
    if _buckmode_off is not None:
        _buckmode_off()
    ctx.commit()
:param strict_lookup: only look up the Kanji characters in query (i.e. discard characters from variants) :type strict_lookup: bool :param: lookup_chars: set lookup_chars to False to disable character lookup :type lookup_chars: bool :param pos: Filter words by part-of-speeches :type pos: list of strings :param ctx: database access context, can be reused for better performance. Normally users do not have to touch this and database connections will be reused by default. :param lookup_ne: set lookup_ne to False to disable name-entities lookup :type lookup_ne: bool :returns: Return a LookupResult object. :rtype: :class:`jamdict.util.LookupResult` >>> # match any word that starts with "食べ" and ends with "る" (anything from between is fine) >>> jam = Jamdict() >>> results = jam.lookup('食べ%る') """ if not self.is_available(): raise LookupError("There is no backend data available") elif (not query or query == "%") and not pos: raise ValueError("Query and POS filter cannot be both empty") if ctx is None: ctx = self.__make_db_ctx() entries = [] chars = [] names = [] if self.jmdict is not None: entries = self.jmdict.search(query, pos=pos, ctx=ctx) elif self.jmdict_xml: entries = self.jmdict_xml.lookup(query) if lookup_chars and self.has_kd2(): # lookup each character in query and kanji readings of each found entries chars_to_search = OrderedDict({c: c for c in query}) if not strict_lookup and entries: # auto add characters from entries for e in entries: for k in e.kanji_forms: for c in k.text: if c not in HIRAGANA and c not in KATAKANA: chars_to_search[c] = c for c in chars_to_search: result = self.get_char(c, ctx=ctx) if result is not None: chars.append(result) # lookup name-entities if lookup_ne and self.has_jmne(ctx=ctx): names = self.jmnedict.search_ne(query, ctx=ctx) # finish return LookupResult(entries, chars, names) def lookup_iter(self, query, strict_lookup=False, lookup_chars=True, lookup_ne=True, ctx=None, pos=None, **kwargs) -> LookupResult: """ Search for words, 
characters, and named entities iteratively. An :class:`IterLookupResult` object will be returned instead of the normal ``LookupResult``. ``res.entries``, ``res.chars``, ``res.names`` are iterators instead of lists and each of them can only be looped through once. Users have to store the results manually. >>> res = jam.lookup_iter("花見") >>> for word in res.entries: ... print(word) # do something with the word >>> for c in res.chars: ... print(c) >>> for name in res.names: ... print(name) Keyword arguments: :param query: Text to query, may contain wildcard characters. Use `?` for 1 exact character and `%` to match any number of characters. :param strict_lookup: only look up the Kanji characters in query (i.e. discard characters from variants) :type strict_lookup: bool :param lookup_chars: set lookup_chars to False to disable character lookup :type lookup_chars: bool :param pos: Filter words by part-of-speeches :type pos: list of strings :param ctx: database access context, can be reused for better performance. Normally users do not have to touch this and database connections will be reused by default. :param lookup_ne: set lookup_ne to False to disable name-entities lookup :type lookup_ne: bool :returns: Return an IterLookupResult object. 
class JMDictXML(object):
    """ JMDict API for looking up information in XML

    :param entries: parsed entries; ``idseq``, ``kana_forms`` and ``kanji_forms`` are read from each one
    """

    def __init__(self, entries):
        self.entries = entries
        self._seqmap = {}  # entryID - entryObj map
        self._textmap = dd(set)  # kana/kanji text - set of entries map
        # compile map
        for entry in self.entries:
            self._seqmap[entry.idseq] = entry
            for kn in entry.kana_forms:
                self._textmap[kn.text].add(entry)
            for kj in entry.kanji_forms:
                self._textmap[kj.text].add(entry)

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        return self.entries[idx]

    def _find_by_idseq(self, idseq):
        """ Return the entry stored under ``idseq`` (tried as given, then as its int/str counterpart) or None """
        candidates = [idseq]
        if isinstance(idseq, str):
            if idseq.isdigit():
                candidates.append(int(idseq))
        else:
            candidates.append(str(idseq))
        for key in candidates:
            if key in self._seqmap:
                return self._seqmap[key]
        return None

    def lookup(self, a_query) -> "Sequence[JMDEntry]":
        """ Find entries by exact kana/kanji text or by entry ID

        :param a_query: a kana/kanji form, an ``id#<idseq>`` string, or a non-string idseq value
        :returns: a tuple of matching entries (empty when nothing is found)
        """
        if not isinstance(a_query, str):
            # a raw (e.g. integer) idseq: there is no text to match, go straight to the ID map
            found = self._find_by_idseq(a_query)
        elif a_query in self._textmap:
            return tuple(self._textmap[a_query])
        elif a_query.startswith('id#'):
            found = self._find_by_idseq(a_query[3:])
        else:
            found = None
        # found nothing -> empty tuple
        return (found,) if found is not None else ()

    @classmethod
    def from_file(cls, filename):
        """ Parse the XML file at ``filename`` (``~`` is expanded) and wrap the entries in ``cls`` """
        parser = JMDictXMLParser()
        # classmethod so that subclasses (JMNEDictXML) get an instance of their own type
        return cls(parser.parse_file(os.path.abspath(os.path.expanduser(filename))))


class JMNEDictXML(JMDictXML):
    pass


class KanjiDic2XML(object):
    """ KanjiDic2 API for looking up characters in XML

    :param kd2: parsed KanjiDic2 data; iterated for characters, each one indexed by its ``literal``
    """

    def __init__(self, kd2):
        self.kd2 = kd2
        self.char_map = {}
        for char in self.kd2:
            if char.literal in self.char_map:
                getLogger().warning("Duplicate character entry: {}".format(char.literal))
            # the last occurrence of a duplicated literal wins
            self.char_map[char.literal] = char

    def __len__(self):
        return len(self.kd2)

    def __getitem__(self, idx):
        return self.kd2[idx]

    def lookup(self, char):
        """ Return the character object whose literal is ``char``, or None when it is unknown """
        return self.char_map.get(char)

    @classmethod
    def from_file(cls, filename):
        """ Parse the XML file at ``filename`` (``~`` is expanded) and wrap the result in ``cls`` """
        parser = Kanjidic2XMLParser()
        # expand the path the same way JMDictXML.from_file does
        return cls(parser.parse_file(os.path.abspath(os.path.expanduser(filename))))
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

########################################################################

import json
from jamdict import Jamdict

########################################################################

# Create an instance of Jamdict
jam = Jamdict()
print("Jamdict DB file: {}".format(jam.db_file))
if not jam.ready:
    print("""Jamdict DB is not available. Database can be installed via PyPI:

    pip install jamdict-data

Or downloaded from: https://jamdict.readthedocs.io/en/latest/install.html
To create a config file, run:

    python3 -m jamdict config

Program aborted.""")
    # same effect as exit(), without relying on the helper injected by the ``site`` module
    raise SystemExit

# Lookup by kana
result = jam.lookup('おかえし')
for entry in result.entries:
    print(entry)

# Lookup by kanji
print("-----------------")
result = jam.lookup('御土産')
for entry in result.entries:
    print(entry)

# Lookup a name
# a name entity is also a jamdict.jmdict.JMDEntry object
# except that the senses is a list of Translation objects instead of Sense objects
print("-----------------")
if jam.has_jmne():
    result = jam.lookup('鈴木')
    for name in result.names:
        print(name)

    # Use wildcard matching
    # Find all names ends with -jida
    print("-----------------")
    result = jam.lookup('%じだ')
    for name in result.names:
        print(name)

# ------------------------------------------------------------------------------
# lookup entry by idseq
print("---------------------")
otenki = jam.lookup('id#1002470').entries[0]
# extract all kana forms
kana_forms = ' '.join([x.text for x in otenki.kana_forms])
# extract all kanji forms
kanji_forms = ' '.join([x.text for x in otenki.kanji_forms])
print("Entry #{id}: Kanji: {kj} - Kana: {kn}".format(id=otenki.idseq, kj=kanji_forms, kn=kana_forms))

# extract all sense glosses
for idx, sense in enumerate(otenki):
    print("{i}. {s}".format(i=idx, s=sense))

# Look up radical & writing components of kanji characters
# 1. Lookup kanji's components
print("---------------------")
result = jam.lookup('筋斗雲')
for c in result.chars:
    meanings = ', '.join(c.meanings())
    # has components
    print(f"{c}: {meanings}")
    print(f"    Radical: {c.radical}")
    print(f"    Components: {c.components}")

# 2. Lookup kanjis by component
print("---------------------")
chars = jam.radk['鼎']  # this returns a list of strings (each string is the literal of a character)
result = jam.lookup(''.join(chars))
for c in result.chars:
    meanings = ', '.join(c.meanings())
    # has components
    print(f"{c}: {meanings}")
    print(f"    Radical: {c.radical}")
    print(f"    Components: {c.components}")

# using JSON
print("---------------------")
result = jam.lookup('こうしえん')
print(result.text(separator='\n'))
print("---------------------")
# get a dict structure to produce a JSON string
# (to_dict() replaces the deprecated to_json(), which only warns and forwards to it)
result_dict = result.to_dict()
json_string = json.dumps(result_dict, ensure_ascii=False, indent=2)
print(json_string)
def jsonp(func):
    """ Decorator that serialises a view's return value as JSON, or as JSONP when a ``callback``
    query argument is present.

    The callback name comes straight from the request and is echoed into a JavaScript response,
    so it is only accepted when it is a dotted chain of plain identifiers; anything else is
    answered with HTTP 400 instead of being reflected back to the client.

    :param func: view function returning a JSON-serialisable object
    :returns: the wrapped view, producing a ``Response``
    """
    import re  # local import: only needed by this decorator
    # e.g. ``cb``, ``jQuery123_456``, ``app.handlers.onData``
    safe_callback = re.compile(r'[A-Za-z_$][0-9A-Za-z_$]*(?:\.[A-Za-z_$][0-9A-Za-z_$]*)*')

    @wraps(func)
    def decorated_function(*args, **kwargs):
        data = func(*args, **kwargs)
        callback = request.args.get('callback', False)
        if callback:
            if not safe_callback.fullmatch(callback):
                content = json.dumps({'error': 'Invalid callback name'})
                return Response(content, status=400, mimetype="application/json")
            content = "{}({})".format(callback, json.dumps(data))
            return Response(content, mimetype="application/javascript")
        else:
            content = json.dumps(data)
            return Response(content, mimetype="application/json")
    return decorated_function
getLogger().info("Query = {}".format(query)) results = jmd.lookup(query, strict_lookup=strict) return results.to_json() @app.route('/jamdol/', methods=['GET']) def index(): return Response('jamdol {jd} - jamdol-flask/Flask-{fv}'.format(jd=__version__, fv=flask.__version__), mimetype='text/html') @app.route('/jamdol/version', methods=['GET']) @jsonp def version(): return {'product': 'jamdol', 'version': __version__, 'server': 'jamdol-flask/Flask-{}'.format(flask.__version__)} # --------------------------------------------------------------------- # Views # --------------------------------------------------------------------- if __name__ == '__main__': app.run() ================================================ FILE: jmd ================================================ #!/bin/bash export JAMDICT_HOME=~/local/jamdict cd ${JAMDICT_HOME} python3 -m jamdict.tools lookup "$@" ================================================ FILE: logging.json ================================================ { "version": 1, "disable_existing_loggers": false, "formatters": { "simple": { "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" } }, "handlers": { "console": { "class": "logging.StreamHandler", "level": "DEBUG", "formatter": "simple", "stream": "ext://sys.stdout" }, "file_handler_important": { "class": "logging.handlers.RotatingFileHandler", "level": "WARNING", "formatter": "simple", "filename": "logs/logging_important.log", "maxBytes": 1000000, "backupCount": 20, "encoding": "utf8" }, "file_handler_verbose": { "class": "logging.handlers.RotatingFileHandler", "level": "DEBUG", "formatter": "simple", "filename": "logs/logging_details.log", "maxBytes": 1000000, "backupCount": 20, "encoding": "utf8" } }, "loggers": { "__main__": { "level": "INFO", "handlers": ["file_handler_verbose"], "propagate": "no" }, "jamdol-flask": { "level": "INFO" } , "jamdict.util": { "level": "INFO" }, "jamdict.jmdict_sqlite": { "level": "ERROR" }, "chirptext.dekomecab": { "level": "ERROR" } }, 
"root": { "level": "WARNING", "handlers": ["console", "file_handler_important"] } } ================================================ FILE: release.sh ================================================ #!/bin/bash # pandoc --from=markdown --to=rst README.md -o README.rst python3 setup.py sdist ================================================ FILE: requirements.txt ================================================ chirptext >= 0.1, <= 0.2 puchikarui >= 0.1, < 0.3 ================================================ FILE: run ================================================ #!/bin/bash export FLASK_APP=jamdol-flask.py flask run --port 5002 ================================================ FILE: setup.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- ''' Setup script for jamdict Latest version can be found at https://github.com/neocl/jamdict :copyright: (c) 2012 Le Tuan Anh :license: MIT, see LICENSE for more details. ''' import io from setuptools import setup def read(*filenames, **kwargs): ''' Read contents of multiple files and join them together ''' encoding = kwargs.get('encoding', 'utf-8') sep = kwargs.get('sep', '\n') buf = [] for filename in filenames: with io.open(filename, encoding=encoding) as f: buf.append(f.read()) return sep.join(buf) # readme_file = 'README.rst' if os.path.isfile('README.rst') else 'README.md' readme_file = 'README.md' long_description = read(readme_file) pkg_info = {} exec(read('jamdict/__version__.py'), pkg_info) with open('requirements.txt', 'r') as infile: requirements = infile.read().splitlines() print(requirements) setup( name='jamdict', # package file name (-version.tar.gz) version=pkg_info['__version__'], url=pkg_info['__url__'], project_urls={ "Bug Tracker": "https://github.com/neocl/jamdict/issues", "Source Code": "https://github.com/neocl/jamdict/" }, keywords=['dictionary', 'japanese', 'kanji', 'japanese-language', 'jmdict', 'japanese-study', 'kanjidic2', 'japanese-dictionary', 
'jamdict'], license=pkg_info['__license__'], author=pkg_info['__author__'], tests_require=requirements, install_requires=requirements, python_requires=">=3.6", author_email=pkg_info['__email__'], description=pkg_info['__description__'], long_description=long_description, long_description_content_type='text/markdown', packages=['jamdict'], package_data={'jamdict': ['data/*.sql', 'data/*.json', 'data/*.gz']}, include_package_data=True, platforms='any', test_suite='test', # Reference: https://pypi.python.org/pypi?%3Aaction=list_classifiers classifiers=['Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Development Status :: {}'.format(pkg_info['__status__']), 'Natural Language :: Japanese', 'Natural Language :: English', 'Environment :: Plugins', 'Intended Audience :: Developers', 'License :: OSI Approved :: {}'.format(pkg_info['__license__']), 'Operating System :: OS Independent', 'Topic :: Database', 'Topic :: Text Processing :: Linguistic', 'Topic :: Software Development :: Libraries :: Python Modules'] ) ================================================ FILE: test/__init__.py ================================================ # -*- coding: utf-8 -*- """ Jamdict Test Scripts """ # This code is a part of jamdict library: https://github.com/neocl/jamdict # :copyright: (c) 2016 Le Tuan Anh # :license: MIT, see LICENSE for more details. 
import os from chirptext.cli import setup_logging TEST_DIR = os.path.abspath(os.path.dirname(__file__)) TEST_DATA = os.path.join(TEST_DIR, 'data') setup_logging(os.path.join(TEST_DIR, 'logging.json'), os.path.join(TEST_DIR, 'logs')) ================================================ FILE: test/data/JMdict_mini.xml ================================================ ]> 1000000 くりかえし &n; repetition mark in katakana 1000010 くりかえし &n; voiced repetition mark in katakana 1000020 くりかえし &n; repetition mark in hiragana 1000030 くりかえし &n; voiced repetition mark in hiragana 1000040 おなじ おなじく &n; ditto mark 1000050 どうじょう &n; "as above" mark 1000060 くりかえし おなじ おなじく のま どうのじてん &n; repetition of kanji (sometimes voiced) 1000080 漢数字ゼロ かんすうじゼロ &n; ○・まる・1 漢数字 "kanji" zero 1000090 まる &n; 丸・まる・1 circle (sometimes used for zero) 二重丸・にじゅうまる ×・ばつ・1 'correct' (when marking) 〇〇・まるまる・1 symbol used as a placeholder (either because a number of other words could be used in that position, or because of censorship) 句点 period full stop 半濁点 maru mark semivoiced sound p-sound 1000100 ABC順 エービーシーじゅん &n; alphabetical order 1000110 CDプレーヤー spec1 CDプレイヤー シーディープレーヤー CDプレーヤー spec1 シーディープレイヤー CDプレイヤー &n; CD player 1000130 N響 エヌきょう &n; &abbr; NHK Symphony Orchestra 1000140 Oバック オーバック &n; O-back skirt with peek-a-boo hole in rump 1000150 RS232ケーブル アールエスにさんにケーブル &n; rs232 cable 1000160 Tシャツ ティーシャツ spec1 &n; T-shirt 1000170 Tバック spec1 ティーバック spec1 &n; T-back bikini thong 1000200 あうんの呼吸 阿吽の呼吸 あうんのこきゅう &exp; &n; the harmonizing, mentally and physically, of two parties engaged in an activity singing from the same hymn-sheet dancing to the same beat 1000210 あおば &n; やまびこ &obs; (former) regular (stops at every station) Tohoku-line Shinkansen 1000220 明白 ichi1 news1 nf10 めいはく ichi1 news1 nf10 &adj-na; obvious clear plain evident apparent explicit overt 1000225 明白 &ateji; 偸閑 &ateji; 白地 &ateji; あからさま &adj-na; &adj-no; &uk; plain frank candid open direct straightforward unabashed blatant flagrant 1000230 明かん あかん アカン &exp; &uk; 
&ksb; useless no good hopeless 1000260 悪どい &iK; あくどい &adj-i; &uk; gaudy showy excessive crooked vicious 1000280 論う あげつらう &v5u; &vt; &uk; to discuss to find fault with to criticize to criticise 1000290 あさひ &n; Jouetsu line express Shinkansen 1000300 あしらう &v5u; &vt; to arrange to treat to handle to deal with 1000310 馬酔木 あせび あしび あせぼ あせぶ アセビ &n; &uk; Japanese andromeda (Pieris japonica) lily-of-the-valley 1000320 彼処 ichi1 彼所 あそこ ichi1 あすこ かしこ あしこ &ok; あこ &ok; &pn; &adj-no; 何処 此処 其処 &uk; there (place physically distant from both speaker and listener) over there that place yonder あそこ あすこ &n; &col; genitals あれほど that far (something psychologically distant from both speaker and listener) that much that point 1000360 あっさり ichi1 &adv-to; &vs; &on-mim; easily readily quickly &on-mim; lightly (flavored food, applied makeup) 1000390 あっという間に あっと言う間に あっとゆう間に あっというまに あっという間に あっと言う間に あっとゆうまに あっと言う間に あっとゆう間に &exp; just like that in the twinkling of an eye in the blink of an eye in the time it takes to say "ah!" 1000400 あっぷあっぷ &adv; &n; &vs; floundering while nearly drowning suffering 1000410 あどけない &adj-i; innocent cherubic 1000420 彼の spec1 あの spec1 かの &adj-pn; 何の・どの 此の 其の・1 彼・あれ・1 &uk; that (someone or something distant from both speaker and listener, or situation unfamiliar to both speaker and listener) 1000430 あのう spec1 あの spec1 あのー say well errr ... 1000440 あの人 彼の人 あのひと &pn; &adj-no; he she that person &arch; you 1000450 あの方 spec1 彼の方 あのかた spec1 &pn; &adj-no; &hon; that gentleman (lady) he she 1000460 溢れる あぶれる &v1; &vi; &uk; to fail (in getting a job) to miss out (at fishing, hunting, etc.) 
&uk; to be left out to be crowded out 1000470 あべこべ ichi1 &adj-na; &n; &on-mim; contrary opposite inverse 1000480 阿呆陀羅 あほんだら あほだら &n; &uk; &ksb; fool oaf airhead 1000490 甘子 天魚 雨子 あまご アマゴ &n; 皐月鱒 &uk; land-locked variety of red-spotted masu trout (Oncorhynchus masou ishikawae) amago 1000500 あやす &v5s; &vt; to cuddle to comfort to rock to soothe to dandle to humor to humour to lull 1000510 あやふや ichi1 &adj-na; &n; &on-mim; uncertain vague ambiguous 1000520 あら ichi1 &fem; oh ah 1000525 𩺊 あら アラ &n; &uk; saw-edged perch (Niphon spinosus) 1000580 ichi1 あれ ichi1 &ok; &n; 何れ・1 此れ・1 其れ・1 &uk; that (indicating something distant from both speaker and listener (in space, time or psychologically), or something understood without naming it directly) that person (used to refer to one's equals or inferiors) &arch; over there あれ &col; down there (i.e. one's genitals) あれ &col; period menses &uk; hey (expression of surprise, suspicion, etc.) huh? eh? &n; &uk; that (something mentioned before which is distant psychologically or in terms of time) 1000590 あんな ichi1 &adj-pn; こんな そんな どんな・1 such (about something or someone distant from both speaker and listener, or about a situation unfamiliar to both speaker and listener) so that sort of 1000600 いい加減にしなさい いいかげんにしなさい &exp; shape up! act properly! 1000610 いい年をして いいとしをして &exp; いい年して (in spite of) being old enough to know better 1000620 否々 ichi1 否否 いやいや ichi1 いえいえ 嫌々・いやいや・3 &uk; no! no no! 
no, not at all 1000630 如何わしい いかがわしい &adj-i; &uk; suspicious dubious unreliable &uk; indecent unseemly 1000640 いかす イカす &v5s; &vi; to be smart to be cool to be sharp to be stylish 1000650 いかなる場合でも いかなるばあいでも &exp; in any case whatever the case may be 1000660 如何にも ichi1 いかにも ichi1 &adv; &uk; indeed really phrase indicating agreement 1000710 幾つも いくつも &adj-no; &uk; many great number of 1000730 行けない いけない ichi1 &exp; &uk; wrong not good of no use &uk; hopeless past hope &uk; must not do 1000740 いごっそう &n; &tsb; stubborn person strong-minded person obstinate person 1000750 いざ &adv; now come (now) well 1000760 いざこざ ichi1 &n; trouble quarrel difficulties complication tangle 1000770 いじいじ &adv; &adv-to; &vs; &on-mim; hesitantly timidly diffidently 1000780 いじける &v1; &vi; to grow timid (e.g. from an inferiority complex) to lose one's nerve to become perverse to become contrary 1000790 いじましい &adj-i; piddling paltry 1000800 いじらしい &adj-i; innocent lovable sweet pitiful pathetic 1000810 いじり回す 弄り回す 弄りまわす いじりまわす &v5s; to tinker with to fumble with to twist up 1000820 いそいそ ichi1 &adv; &n; &vs; &adv-to; &on-mim; cheerfully excitedly 1000830 いちゃいちゃ &adv; &n; &vs; flirt make out 1000840 いちゃつく &v5k; &vi; to flirt with to dally 1000860 何時もより いつもより &adv; &uk; more than usual 1000870 いとも簡単に いともかんたんに &adv; very easily 1000880 鯔背 いなせ &adj-na; &uk; gallant dashing smart 1000885 嘶く いななく &v5k; &vi; to neigh 1000890 嘶き いななき &n; &uk; neigh whinny bray 1000900 いびる &v5r; &vt; to pick on to tease 1000910 嫌に 厭に いやに &adv; &uk; awfully terribly 1000920 いらっしゃい spec1 いらしゃい &ik; &n; いらっしゃる・1 &hon; used as a polite imperative come go stay いらっしゃいませ welcome! 1000930 いらっしゃいませ ichi1 いらしゃいませ &ik; いっらしゃいませ &ik; &exp; welcome (in shops, etc.) 
1000940 いらっしゃる ichi1 &v5aru; &vi; &hon; sometimes erroneously written 居らっしゃる to come to go to be (somewhere) &v5aru; &aux-v; &hon; after a -te form, or the particle "de" is (doing) are (doing) 1000960 うじうじ &adv; &n; &vs; &on-mim; irresolute hesitant 1000970 うじゃうじゃ &adv; &n; &vs; &on-mim; in swarms in clusters &on-mim; tediously slowly 1000980 うずうず ichi1 &adv; &adv-to; &vs; &on-mim; sorely tempted itching to do something eager 1000990 うぞうぞ &adv; &on-mim; irrepressibly aroused (esp. sexually) stimulated 1001000 うだうだ &adv; &on-mim; idle long-winded and meaningless 1001010 うっかり ichi1 &adv; &n; &vs; &on-mim; carelessly thoughtlessly inadvertently 1001020 うっとりさせる &v1; うっとり うっとりする to enchant to enrapture to enthrall (enthral) to charm 1001030 うとうと ichi1 うとっと &adv; &n; &vs; &adv-to; &on-mim; falling in a doze dozing 1001040 うねうね &adv; &n; &vs; &on-mim; winding meandering zigzag sinuous tortuous 1001050 畝ねり うねり ichi1 &n; &uk; wave motion undulation winding heaving sea swell roller 1001060 うろうろ ichi1 うそうそ &adv; &adv-to; &vs; 彷徨く・1 &on-mim; loiteringly aimless wandering restlessly 1001070 狼狽える うろたえる &v1; &vi; &uk; to be flustered to lose one's presence of mind 1001090 うん spec1 yes yeah uh huh 1001100 うんこ spec1 &n; &vs; &col; &chn; poop faeces feces 1001110 うんざり ichi1 &adv; &n; &vs; &on-mim; tedious boring being fed up with 1001120 うんち spec1 &n; &vs; &col; &chn; poop faeces feces 1001130 海鷂魚 &oK; &oK; えい &gikun; エイ &n; &uk; ray (fish) stingray 1001140 ええ ichi1 えー &ik; yes that is correct right um errr huh? grrr gah Must I? &adj-f; 良い・1 &ksb; good 1001150 えっと spec1 えーと えーっと ええと ええっと let me see well errr ... uhh ... 1001160 えげつない &adj-i; dirty vulgar nasty 1001170 斉魚 えつ エツ &n; &uk; Japanese grenadier anchovy (Coilia nasus) 1001180 お出でになる 御出でになる おいでになる &exp; &v5r; お出で・1 &uk; &hon; to be to come to go 1001190 おいでやす &exp; &kyb; welcome (in shops, etc.) 1001200 おおい おーい おい hey! oi! ahoy! 
おい &pn; &ksb; I me 1001220 おけさ &n; style of Japanese folk song associated with Niigata Prefecture 1001230 おけさ節 おけさぶし &n; style of Japanese folk song associated with Niigata Prefecture 1001240 おこしやす &exp; &kyb; welcome (in shops, etc.) 1001250 おざなりになって &exp; to say commonplace things 1001260 おしっこ spec1 &n; &vs; &col; &chn; wee-wee pee-pee number one 1001280 おしゃぶり &n; teething ring pacifier dummy 1001290 悍ましい おぞましい &adj-i; &uk; disgusting repulsive 1001300 おたおた &adv; &n; &vs; &on-mim; (shocked) speechless flustered flurried &exp; help 1001310 おちんちん &n; ちんちん・3 &chn; penis 1001330 おっかない &adj-i; &col; &ktb; frightening scary &col; &ktb; extreme exaggerated huge 1001340 落っこちる おっこちる &v1; &vi; 落ちる・1 &ktb; to fall down to drop 1001350 おっさん spec1 &n; &abbr; &derog; &fam; middle-aged man &abbr; &fam; Buddhist priest 1001360 おっちょこちょい &adj-na; &n; careless person scatterbrain birdbrain 1001370 おっぱい spec1 オッパイ &n; &chn; breasts boobies tits &chn; breast milk 1001390 御田 おでん spec1 &n; &food; &uk; oden dish of various ingredients, e.g. egg, daikon, potato, chikuwa, konnyaku, etc. stewed in soy-flavored dashi 1001400 おどおど ichi1 &adv; &adv-to; &vs; &on-mim; coweringly hesitantly 1001410 お鍋 御鍋 おなべ &n; &pol; pot &arch; typical name for a female servant in the Edo-period working at night &uk; &sl; often derog. female with symptoms of gender identity disorder (e.g. 
a transvestite) 1001420 ichi1 おなら ichi1 &n; &uk; usually 屁 is へ, and おなら is in kana wind gas fart something worthless something not worth considering 1001430 おばこ &n; &thb; young girl daughter unmarried girl 1001440 おべっか &n; flattery 1001450 おべんちゃら &n; (excessive) flattery smarminess fawning 1001480 お負けに 御負けに おまけに &conj; &exp; &uk; to make matters worse besides what's more in addition on top of that 1001490 お呪い 御呪い おまじない &n; &uk; good luck charm &exp; uttered when using magic abracadabra presto 1001500 おまんこ おめこ &n; &vulg; &sl; vulva vagina female genitalia &vs; &vulg; &sl; to have sexual intercourse to screw 1001510 御襁褓気触れ お襁褓気触れ オムツ気触れ おむつかぶれ 御襁褓気触れ お襁褓気触れ オムツかぶれ オムツ気触れ &n; &uk; diaper rash nappy rash 1001520 お粧し 御粧し おめかし &n; &vs; &uk; dressing up 1001540 お目出度うご座います 御目出度う御座います おめでとうございます spec1 &exp; &uk; congratulations 1001560 おや ichi1 &conj; &n; oh! oh? my! 1001570 おやおや my goodness! oh my! oh dear! 1001580 おろおろ ichi1 オロオロ &adv; &adv-to; &vs; &on-mim; nervous flustered in a dither all shook up 1001600 おろちょろ &n; &vs; lounging about 1001610 おデブさん &n; デブ &joc; chubby person 1001620 お握り ichi1 御握り おにぎり ichi1 &n; 握り飯 &uk; &pol; rice ball (often triangular, sometimes with a filling and wrapped in nori) 1001640 お蔭 御蔭 お陰 御陰 おかげ &n; &uk; grace (of God) benevolence (of Buddha) blessing &uk; assistance help aid お蔭で &uk; effects influence 1001650 お蔭様 お蔭さま 御蔭様 おかげさま &n; &pol; &uk; (somebody's) backing assistance thanks to (somebody) 1001660 お下げ 御下げ おさげ &n; &uk; wearing one's hair in braids 1001670 お化け news1 nf23 御化け ichi1 おばけ ichi1 news1 nf23 &n; goblin apparition monster ghost 1001680 お化け屋敷 おばけ屋敷 御化け屋敷 おばけやしき &n; haunted house 1001690 お嫁さん およめさん &n; bride 1001710 お菓子 ichi1 御菓子 おかし ichi1 &n; confections sweets candy 1001720 お願いします spec1 御願いします おねがいします spec1 &exp; &hum; please 1001730 お願いいたします お願い致します おねがいいたします &exp; お願いします &hum; please 1001740 お帰りなさい ichi1 御帰りなさい おかえりなさい ichi1 welcome home 1001750 お気の毒 おきのどく &adj-na; 気の毒 pitiful pity 1001760 お客さん 御客さん おきゃくさん &n; &hon; guest 
visitor customer 1001770 お客さま お客様 御客様 おきゃくさま &n; &hon; guest visitor customer 1001780 お休みなさい ichi1 御休みなさい おやすみなさい ichi1 &exp; &uk; good night 1001790 お宮 おみや &n; Shinto shrine 1001800 お宮参り 御宮参り おみやまいり &n; 宮参り shrine visit 1001810 お供 御供 お伴 御伴 おとも &n; &vs; 供・とも attendant companion 1001820 お金 ichi1 news1 nf04 御金 おかね ichi1 news1 nf04 &n; 金・かね・1 &pol; money 1001830 お兄さん ichi1 御兄さん おにいさん ichi1 &n; 兄さん &hon; older brother elder brother (vocative) young man buddy fella laddie 1001840 お兄ちゃん 御兄ちゃん おにいちゃん &n; &fam; familiar form of "older brother" &fam; form of address for young adult male mister 1001850 お結び 御結び おむすび &n; rice ball 1001860 お見えになる 御見えになる おみえになる &exp; &v5r; &hon; to arrive 1001870 お見舞い お見舞 御見舞い 御見舞 おみまい &n; 見舞い calling on someone who is ill enquiry inquiry 1001880 お古 御古 おふる &n; used article (esp. clothes) 1001890 お好み焼き お好み焼 御好み焼き 御好み焼 おこのみやき &n; okonomiyaki savoury pancake containing meat or seafood and vegetables 1001900 お構いなく 御構いなく おかまいなく &exp; &pol; please don't fuss over me 1001910 お絞り ichi1 御絞り おしぼり ichi1 &n; &uk; wet towel (supplied at table) hot, moistened hand towel 1001920 お札 御札 おさつ &n; bill note (currency) 1001930 お雑煮 御雑煮 おぞうに &n; 雑煮・ぞうに &food; &pol; soup containing rice cakes and vegetables (New Year's dish) 1001940 お三時 御三時 おさんじ &n; three-o'clock snack 1001950 お参り ichi1 news2 nf36 御参り おまいり ichi1 news2 nf36 &n; &vs; worship shrine visit 1001960 お産 news1 nf22 御産 おさん news1 nf22 &n; 産・さん・1 &pol; (giving) birth childbirth delivery confinement 1001970 お仕舞い ichi1 お終い 御仕舞い 御終い お仕舞 御仕舞 おしまい ichi1 &n; 仕舞い・1 &uk; &pol; the end closing being done for &exp; &uk; That's it That's enough That's all 1001980 お使い news2 nf36 御使い 御遣い お遣い お使 &io; おつかい news2 nf36 &n; 使い・つかい・1 &pol; errand mission going as envoy &pol; messenger bearer errand boy errand girl &pol; &hon; familiar spirit 1001990 お姉さん ichi1 お姐さん 御姉さん 御姐さん おねえさん ichi1 &n; 姉さん・1 &hon; usu. お姉さん elder sister 姉さん・2 (vocative) young lady 姉さん・3 usu. お姐さん miss (referring to a waitress, etc.) 姉さん・4 usu. 
お姐さん ma'am (used by geisha to refer to their superiors) older girl (no relation) 1002000 お子さん 御子さん おこさん &n; &hon; child 1002010 お子様 おこさま &n; &pol; child (someone else's) 1002020 お歯黒 御歯黒 鉄漿 おはぐろ かね 鉄漿 てっしょう 鉄漿 おはぐろ &n; tooth blackening tooth-blackening dye 1002030 お辞儀 ichi1 news2 nf39 御辞儀 おじぎ ichi1 news2 nf39 &n; &vs; 辞儀・1 &pol; bow bowing 1002040 お七夜 おしちや &n; name-giving ceremony 1002050 お邪魔します 御邪魔します おじゃまします &exp; excuse me for disturbing (interrupting) you greeting used when entering someone's home 1002060 お守り ichi1 news2 nf26 御守り お守 御守 おまもり ichi1 news2 nf26 &n; charm amulet 1002070 お手玉 御手玉 おてだま &n; beanbag beanbag juggling game &baseb; bobbling the ball 1002080 お手上げ news1 nf22 お手あげ 御手上げ おてあげ news1 nf22 &exp; &adj-no; all over given in given up hope bringing to knees throwing up one's hands 1002090 お手盛り news2 nf33 御手盛り おてもり news2 nf33 &n; making arbitrary decisions which benefit oneself self-approved plan 1002100 お手洗い ichi1 御手洗い おてあらい ichi1 &n; toilet restroom lavatory bathroom 1002110 お手伝いさん ichi1 おてつだいさん ichi1 &n; maid 1002120 お巡りさん ichi1 news2 nf40 御巡りさん おまわりさん ichi1 news2 nf40 &n; 巡査 &fam; police officer 1002130 お召し 御召し お召 御召 おめし &n; &hon; summoning calling riding wearing dressing clothing 御召縮緬 &abbr; (high-quality) silk crepe (fabric) 1002140 お召し物 御召し物 おめしもの &n; &pol; clothing 1002150 お勧め お薦め お奨め 御勧め 御薦め 御奨め おすすめ オススメ おススメ &n; &vs; 勧め &uk; recommendation advice suggestion encouragement 1002160 お上りさん おのぼりさん &n; countryside people (in town) visitor from the country out-of-towner country bumpkin (visiting the big city) yokel 1002170 お嬢さん ichi1 news1 nf21 御嬢さん おじょうさん ichi1 news1 nf21 &n; 嬢さん・じょうさん・1 &hon; daughter 嬢さん・じょうさん・2 young lady 1002180 お食い初め 御食い初め おくいぞめ &n; 食い初め weaning ceremony 1002190 お尻 御尻 おしり オシリ &n; &pol; bottom buttocks 1002200 お寝小 御寝小 おねしょ &n; &vs; &uk; &fem; &chn; bed-wetting 1002210 お新香 御新香 おしんこ &n; 新香 pickled vegetables pickles 1002220 お浸し 御浸し 御したし おひたし お浸し 御浸し おしたし &n; &food; boiled greens in bonito-flavoured soy sauce (vegetable side dish) 
1002230 お神酒 御神酒 おみき &n; 神酒 sacred wine or sake sake offered to the gods &joc; sake 1002240 お尋ね者 おたずねもの &n; wanted man person sought by the police 1002250 お世辞 ichi1 news2 nf32 御世辞 おせじ ichi1 news2 nf32 おせいじ &n; 世辞 flattery compliment 1002260 お世話になる 御世話になる おせわになる &exp; &v5r; 世話になる to receive favor (favour) to be much obliged to someone to be indebted to be grateful 1002270 お生憎様 お生憎さま 御生憎様 おあいにくさま &adj-na; &uk; that's too bad (often ironic) too bad for you! 1002280 お先に おさきに &adv; before previously ahead &exp; お先に失礼します &abbr; &hon; Pardon me for leaving (before you) 1002290 お前 ichi1 御前 news1 nf22 おまえ ichi1 news1 nf22 おまい おめえ &pn; &adj-no; &fam; you (formerly honorific, now sometimes derog. term referring to an equal or inferior) おまえ presence (of a god, nobleman, etc.) 1002300 お前さん 御前さん おまえさん &n; you my dear hey 1002310 お膳 おぜん &n; &uk; four-legged tray for festive food 1002320 お祖父さん ichi1 お爺さん 御爺さん 御祖父さん おじいさん ichi1 &n; 祖父さん &uk; usu. お祖父さん grandfather &uk; usu. お爺さん male senior-citizen 1002330 お祖母さん ichi1 お婆さん 御祖母さん 御婆さん おばあさん ichi1 &n; 祖母さん・1 &uk; usu. お祖母さん grandmother 婆さん・2 &uk; usu. 
お婆さん old woman female senior citizen 1002340 お早うございます spec1 おはようございます spec1 お早う &uk; &pol; may be used more generally at any time of day good morning 1002350 お孫さん 御孫さん おまごさん &n; &hon; grandchild 1002360 お待ち遠様 御待ち遠様 お待ち遠さま おまちどおさま おまちどうさま &ik; &exp; I'm sorry to have kept you waiting 1002370 お袋 ichi1 御袋 お母 &iK; おふくろ ichi1 オフクロ &n; &col; &uk; one's mother 1002390 お大事に spec1 おだいじに spec1 &exp; take care of yourself get well soon God bless you bless you 1002400 お宅 ichi1 御宅 おたく ichi1 &n; &hon; your house your home your family &hon; your husband &hon; your organization &pn; &adj-no; &hon; you (referring to someone of equal status with whom one is not especially close) 1002410 お誕生日おめでとうございます 御誕生日おめでとうございます おたんじょうびおめでとうございます &exp; Happy Birthday 1002420 お知らせ news2 nf29 御知らせ お報せ 御報せ おしらせ news2 nf29 &n; &vs; 知らせ・1 notice notification 1002430 お茶 ichi1 news1 nf06 御茶 おちゃ ichi1 news1 nf06 &n; 茶・1 &pol; tea (usu. green) tea break (at work) 茶の湯 tea ceremony 1002440 お猪口 御猪口 おちょこ &n; 猪口 &uk; small cup sake cup cup-shaped 1002450 お喋り ichi1 御喋り おしゃべり ichi1 &n; &vs; &uk; chattering talk idle talk chat chitchat gossip &adj-na; &n; &uk; chatty talkative chatterbox blabbermouth 1002470 お天気 御天気 おてんき &n; 天気 &pol; weather temper mood 1002480 お転婆 御転婆 おてんば &adj-na; &n; &uk; ontembaar tomboy 1002490 お屠蘇 御屠蘇 おとそ &n; New Year's sake spiced sake 1002500 お土産 ichi1 御土産 おみやげ ichi1 おみあげ &ik; &n; 土産・みやげ &pol; present souvenir 1002510 お湯 御湯 おゆ &n; &pol; hot water &pol; hot bath 1002520 お得 お徳 御得 御徳 おとく &adj-na; &adj-no; economical bargain good value good-value 1002530 お凸 ichi1 御凸 おでこ ichi1 &n; 凸・でこ・1 &uk; brow forehead &uk; prominent forehead beetle brow &uk; &col; not catching anything (in fishing) 1002550 お婆はる おばはる &v5r; &sl; to shamelessly demand one's rights 1002560 お婆ん おばん &n; おばあさん・2 old maid frump hag old woman 1002570 お疲れ様 spec1 お疲れさま spec1 御疲れ様 おつかれさま spec1 &exp; thank you many thanks much appreciated that's enough for today 1002580 お付き 御付き おつき &n; retainer attendant escort 1002590 お父さん ichi1 
news1 nf07 御父さん おとうさん ichi1 news1 nf07 おとっさん &ok; &n; 父さん &hon; father dad papa pa pop daddy dada 1002600 お風呂に入る おふろにはいる &exp; &v5r; to take a bath 1002610 お腹 ichi1 御腹 お中 御中 おなか ichi1 &n; &pol; stomach 1002630 お別れ おわかれ &n; farewell 1002640 お返し news1 nf18 御返し おかえし news1 nf18 &n; &vs; return gift return favour (favor) revenge change (in a cash transaction) 1002650 お母さん ichi1 news1 nf05 御母さん おかあさん ichi1 news1 nf05 &n; 母さん・1 &hon; mother 1002660 お母様 御母様 お母さま おかあさま ================================================ FILE: test/data/jamdict.json ================================================ { "JAMDICT_HOME": ".", "JAMDICT_DATA": "{JAMDICT_HOME}/data", "JAMDICT_DB": "{JAMDICT_HOME}/test/data/jamdict.db", "JMDICT_XML": "{JAMDICT_DATA}/JMdict_e.gz", "JMNEDICT_XML": "{JAMDICT_DATA}/JMnedict.xml.gz", "KD2_XML": "{JAMDICT_DATA}/kanjidic2.xml.gz", "KRADFILE": "{JAMDICT_DATA}/kradfile-u.gz" } ================================================ FILE: test/data/jmendict_mini.xml ================================================ ]> 1657560 国労 こくろう &organization; National Railway Workers' Union 2092920 IKEA イケア &company; IKEA 2831743 蒼国来栄吉 そうこくらいえいきち &person; Sōkokurai Eikichi (sumo wrestler from Inner Mongolia, 1984-) Engketübsin 5000000 ゝ泉 ちゅせん &given; Chusen 5000001 しめ &fem; Shime 5000002 〆ヱ しめえ &fem; Shimee (Shimewe) 5000003 〆丸 しめまる &given; Shimemaru 5000004 〆子 しめこ &fem; Shimeko 5000005 〆治 しめじ &given; Shimeji 5000006 〆代 しめよ &fem; Shimeyo 5000007 〆谷 しめたに &surname; Shimetani 5000008 〆木 しめき &surname; Shimeki 5000009 〆野 しめの &surname; Shimeno 5741686 埼銀 さいぎん &company; 埼玉銀行・さいたまぎんこう Saitama Bank 5723538 厦門 廈門 アモイ シアメン シヤメン シャーメン &place; Xiamen (China) Amoy 5741815 神龍 神竜 しんりゅう じんりゅう シェンロン &unclass; Shen Long (spiritual dragon in Chinese mythology) Shen-lung 5001644 かかずゆみ &fem; &person; Kakazu Yumi (1973.6.18-) ================================================ FILE: test/data/kanjidic2_mini.xml ================================================ ]>
4 2017-225 2017-08-13
6301 27-93 64 3 9 119 3 1903 2151 374 447 275 333 660 712 184 801 50 12019 294 451 458 231 178 310 328 94 142 2.15 889 667 1-3-6 3c6.8 5404.1 1348 chi2 ji Trì も.つ -も.ち も.てる hold have avoir tenir duración propiedad cargo sostener mantener segurar ter もち もつ 6642 27-94 72 2 10 33-82 16 4 2126 2462 924 1155 625 830 159 171 19 1086 46 13890 135 42 42 87 57 151 195 24 25 2.1 988 161 1-4-6 4c6.2 6404.1 3848 shi2 si Thì Thời とき -どき time hour temps heure tiempo hora ocasión tempo hora とぎ 6b21 28-01 76 15 3 6 222 2 638 2929 54 58 38 39 471 510 235 226 239 15992P 292 384 385 227 648 308 268 150 660 1120 482 1-2-4 2b4.1 3718.2 362 ci4 cha Thứ つ.ぐ つぎ next order sequence suivant ordre successivement siguiente seguir próximo pedido seqüência すき つぐ よし 6ecb 28-2 85 8 12 1563 1 2626 3238 602 734 437 549 1388 1489 1625 2460 1395 17919P 1327 1549 1651 1489 1559 1776 1222 1400 1-3-9 3a9.27 3813.2 371 zi1 ja nourishing more & more be luxuriant planting turbidity nourrissant de plus en plus luxuriant plantation turbidité florecer ser exuberante nutritivo nutrir mais & mais ser luxuriante planta turbidez しげ しげる 6cbb 28-3 85 4 8 109 2 2528 3100 335 393 250 297 746 807 181 825 214 17256 544 493 502 468 672 527 506 88 531 1174 754 1-3-5 3a5.28 3316.0 377 zhi4 chi Trì Trị おさ.める おさ.まる なお.る なお.す reign be at peace calm down subdue quell govt cure heal rule conserve règne être en paix se calmer maîtriser réprimer gouvernement guérir réparer préserver reinar gobernar política curar calmar curarse reino estar em paz acalmar-se subjugar domar governo cura regra conservar はり はる みち 723e 28-4 89 1 9 14 53-85 16-62 16-63 18-56 29-91 1 69 3521 3587 4407 2230 3001 2074 2867 2250 1945 19750 2154 4-14-1 0a14.3 1022.7 er3 i Nhĩ なんじ しかり その のみ おれ しか you thou second person eso cerca ちか 74bd 28-5 96 1 8 19 24-53 1 71 3683 2911 3624 1854 2527 2075 2194 2019 2860 1866 21309 1329 1887 2057 1537 1916 1947 1314 1506 2-14-5 4f14.2 1010.3 3272 xi3 sae sa Tỉ emperor's seal sceau impérial sello imperial selo imperial 
75d4 28-6 104 11 3046 3804 2575 2625 1802 22167 3-5-6 5i6.4 0014.1 548 zhi4 chi Trĩ しもがさ piles hemorrhoids 78c1 28-7 112 6 14 48-35 1686 1 3209 4029 1214 1554 821 1123 1390 1491 1965 1288 24364 881 1548 1650 1574 892 975 1777 1407 1402 1-5-9 5a9.6 1863.2 3371 ci2 ja Từ magnet porcelain aimant porcelaine imán porcelana imã porcelana 793a 28-8 113 5 5 237 2 3228 4060 1936 2435 1229 1694 1086 1167 415 148 575 24623 695 615 625 622 769 723 651 465 814 1413 1095 2-1-4 4e0.1 1090.1 3041 shi4 si gi Thị しめ.す show indicate point out express display montrer indiquer signaler exprimer exposer mostrar enseñar indicar revelar espetáculo indicar apontar expressar mostrar 800c 28-9 126 9 6 3689 4704 2027 2520 1747 3006 2243 264 28871 2-2-4 2r4.3 1022.7 3-4-2 4-6-1 er2 i Nhi しこ.うして しか.して しか.も しか.れども すなわち なんじ しかるに and yet and then but however nevertheless 8033 28-10 128 1 6 1328 4 3697 4715 3516 4329 2190 2948 818 881 1323 331 107 28999 29 56 56 26 79 32 61 169 1582 826 4-6-1 6e0.1 1040.0 1012.0 3255 er3 i Nhĩ みみ ear oreille oreja oído ouvido がみ 81ea 28-11 132 2 6 19 3 3841 4900 3525 4338 2195 2954 36 36 53 340 60 30095 134 62 62 229 242 150 126 14 63 2.19 1587 36 4-6-2 5c1.1 2600.0 879 zi4 ja Tự みずか.ら おの.ずから おの.ずと oneself soi-même uno mismo por sí mismo auto- a si próprio より 8494 28-12 140 9 13 2368 1 4018 5153 2935 1507 2042 2395 2476 1985 31546X 2224 2-3-10 3k10.7 4464.1 shi4 shi2 si Thì Thi う.える ま.く sow (seeds) sembrar esparcir (semillas) まい まか まき 8f9e 28-13 160 135 4 13 77-70 65-13 65-16 633 2 3860 6000 1364 1726 922 1245 1497 1613 868 1918 594 38638 500 688 701 626 945 528 605 931 1785 1511 1-6-7 5b8.4 2064.1 2255 ci2 sa Từ や.める いな.む resign word term expression démission mot terme expression renuncia dimisión palabra discurso retirarse abandonar pedir demissão palavra termo expressão 6c50 28-14 85 9 6 2314 1 2488 3041 223 259 164 197 110 115 250 17122 2134 111 1-3-3 3a3.9 3712.0 360 xi1 xi4 seog Tịch セキ しお うしお せい eventide tide salt water opportunity soirée marée eau salée 
opportunité ocaso marea agua salada いそ 鹿 9e7f 28-15 198 8 11 957 1 5375 7017 3126 3879 1996 2695 1999 2154 1141 1823 1999 47586 2279 2038 2018 3-3-8 3q8.5 0021.1 0021.2 556 lu4 rog Lộc ロク しか deer cerf venado ciervo しし 5f0f 28-16 56 3 6 251 2 1556 1676 3049 3785 1931 2623 353 377 185 306 349 9663 295 525 534 417 515 311 269 356 796 602 359 3-3-3 4n3.2 4310.0 1456 shi4 sig Thức シキ style ceremony rite function method system form expression type protocole cérémonie rituel fonction processus système forme ceremonia método sistema forma estilo cerimônia rito método sistema formulário のり 8b58 28-17 149 5 19 496 2 4438 5704 1639 2070 1086 1477 482 521 617 2810 486 35974 698 681 694 627 911 725 824 416 512 1709 493 1-7-12 7a12.6 0365.0 3059 shi4 zhi4 sig ji Thức Chí シキ し.る しる.す discriminating know write discernement connaissance écrire discernir distinguir símbolo discriminação saber escrever さと さとる 9d2b 28-18 102 16 3013 6948 46831 1-5-11 5f11.1 6702.7 jeon Dẽ しぎ snipe (kokuji) 7afa 28-19 118 9 8 3367 4247 3248 2280 2642 2684 729 25842 2-6-2 6f2.1 8810.1 zhu2 chug dog Trúc Đốc ジク チク トク bamboo あつし 8ef8 28-20 159 8 12 1261 1 4619 5952 1514 1908 1014 1371 1112 1193 1719 1258 38269 1330 988 1023 1798 1591 1589 1776 1121 1-7-5 7c5.1 5506.0 1576 zhou2 zhou4 zhu2 chug Trục ジク axis pivot stem stalk counter for book scrolls axe pivot tige compteur de livres en rouleaux eje pivote péndulo mango eixo pivô talo caule sufixo para contagem de pergaminhos 5b8d 28-21 40 7 3724 2269 1284 1312 2216 2327 470 7084 2-3-4 3m4.4 3080.0 rou4 yug Nhục ニク ジク しし muscles meat músculos carne しん 96eb 28-22 173 9 11 2219 5043 6519 2760 3430 2405 2778 2799 1494 42219 2-8-3 8d3.1 1023.1 na3 Đà しずく drop trickle dripping (kokuji) gota gotear chorrear calar 4e03 28-23 1 5 1 2 1474 115 4 261 3 3362 4169 2109 2854 7 7 44 17 9 6 30 9 9 7 2 7 3 147 7 1.A 2 7 4-2-2 0a2.13 4071.0 1456 4-2-4 qi1 chil Thất シチ なな なな.つ なの seven sept siete 7 Sete しっ ひち 53f1 28-24 30 8 5 2255 873 691 182 213 2179 477 1886 3248 1323 
1-3-2 3d2.2 6401.0 6201.0 chi4 jil Sất シツ シチ しか.る scold reprove regañina reproche regañar reprochar 57f7 28-25 32 8 11 3699 800 1 1097 1039 1680 2114 1102 1501 1506 1623 965 1420 1231 5193 1332 686 699 1152 1418 1823 351 1520 1-8-3 3b8.15 4541.7 4441.7 1457 zhi2 jib Chấp シツ シュウ と.る tenacious take hold grasp take to heart prendre en charge saisir ténacité prendre à coeur dirigir hacer negocios tomar el control tenaz segurar agarrar simpatizar 5931 28-26 37 4 4 5 447 2 178 1141 3511 4325 2189 2947 845 908 523 238 5844 501 311 311 418 447 529 452 346 594 3.13 413 853 4-5-4 0a5.28 2503.0 2463 shi1 sil Thất シツ うしな.う う.せる lose error fault disadvantage loss perdre erreur faute désavantage pérdida error defecto fallo perder malograr errar erro falta desvantagem perda 5ac9 28-27 38 8 13 1205 2317 1248 1255 783 584 2202 1820 6611 1955 1-3-10 3e10.8 4043.4 ji2 jil Tật シツ そね.む ねた.む にく.む jealous envy 816b 28-80 130 8 13 2145 3797 4848 1310 951 2467 1807 29697 2106 1-4-9 4b9.1 7221.4 zhong3 jong Thũng Trũng シュ ショウ は.れる は.れ は.らす く.む はれもの tumor swelling 8da3 28-81 156 8 15 1153 1 4544 5854 3317 4111 2087 2827 820 883 1193 2422 1047 37207 1350 1002 1039 1790 1805 310 1758 828 3-7-8 6e9.1 4180.4 4780.4 1470 qu4 cu4 chwi chug Thú Xúc シュ おもむき おもむ.く purport gist elegance interest proceed to tend become teneur fond aspect apparence sens tendance devenir significado sentido apariencia proceder essência continuar para tender tornar-se 9152 28-82 85 3 10 1006 2 2573 6160 444 532 328 403 1428 1535 781 1066 347 39776 302 517 526 422 972 318 356 144 1789 1440 1-3-7 3a7.1 3116.0 375 jiu3 ju Tửu シュ さけ さか- sake alcohol saké alcool sake alcohol vino de arroz sake álcool 9996 28-83 185 2 9 98 3 5186 6719 2265 2818 1452 1956 70 74 208 920 138 44489 139 148 148 239 281 155 177 393 1910 70 2-3-6 2o7.2 8060.1 979 shou3 su Thủ Thú シュ くび neck counter for songs and poems cou compteur de chansons ou poèmes cuello principio pescoço おびと こべ 5112 28-84 9 8 16 2162 1 561 330 174 197 127 153 1166 1250 1968 2432 
1579 1220 1352 1417 1506 1072 1820 1891 138 1174 1-2-14 2a14.1 2122.7 2153 ru2 yu Nho ジュ Confucian confucéen confucionismo Confucianista 53d7 28-85 29 87 3 8 136 2 2826 678 2421 3076 1569 2146 735 794 223 730 272 3159 303 260 260 240 394 319 304 298 269 2.9 270 743 2-4-4 2h6.2 2040.7 2067 2-1-7 2-6-2 shou4 su Thụ ジュ う.ける -う.け う.かる accept undergo answer (phone) take get catch receive subir recevoir accepter répondre (téléphone) prendre obtenir recipiente acepción aceptar recibir tomar aprobar examen aceitar sofrer atender (telefone) tomar obter pegar receber じょ 546a 28-86 30 8 8 50-80 2131 912 764 323 245 2169 108 3443 1324 1-3-5 3d5.11 6601.0 zhou4 ju Chú ジュ シュ シュウ まじな.う のろ.い まじな.い のろ.う spell curse charm malediction maldición hechizo maldecir hechizar encantar 寿 5bff 28-87 41 4 8 7 52-72 1245 1 194 1374 3557 4372 2215 2979 1565 1687 1132 539 1786 7419P 1351 1550 1652 1221 1115 1327 501 1579 4-7-4 0a7.15 5034.0 5004.0 1448 shou4 su Thọ ジュ シュウ ことぶき ことぶ.く ことほ.ぐ longevity congratulations one's natural life longévité félicitations vie longevidad felicitaciones longevidade congratulações vida natural かず じゅう すっ すみ とし としかつ なが のぶ のり ひさ ひさし やす 7ae0 30-47 117 180 3 11 990 2 5112 4235 2117 2624 1342 1819 433 464 967 1461 505 25761 318 857 876 432 787 334 376 1027 1460 440 2-2-9 5b6.3 0040.6 455 2-5-6 zhang1 jang Chương ショウ badge chapter composition poem design badge clairement visible composition poème chapitre dessin distintivo banda capítulo frase insígnia capítulo composição poema projeto あき あきら あや ふみ 7b11 30-48 118 4 10 913 2 3374 4256 2646 3275 1692 2300 938 1008 889 1209 1147 25885 900 1235 1299 791 537 548 482 251 1472 947 2-6-4 6f4.1 8843.2 2363 xiao4 so Tiếu ショウ わら.う え.む laugh rire risa sonrisa reír sonreír risada えみ 7ca7 30-49 119 8 12 1545 1 3475 4400 1345 1703 909 1232 923 991 1218 1595 26945 1406 1699 1815 1614 1575 1284 1499 932 1-6-6 6b6.1 9091.4 1072 zhuang1 jang Trang ショウ cosmetics adorn (one's person) cosmétique se maquiller maquillarse arreglarse 
embellecerse cosméticos enfeitar-se 7d39 30-50 120 8 11 963 2 3516 4465 1335 1691 902 1222 1359 1459 938 1955 1130 27361 1400 456 463 813 1473 694 1527 1371 1-6-5 6a5.10 2796.2 2777 shao4 so Thiệu ショウ introduce inherit help présentations introduction hériter aider presentar heredar suceder a Introduzir herdar ajudar つぐ 8096 30-51 130 42 8 7 1889 1 1360 4760 2205 2733 1399 1887 114 119 1712 451 1672 29263P 1391 844 863 1665 1119 1811 1012 115 2-3-4 3n4.1 9022.7 1046 3-5-2 xiao4 xiao1 cho so Tiếu Tiêu ショウ あやか.る resemblance ressemblance portrait imitar copiar parecido semelhança 83d6 30-52 140 9 11 1 3969 5085 2881 1487 2005 2366 2452 1444 31174X 2220 2-3-8 3k8.22 4460.6 chang1 chang Xương ショウ iris iris 848b 30-53 140 13 57-22 2251 4036 5132 2947 31820X 2-3-10 3k11.5 4424.2 2-4-10 jiang3 jang Tương Tưởng ショウ ソウ まこも はげ.ます reed 8549 30-54 140 9 15 2033 1 4048 5191 2994 1529 2087 2367 2453 2372 31937X 2229 2-3-12 3k12.6 4433.1 jiao1 qiao2 cho Tiêu ショウ banana plantain plátano banana 885d 30-55 144 60 8 15 3316 972 1 1638 1772 725 893 509 658 1680 1811 918 2264 1117 34069 1411 1772 1904 1723 1759 1388 1616 1696 1-3-12 3i12.1 2110.4 2122.1 2053 chong1 chong4 chung Xung ショウ つ.く collide brunt highway opposition (astronomy) thrust pierce stab prick collision choc route principale opposition (astronomie) point clé percer poignarder piquer avenida lugar importante punto clave chocar colidir o mais forte rodovia oposição (astronomia) atravessar furar apunhalar picar 88f3 30-56 145 42 9 14 1370 5472 2615 3231 2269 800 863 1803 2376 34357 808 2-5-9 3n11.2 9073.2 2-3-11 chang2 shang5 sang Thường ショウ もすそ skirt robe falda ropa 8a1f 30-57 149 8 11 1061 1 4320 5565 1472 1864 988 1339 787 850 1675 1483 35266 1401 1403 1492 1742 1485 1208 1665 795 1-7-4 7a4.6 0863.2 3071 song4 song Tụng ショウ sue accuse accusation poursuites declarar pleitear demandar acusar processar acusar 8a3c 30-58 149 5 12 75-90 306 1 4341 5589 1506 1899 1010 1365 380 406 430 1660 697 35341 715 484 493 816 1746 737 754 
448 666 1672 386 1-7-5 7a5.5 0161.1 3073 zheng4 jeung Chứng ショウ あかし evidence proof certificate preuve évidence certificat evidencia prueba testimonio certificado evidência prova certificado 8a54 30-59 149 8 12 2239 1 4333 5581 1505 1900 1009 1366 342 366 2023 1658 1213 35379 1407 1885 2055 1748 1585 1935 1673 348 1-7-5 7a5.10 0766.2 3077 zhao4 jo Chiếu ショウ みことのり imperial edict décret impérial edicto imperial decreto imperial さとし のり 8a73 30-60 149 8 13 1178 1 4357 5607 1526 1926 1026 1386 550 589 1172 1927 896 35446 1409 1577 1683 1752 1667 944 1682 556 1-7-6 7a6.12 0865.1 3055 xiang2 sang yang Tường ショウ くわ.しい つまび.らか detailed full minute accurate well-informed détaillé complet précis bien informé detallado minucioso detalhado cheio bem-informado preciso よし 571f 37-58 32 1 3 307 4 1050 966 3403 4211 2127 2875 150 161 316 42 19 4867 60 24 24 17 37 19 22 360 22 1.A 347 152 4-3-2 3b0.1 4010.0 1472 tu3 to du Thổ Độ Đỗ つち soil earth ground Turkey sol terre terrain Turquie suelo tierra Turquía solo terra chão Turquia つく ひじ 7523 27-26 100 117 4 11 161 3 3354 3716 3298 4089 2075 2812 1560 1681 142 1520 317 21684P 491 278 278 408 218 518 567 92 574 1328 1574 3-6-5 5b6.4 0021.4 472 2-2-9 2-4-7 chan3 san Sản Sẵn Sởn サン う.む う.まれる うぶ- む.す products bear give birth yield childbirth native property production porter (fruits) donner naissance produire natal propriété producto producir propiedad dar a luz nacer produtos dar a luz nascer dar crias nascimento nativo bens もと
================================================ FILE: test/logging.json ================================================ { "version": 1, "disable_existing_loggers": false, "formatters": { "simple": { "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" } }, "handlers": { "console": { "class": "logging.StreamHandler", "level": "DEBUG", "formatter": "simple", "stream": "ext://sys.stdout" }, "file_handler_important": { "class": "logging.handlers.RotatingFileHandler", "level": "WARNING", "formatter": "simple", "filename": "test/logs/logging_important.log", "maxBytes": 1000000, "backupCount": 20, "encoding": "utf8" }, "file_handler_verbose": { "class": "logging.handlers.RotatingFileHandler", "level": "DEBUG", "formatter": "simple", "filename": "test/logs/logging_details.log", "maxBytes": 1000000, "backupCount": 20, "encoding": "utf8" } }, "loggers": { "__main__": { "level": "INFO", "handlers": ["file_handler_verbose"], "propagate": "no" } ,"test": { "level": "INFO" } ,"jamdict.jmdict_sqlite": { "level": "INFO" } ,"puchikarui": { "level": "ERROR" } }, "root": { "level": "WARNING", "handlers": ["console", "file_handler_important"], "propagate": "no" } } ================================================ FILE: test/test_jamdict.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Script for testing jamdict library """ # This code is a part of jamdict library: https://github.com/neocl/jamdict # :copyright: (c) 2016 Le Tuan Anh # :license: MIT, see LICENSE for more details. 
import logging import os import unittest from pathlib import Path from jamdict import Jamdict, JMDictXML from jamdict import config from jamdict.jmdict import JMDictXMLParser, JMDEntry from jamdict.kanjidic2 import Kanjidic2XMLParser from jamdict.util import _JAMDICT_DATA_AVAILABLE MY_DIR = Path(os.path.abspath(os.path.dirname(__file__))) TEST_DATA = MY_DIR / 'data' MINI_JMD = TEST_DATA / 'JMdict_mini.xml' MINI_KD2 = TEST_DATA / 'kanjidic2_mini.xml' MINI_JMNE = TEST_DATA / 'jmendict_mini.xml' TEST_DB = TEST_DATA / 'jamdict_test.db' def getLogger(): return logging.getLogger(__name__) def all_kana(result, forms=None): if forms is None: forms = set() for e in result.entries: forms.update(f.text for f in e.kana_forms) return forms def all_kanji(result, forms=None): if forms is None: forms = set() for e in result.entries: forms.update(f.text for f in e.kanji_forms) return forms class TestConfig(unittest.TestCase): def test_config(self): cfg = config.read_config() self.assertIn('KD2_XML', cfg) self.assertTrue(config.get_file('KD2_XML')) getLogger().info("jamdict log file location: {}".format(config._get_config_manager().locate_config())) class TestModels(unittest.TestCase): def test_basic_models(self): parser = JMDictXMLParser() entries = parser.parse_file(MINI_JMD) self.assertEqual(len(entries), 230) # there are 230 test entries e = entries[0] self.assertEqual(len(e), 1) # there is only 1 sense self.assertEqual(len(e[0].gloss), 1) # there is only 1 sense # first sense in entry e to string -> with POS self.assertEqual(str(e[0]), 'repetition mark in katakana ((noun (common) (futsuumeishi)))') self.assertEqual(str(e[0].text()), 'repetition mark in katakana') # compact is enabled by default self.assertEqual(str(e[0].gloss[0]), 'repetition mark in katakana') def test_lookup_result(self): jam = Jamdict(jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, auto_config=False, auto_expand=False) result = jam.lookup('おみやげ') self.assertTrue(result.entries) 
self.assertEqual(result.entries[0].kana_forms[0].text, 'おみやげ') # test lookup by ID res = jam.lookup('id#{}'.format(1002490)) self.assertTrue(res.entries) self.assertEqual(res.entries[0].kana_forms[0].text, 'おとそ') class TestJamdictXML(unittest.TestCase): @classmethod def setUpClass(cls): if os.path.isfile(TEST_DB): os.unlink(TEST_DB) def test_jmdict_xml(self): print("Test JMDict - lookup from XML") parser = JMDictXMLParser() entries = parser.parse_file(MINI_JMD) jmd = JMDictXML(entries) self.assertTrue(jmd.lookup(u'おてんき')) def test_jmdict_fields(self): parser = JMDictXMLParser() entries = parser.parse_file(MINI_JMD) jmd = JMDictXML(entries) results = jmd.lookup(u'おてんき') self.assertTrue(results) self.assertIsInstance(results[0], JMDEntry) def test_jmdict_json(self): print("Test JMDict - XML to JSON") # Load mini dict data jmd = JMDictXML.from_file(MINI_JMD) e = jmd[10] self.assertIsNotNone(e) self.assertTrue(e.to_dict()) self.assertTrue(jmd[-1].to_dict()) def test_kanjidic2_xml(self): print("Test KanjiDic2 XML") # test module read kanjidic XML parser = Kanjidic2XMLParser() kd2 = parser.parse_file(MINI_KD2) for c in kd2: self.assertIsNotNone(c) for g in c.rm_groups: self.assertIsNotNone(g) self.assertTrue(g.readings) self.assertTrue(g.meanings) def test_kanjidic2_json(self): print("Test KanjiDic2 XML to JSON") parser = Kanjidic2XMLParser() kd2 = parser.parse_file(MINI_KD2) for c in kd2: self.assertIsNotNone(c.to_dict()) def test_jamdict_xml(self): print("Test Jamdict search in XML files") jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE, auto_config=True) jam.import_data() result = jam.lookup('おみやげ') self.assertEqual(1, len(result.entries)) self.assertEqual(2, len(result.chars)) self.assertEqual({c.literal for c in result.chars}, {'土', '産'}) class TestConfig(unittest.TestCase): _cfg_dir = TEST_DATA / '.jamdict' _cfg_file = _cfg_dir / 'config.json' @classmethod def setUpClass(cls): cls.clean_config_file() @classmethod 
def tearDownClass(cls): cls.clean_config_file() @classmethod def clean_config_file(cls): if cls._cfg_file.exists(): cls._cfg_file.unlink() if cls._cfg_dir.exists(): cls._cfg_dir.rmdir() def test_config_file(self): # if configuration file doesn't exist conf = config.read_config(ensure_config=False, force_refresh=True) self.assertIsNotNone(conf) # and force creating config conf = config.read_config(ensure_config=True, force_refresh=True) self.assertIsNotNone(conf) def test_ensure_config(self): self.clean_config_file() self.assertFalse(self._cfg_file.is_file()) conf = config._ensure_config(self._cfg_file) self.assertTrue(self._cfg_file.is_file()) def test_home_dir(self): _orig_home = '' if 'JAMDICT_HOME' in os.environ: _orig_home = os.environ['JAMDICT_HOME'] # set a new home os.environ['JAMDICT_HOME'] = str(self._cfg_dir) # home_dir exist ... if not self._cfg_dir.is_dir(): self._cfg_dir.mkdir(parents=True) self.assertEqual(config.home_dir(), str(self._cfg_dir)) # home_dir does not exist ... 
if self._cfg_dir.is_dir(): self.clean_config_file() self.assertEqual(config.home_dir(), "~/.jamdict") # no environ del os.environ['JAMDICT_HOME'] self.assertEqual(config.home_dir(), "~/.jamdict") os.environ['JAMDICT_HOME'] = _orig_home class TestAPIWarning(unittest.TestCase): def test_warn_to_json_deprecated(self): print("Test Jamdict search in XML files") jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE) jam.import_data() with self.assertWarns(DeprecationWarning): r = jam.lookup("おみやげ") self.assertTrue(r.to_json()) with self.assertWarns(DeprecationWarning): r2 = jam.lookup("シェンロン") self.assertTrue(r2.to_json()) class TestJamdictSQLite(unittest.TestCase): @classmethod def tearDownClass(cls): if os.path.isfile(TEST_DB): os.unlink(TEST_DB) def test_search_by_pos(self): print("Test Jamdict search in XML files") jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE, auto_config=True) jam.import_data() # test get all pos poses = jam.all_pos() expected = {'Godan verb - -aru special class', "Godan verb with `ku' ending", "Godan verb with `ru' ending", "Godan verb with `su' ending", "Godan verb with `u' ending", 'Ichidan verb', 'adjectival nouns or quasi-adjectives (keiyodoshi)', 'adjective (keiyoushi)', 'adverb (fukushi)', "adverb taking the `to' particle", 'auxiliary verb', 'conjunction', 'expressions (phrases, clauses, etc.)', 'interjection (kandoushi)', 'intransitive verb', 'noun (common) (futsuumeishi)', 'noun or participle which takes the aux. 
verb suru', 'noun or verb acting prenominally', "nouns which may take the genitive case particle `no'", 'pre-noun adjectival (rentaishi)', 'pronoun', 'transitive verb'} self.assertEqual(expected, set(poses)) result = jam.lookup('おみやげ', pos=['noun (common) (futsuumeishi)']) self.assertEqual(1, len(result.entries)) with self.assertLogs('jamdict.jmdict_sqlite', level="WARNING") as cm: result = jam.lookup('おみやげ', pos='noun (common) (futsuumeishi)') self.assertEqual(1, len(result.entries)) warned_pos_as_str = False for line in cm.output: if "POS filter should be a collection, not a string" in line: warned_pos_as_str = True break self.assertTrue(warned_pos_as_str) result = jam.lookup('おみやげ', pos=['intransitive verb']) self.assertFalse(result.entries) result = jam.lookup('おみやげ', pos=['intransitive verb', 'noun (common) (futsuumeishi)']) self.assertTrue(result.entries) def test_search_by_ne_type(self): print("Test Jamdict search in XML files") jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE, auto_config=True) jam.import_data() netypes = jam.all_ne_type() expected = ['company', 'fem', 'given', 'organization', 'person', 'place', 'surname', 'unclass'] self.assertEqual(expected, netypes) res = jam.lookup("place") actual = set() for n in res.names: actual.update(k.text for k in n.kanji_forms) self.assertIn("厦門", actual) res = jam.lookup("company") actual = set() for n in res.names: actual.update(k.text for k in n.kanji_forms) expected = {'埼銀', 'IKEA'} self.assertTrue(expected.issubset(actual)) def test_find_all_verbs(self): jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE, auto_config=True) jam.import_data() # cannot search for everything self.assertRaises(ValueError, lambda: jam.lookup("")) self.assertRaises(ValueError, lambda: jam.lookup("%")) self.assertRaises(ValueError, lambda: jam.lookup("%", pos="")) res = jam.lookup("", pos="pronoun") actual = all_kana(res) pronouns = 
{'おい', 'おまい', 'おたく', 'あのひと', 'かしこ', 'あのかた', 'おめえ', 'おまえ', 'おおい', 'おーい', 'あそこ', 'あこ', 'あしこ', 'あすこ'} self.assertTrue(pronouns.issubset(actual)) result = jam.lookup("%", pos=["intransitive verb", 'pronoun']) forms = all_kana(result) iverbs = {'いじける', 'イカす', 'うろたえる', 'いかす', 'おっこちる', 'いらっしゃる', 'あぶれる', 'いななく', 'いちゃつく'} self.assertTrue(iverbs.issubset(forms)) self.assertTrue(pronouns.issubset(forms)) @unittest.skipIf(not _JAMDICT_DATA_AVAILABLE, "Jamdict data is not available. Data test is skipped") def test_jamdict_data(self): jam = Jamdict() # search verb kaeru res = jam.lookup("かえる", pos="transitive verb") actual = [e.idseq for e in res.entries] self.assertIn(1510650, actual) self.assertIn(1589780, actual) forms = all_kanji(res) expected = {'変える', '代える', '換える', '替える'} self.assertTrue(expected.issubset(forms)) # search by noun kaeru res2 = jam.lookup("かえる", pos='noun (common) (futsuumeishi)') actual2 = [e.idseq for e in res2.entries] forms2 = all_kanji(res2) self.assertIn(1577460, actual2) expected2 = {'蛙', '蛤', '蝦'} self.assertTrue(expected2.issubset(forms2)) # search both noun and verb res3 = jam.lookup("かえる", pos=['noun (common) (futsuumeishi)', "transitive verb"]) forms3 = all_kanji(res3) self.assertTrue(expected.issubset(forms3)) self.assertTrue(expected2.issubset(forms3)) def test_jamdict_sqlite_all(self): if os.path.isfile(TEST_DB): os.unlink(TEST_DB) TEST_DB.touch() jam = Jamdict(db_file=TEST_DB, jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE) # Lookup using XML result = jam.jmdict_xml.lookup('おみやげ') getLogger().debug("Results: {}".format(result)) # Lookup using SQLite jam.import_data() # test lookup result = jam.lookup('おみやげ') self.assertIsNotNone(result.entries) self.assertEqual(len(result.entries), 1) self.assertEqual(len(result.chars), 2) self.assertEqual({c.literal for c in result.chars}, {'土', '産'}) print("Test reading DB into RAM") ram_jam = Jamdict(TEST_DB, memory_mode=True) print("1st lookup") result = ram_jam.lookup('おみやげ') 
self.assertIsNotNone(result.entries) self.assertEqual(len(result.entries), 1) self.assertEqual(len(result.chars), 2) self.assertEqual({c.literal for c in result.chars}, {'土', '産'}) print("2nd lookup") result = ram_jam.lookup('おみやげ') self.assertIsNotNone(result.entries) self.assertEqual(len(result.entries), 1) self.assertEqual(len(result.chars), 2) self.assertEqual({c.literal for c in result.chars}, {'土', '産'}) print("3rd lookup") result = ram_jam.lookup('おみやげ') self.assertIsNotNone(result.entries) self.assertEqual(len(result.entries), 1) self.assertEqual(2, len(result.chars)) self.assertEqual({c.literal for c in result.chars}, {'土', '産'}) def test_lookup_iter(self): jam = Jamdict(":memory:", jmd_xml_file=MINI_JMD, kd2_xml_file=MINI_KD2, jmnedict_xml_file=MINI_JMNE, auto_config=True) jam.import_data() # verify entries res = jam.lookup_iter("おこ%", pos="noun (common) (futsuumeishi)") entries = [e.text() for e in res.entries] expected = ['おこのみやき (お好み焼き) : okonomiyaki/savoury pancake containing meat or seafood and ' 'vegetables', 'おこさん (お子さん) : child', "おこさま (お子様) : child (someone else's)"] self.assertEqual(expected, entries) # verify characters res = jam.lookup_iter("お土産") self.assertIsNotNone(res.entries) self.assertIsNotNone(res.chars) self.assertIsNotNone(res.names) # verify characters chars = [repr(c) for c in res.chars] expected = ['土:3:soil,earth,ground,Turkey', '産:11:products,bear,give birth,yield,childbirth,native,property'] self.assertEqual(expected, chars) # verify names res = jam.lookup_iter("surname") names = [n.text() for n in res.names] expected = ['しめたに (〆谷) : Shimetani (surname)', 'しめき (〆木) : Shimeki (surname)', 'しめの (〆野) : Shimeno (surname)'] self.assertEqual(expected, names) ######################################################################## if __name__ == "__main__": unittest.main() ================================================ FILE: test/test_jmdict_sqlite.py ================================================ #!/usr/bin/env python3 # -*- 
coding: utf-8 -*- """ Test script for JMDict SQLite """ # This code is a part of jamdict library: https://github.com/neocl/jamdict # :copyright: (c) 2016 Le Tuan Anh # :license: MIT, see LICENSE for more details. import os import unittest import logging from pathlib import Path from chirptext.cli import setup_logging from jamdict import Jamdict from jamdict import JMDictXML from jamdict import JMDictSQLite # ------------------------------------------------------------------------------- # Configuration # ------------------------------------------------------------------------------- TEST_DIR = Path(os.path.realpath(__file__)).parent TEST_DATA = TEST_DIR / 'data' if not TEST_DATA.exists(): TEST_DATA.mkdir() TEST_DB = TEST_DATA / 'test.db' MINI_JMD = TEST_DATA / 'JMdict_mini.xml' okashi = 'お菓子' setup_logging(TEST_DIR / 'logging.json', 'logs') def getLogger(): return logging.getLogger(__name__) # ------------------------------------------------------------------------------- # Test cases # ------------------------------------------------------------------------------- class TestJamdictSQLite(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.db = JMDictSQLite(str(TEST_DB)) self.xdb = JMDictXML.from_file(str(MINI_JMD)) self.ramdb = JMDictSQLite(":memory:") @classmethod def setUpClass(cls): if os.path.isfile(TEST_DB): getLogger().info("Removing previous database file at {}".format(TEST_DB)) os.unlink(TEST_DB) def test_xml2sqlite(self): print("Test JMDict: XML to SQLite") try: self.db.insert_entries(self.xdb) except Exception: getLogger().exception("Error happened while inserting entries") raise entries = self.db.Entry.select() self.assertEqual(len(entries), len(self.xdb)) # test select entry by id e = self.db.get_entry(1001710) ejson = e.to_dict() self.assertEqual(ejson['kanji'][0]['text'], 'お菓子') getLogger().debug(e.to_dict()) def test_import_to_ram(self): print("Testing XML to RAM") noe = len(self.xdb) with self.ramdb.ctx() 
as ctx: self.ramdb.insert_entries(self.xdb, ctx=ctx) self.assertEqual(len(self.ramdb.Entry.select(ctx=ctx)), noe) def test_import_function(self): print("Testing JMDict import function") jd = Jamdict(db_file=":memory:", jmd_xml_file=MINI_JMD, auto_config=False, auto_expand=False) jd.import_data() def test_search(self): print("Test searching JMDict SQLite") with self.ramdb.ds.open() as ctx: self.ramdb.insert_entries(self.xdb, ctx=ctx) entries = ctx.Entry.select() # Search by kana es = self.ramdb.search('あの', ctx) self.assertEqual(len(es), 2) getLogger().info('あの: {}'.format('|'.join([str(x) for x in es]))) # Search by kanji es = self.db.search('%子%', ctx, exact_match=False) self.assertEqual(len(es), 4) getLogger().info('%子%: {}'.format('|'.join([str(x) for x in es]))) # search by meaning es = self.db.search('%confections%', ctx, exact_match=False) self.assertTrue(es) getLogger().info('%confections%: {}'.format('|'.join([str(x) for x in es]))) def test_iter_search(self): with self.ramdb.open() as ctx: self.ramdb.insert_entries(self.xdb, ctx=ctx) forms = set() for e in self.ramdb.search_iter("%あの%", iter_mode=True, ctx=ctx): forms.update(f.text for f in e.kana_forms) expected = {'あのー', 'あのう', 'あの', 'かの', 'あのかた', 'あのひと'} self.assertTrue(expected.issubset(forms)) # ------------------------------------------------------------------------------- # Main # ------------------------------------------------------------------------------- if __name__ == "__main__": unittest.main() ================================================ FILE: test/test_jmnedict.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Test script for JMendict support """ # This code is a part of jamdict library: https://github.com/neocl/jamdict # :copyright: (c) 2020 Le Tuan Anh # :license: MIT, see LICENSE for more details. 
import logging import os import unittest from jamdict.jmdict import JMENDICT_TYPE_MAP from jamdict.jmnedict_sqlite import JMNEDictSQLite from jamdict.util import JMNEDictXML, JamdictSQLite # ------------------------------------------------------------------------------- # Configuration # ------------------------------------------------------------------------------- TEST_DIR = os.path.dirname(os.path.realpath(__file__)) TEST_DATA = os.path.join(TEST_DIR, 'data') if not os.path.isdir(TEST_DATA): os.makedirs(TEST_DATA) TEST_DB = os.path.join(TEST_DATA, 'jamcha.db') MINI_JMNE = os.path.join(TEST_DATA, 'jmendict_mini.xml') def getLogger(): return logging.getLogger(__name__) # ------------------------------------------------------------------------------- # Test cases # ------------------------------------------------------------------------------- class TestJMendictModels(unittest.TestCase): xdb = JMNEDictXML.from_file(MINI_JMNE) ramdb = JMNEDictSQLite(":memory:", auto_expand_path=False) ramdb = JamdictSQLite(":memory:", auto_expand_path=False) def extract_fields(self): ''' Test JMnedict XML parser ''' entries = self.xdb.entries expected_idseqs = ['1657560', '2092920', '2831743', '5000000', '5000001', '5000002', '5000003', '5000004', '5000005', '5000006', '5000007', '5000008', '5000009', '5741686', '5723538', '5741815', '5001644'] idseqs = [e.idseq for e in entries] self.assertEqual(expected_idseqs, idseqs) def test_ne_type_map(self): ''' Test all name_type* ''' expected = {'person', 'organization', 'surname', 'company', 'place', 'fem', 'unclass', 'masc', 'given', 'work', 'product', 'ok', 'station'} actual = set(JMENDICT_TYPE_MAP.keys()) self.assertEqual(expected, actual) def test_jmne_support(self): ''' Test metadata ''' ramdb = JMNEDictSQLite(":memory:", auto_expand_path=False) with ramdb.ctx() as ctx: self.ramdb.insert_name_entities(self.xdb, ctx=ctx) m = ctx.meta.select_single('key=?', ('jmnedict.version',)) self.assertEqual(m.key, 'jmnedict.version') 
self.assertEqual(m.value, '1.08') def test_xml2ramdb(self): print("Testing XML to RAM") ramdb = JMNEDictSQLite(":memory:", auto_expand_path=False) with ramdb.ctx() as ctx: ramdb.insert_name_entities(self.xdb, ctx=ctx) # all entries were inserted expected_idseqs = {int(e.idseq) for e in self.xdb} inserted_idseqs = {e.idseq for e in ramdb.NEEntry.select(ctx=ctx)} getLogger().info("Inserted entries: {}".format(len(inserted_idseqs))) self.assertEqual(expected_idseqs, inserted_idseqs) # make sure that the kanjis are inserted expected_kanjis = set() for e in self.xdb.entries: expected_kanjis.update(k.text for k in e.kanji_forms) kanjis = {k.text for k in ctx.NEKanji.select()} self.assertEqual(expected_kanjis, kanjis) # make sure that the kanas were inserted expected_readings = set() for e in self.xdb.entries: expected_readings.update(k.text for k in e.kana_forms) readings = {k.text for k in ctx.NEKana.select()} self.assertEqual(expected_readings, readings) # make sure that the definitions were inserted expected_glosses = set() for e in self.xdb.entries: for s in e.senses: expected_glosses.update(g.text for g in s.gloss) glosses = {k.text for k in self.ramdb.NETransGloss.select(ctx=ctx)} self.assertEqual(expected_glosses, glosses) # make sure that the XML entries and the SQLite entries are the same for idseq in inserted_idseqs: ne_xml = self.xdb.lookup("id#{}".format(idseq))[0] ne_xml.idseq = int(ne_xml.idseq) getLogger().debug(ne_xml.to_dict()) ne = ramdb.get_ne(idseq, ctx=ctx) getLogger().debug(ne.to_dict()) self.assertEqual(ne_xml.to_dict(), ne.to_dict()) # test search by idseq shenron = ramdb.search_ne('id#5741815', ctx=ctx) self.assertEqual(len(shenron), 1) self.assertEqual(shenron[0].idseq, 5741815) # test exact search shenron2 = ramdb.search_ne('神龍', ctx=ctx) self.assertEqual(len(shenron2), 1) self.assertEqual(shenron2[0].idseq, 5741815) # test search by kana shenron3 = ramdb.search_ne('シェンロン', ctx=ctx) self.assertEqual(len(shenron3), 1) 
self.assertEqual(shenron3[0].idseq, 5741815) # test search by definition shenron4 = ramdb.search_ne('%spiritual%', ctx=ctx) self.assertEqual(len(shenron4), 1) self.assertEqual(shenron4[0].idseq, 5741815) # test search by wild card all_shime_names = ramdb.search_ne('しめ%', ctx=ctx) expected_idseqs = [5000001, 5000002, 5000003, 5000004, 5000005, 5000006, 5000007, 5000008, 5000009] actual = [x.idseq for x in all_shime_names] self.assertEqual(expected_idseqs, actual) # test search by name_type all_fems = ramdb.search_ne('person', ctx=ctx) expected_idseqs = [2831743, 5001644] actual = [x.idseq for x in all_fems] self.assertEqual(expected_idseqs, actual) def test_query_netype(self): ramdb = JMNEDictSQLite(":memory:", auto_expand_path=False) ctx = ramdb.ctx() ramdb.insert_name_entities(self.xdb, ctx=ctx) shenron = ctx.search_ne('id#5741815', ctx=ctx)[0] self.assertTrue(shenron) # ------------------------------------------------------------------------------- # Main # ------------------------------------------------------------------------------- if __name__ == "__main__": unittest.main() ================================================ FILE: test/test_kanjidic2_sqlite.py ================================================ # -*- coding: utf-8 -*- """ Test script for Jamcha SQLite """ # This code is a part of jamdict library: https://github.com/neocl/jamdict # :copyright: (c) 2016 Le Tuan Anh # :license: MIT, see LICENSE for more details. 
import os import unittest import logging from jamdict import KanjiDic2SQLite from jamdict import KanjiDic2XML # ------------------------------------------------------------------------------- # Configuration # ------------------------------------------------------------------------------- TEST_DIR = os.path.dirname(os.path.realpath(__file__)) TEST_DATA = os.path.join(TEST_DIR, 'data') if not os.path.isdir(TEST_DATA): os.makedirs(TEST_DATA) TEST_DB = os.path.join(TEST_DATA, 'jamcha.db') MINI_KD2 = os.path.join(TEST_DATA, 'kanjidic2_mini.xml') def getLogger(): return logging.getLogger(__name__) # ------------------------------------------------------------------------------- # Test cases # ------------------------------------------------------------------------------- class TestJamdictSQLite(unittest.TestCase): db = KanjiDic2SQLite(TEST_DB) ramdb = KanjiDic2SQLite(":memory:", auto_expand_path=False) xdb = KanjiDic2XML.from_file(MINI_KD2) @classmethod def setUpClass(cls): if os.path.isfile(TEST_DB): getLogger().info("Removing previous database file at {}".format(TEST_DB)) os.unlink(TEST_DB) def test_xml2sqlite(self): print("Test KanjiDic2 - XML to SQLite DB in RAM") getLogger().info("Testing using {} test characters".format(len(self.xdb))) db = self.ramdb with db.ctx() as ctx: fv = self.xdb.kd2.file_version dv = self.xdb.kd2.database_version doc = self.xdb.kd2.date_of_creation db.update_kd2_meta(fv, dv, doc, ctx) metas = ctx.meta.select() getLogger().debug("KanjiDic2 meta: {}".format(metas)) for c in self.xdb: db.insert_char(c, ctx) c2 = db.char_by_id(c.ID, ctx) getLogger().debug("c-xml", c.to_dict()) getLogger().debug("c-sqlite", c2.to_dict()) with self.assertWarns(DeprecationWarning): self.assertEqual(c.to_json(), c2.to_dict()) # test searching # by id c = ctx.char.select_single() c = db.char_by_id(c.ID, ctx=ctx) self.assertIsNotNone(c) self.assertTrue(c.rm_groups[0].readings) self.assertTrue(c.rm_groups[0].meanings) # by literal c = db.get_char('持', ctx=ctx) 
self.assertEqual(c.literal, '持') self.assertTrue(c.rm_groups[0].readings) self.assertTrue(c.rm_groups[0].meanings) def test_reading_order(self): db = self.ramdb with db.ctx() as ctx: fv = self.xdb.kd2.file_version dv = self.xdb.kd2.database_version doc = self.xdb.kd2.date_of_creation db.update_kd2_meta(fv, dv, doc, ctx) metas = ctx.meta.select() getLogger().debug("KanjiDic2 meta: {}".format(metas)) for c in self.xdb: db.insert_char(c, ctx) c = db.get_char('持', ctx=ctx) rmg = c.rm_groups[0] self.assertEqual(["ジ"], [x.value for x in rmg.on_readings]) self.assertEqual(['も.つ', '-も.ち', 'も.てる'], [k.value for k in rmg.kun_readings]) self.assertEqual([('chi2', 'pinyin'), ('ji', 'korean_r'), ('지', 'korean_h'), ('Trì', 'vietnam')], [(x.value, x.r_type) for x in rmg.other_readings]) expected = [{'type': 'ja_on', 'value': 'ジ', 'on_type': '', 'r_status': ''}, {'type': 'ja_kun', 'value': 'も.つ', 'on_type': '', 'r_status': ''}, {'type': 'ja_kun', 'value': '-も.ち', 'on_type': '', 'r_status': ''}, {'type': 'ja_kun', 'value': 'も.てる', 'on_type': '', 'r_status': ''}, {'type': 'pinyin', 'value': 'chi2', 'on_type': '', 'r_status': ''}, {'type': 'korean_r', 'value': 'ji', 'on_type': '', 'r_status': ''}, {'type': 'korean_h', 'value': '지', 'on_type': '', 'r_status': ''}, {'type': 'vietnam', 'value': 'Trì', 'on_type': '', 'r_status': ''}] actual = c.rm_groups[0].to_dict()['readings'] self.assertEqual(expected, actual) # ------------------------------------------------------------------------------- # Main # ------------------------------------------------------------------------------- if __name__ == "__main__": unittest.main() ================================================ FILE: test/test_krad.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Script for testing KRad module library References: RADKFILE/KRADFILE This project provides a decomposition of kanji into a number of visual elements or radicals to support software which provides a 
lookup service using kanji components. https://www.edrdg.org/krad/kradinf.html """ # This code is a part of jamdict library: https://github.com/neocl/jamdict # :copyright: (c) 2016 Le Tuan Anh # :license: MIT, see LICENSE for more details. import logging import os import unittest from jamdict import config from jamdict.krad import KRad ######################################################################## MY_DIR = os.path.abspath(os.path.dirname(__file__)) TEST_DATA = os.path.join(MY_DIR, 'data') MINI_JMD = os.path.join(TEST_DATA, 'JMdict_mini.xml') MINI_KD2 = os.path.join(TEST_DATA, 'kanjidic2_mini.xml') TEST_DB = os.path.join(TEST_DATA, 'jamdict_test.db') def getLogger(): return logging.getLogger(__name__) class TestConfig(unittest.TestCase): def test_config(self): cfg = config.read_config() self.assertIn('KD2_XML', cfg) self.assertTrue(config.get_file('KD2_XML')) getLogger().info("jamdict log file location: {}".format(config._get_config_manager().locate_config())) class TestModels(unittest.TestCase): def test_read_krad(self): krad = KRad() self.assertEqual(krad.krad['㘅'], ['亅', '二', '口', '彳', '金']) self.assertEqual(krad.krad['𪚲'], ['乙', '勹', '月', '田', '亀']) self.assertEqual(krad.radk['龠'], {'籥', '鸙', '龢', '龠', '龡', '籲', '瀹', '龥', '禴', '鑰', '爚', '龣'}) ######################################################################## if __name__ == "__main__": logging.getLogger('jamdict').setLevel(logging.DEBUG) unittest.main() ================================================ FILE: test.sh ================================================ #!/bin/bash python3 -m unittest discover