[
  {
    "path": ".github/FUNDING.yml",
    "content": "custom: ['https://www.paypal.me/MMandalka']\n"
  },
  {
    "path": ".gitignore",
    "content": "__pycache__\n.project\n.pydevproject\n.settings\n"
  },
  {
    "path": ".gitmodules",
    "content": "[submodule \"src/open-semantic-entity-search-api\"]\n\tpath = src/open-semantic-entity-search-api\n\turl = https://github.com/opensemanticsearch/open-semantic-entity-search-api.git\n\tbranch = master\n[submodule \"src/tesseract-ocr-cache\"]\n\tpath = src/tesseract-ocr-cache\n\turl = https://github.com/opensemanticsearch/tesseract-ocr-cache.git\n"
  },
  {
    "path": "DEBIAN/conffiles",
    "content": "/etc/opensemanticsearch/etl\n/etc/opensemanticsearch/filemonitoring/files\n/etc/opensemanticsearch/connector-files\n/etc/opensemanticsearch/connector-web\n/etc/opensemanticsearch/enhancer-rdf\n/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname\n/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix\n/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix\n/etc/opensemanticsearch/blacklist/blacklist-url\n/etc/opensemanticsearch/blacklist/blacklist-url-prefix\n/etc/opensemanticsearch/blacklist/blacklist-url-suffix\n/etc/opensemanticsearch/blacklist/blacklist-url-regex\n/etc/opensemanticsearch/blacklist/whitelist-url\n/etc/opensemanticsearch/blacklist/whitelist-url-prefix\n/etc/opensemanticsearch/blacklist/whitelist-url-suffix\n/etc/opensemanticsearch/blacklist/whitelist-url-regex\n/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype\n/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-prefix\n/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-suffix\n/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-regex\n/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype\n/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-prefix\n/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-suffix\n/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-regex\n"
  },
  {
    "path": "DEBIAN/control",
    "content": "Package: open-semantic-etl\nVersion: 21.10.18\nSection: misc\nPriority: optional\nArchitecture: all\nDepends: tika-server(>=0), python3-tika(>=0), curl(>=0), python3-pycurl(>=0), python3-rdflib(>=0), python3-sparqlwrapper(>=0), file(>=0), python3-requests(>=0), python3-pysolr(>=0), python3-dateutil(>=0), python3-lxml(>=0), python3-feedparser(>=0), poppler-utils(>=0), pst-utils(>=0),rabbitmq-server(>=0),python3-pyinotify(>=0),python3-pip(>=0), python3-dev(>=0), build-essential(>=0), libssl-dev(>=0), libffi-dev(>=0), tesseract-ocr(>=0), tesseract-ocr-deu(>=0)\nInstalled-Size: 100\nMaintainer: Markus Mandalka <debian@mandalka.name>\nHomepage: https://opensemanticsearch.org/\nDescription: Crawler to index files and directories to Solr\n Index your files to Solr.\n If tesseract-ocr installed there will be character recognition on images.\n Hint: install ocr language files like tesseract-ocr-deu for german texts.\n"
  },
  {
    "path": "DEBIAN/postinst",
    "content": "#!/bin/sh\n\nadduser --system --disabled-password opensemanticetl\ngroupadd -r tesseract_cache\nusermod -a -G tesseract_cache opensemanticetl\n\n# rights for OCR cache\nchown opensemanticetl:tesseract_cache /var/cache/tesseract\nchmod 770 /var/cache/tesseract\n\n# rights for thumbnail dir\nchown opensemanticetl /var/opensemanticsearch/media/thumbnails\nchmod o+w /var/opensemanticsearch/media/thumbnails\n\n\n# install dependencies\npip3 install -r /usr/lib/python3/dist-packages/opensemanticetl/requirements.txt\n\n\n# load our additional systemd service config\nsystemctl daemon-reload\n\n# start while booting\nsystemctl enable opensemanticetl\nsystemctl enable opensemanticetl-filemonitoring\n\n# (re)start after installation (or upgrade)\nsystemctl restart opensemanticetl\n"
  },
  {
    "path": "DEBIAN/prerm",
    "content": "#!/bin/sh\n\nsystemctl disable opensemanticetl-filemonitoring\n\nsystemctl stop opensemanticetl-filemonitoring\n\nsystemctl disable opensemanticetl\n\nsystemctl stop opensemanticetl\n\nexit 0\n"
  },
  {
    "path": "Dockerfile",
    "content": "ARG FROM=debian:bullseye\nFROM ${FROM}\n\nENV DEBIAN_FRONTEND=noninteractive\nENV CRYPTOGRAPHY_DONT_BUILD_RUST=1\n\nRUN apt-get update && apt-get install --no-install-recommends --yes \\\n    build-essential \\\n    curl \\\n    file \\\n    libffi-dev \\\n    librabbitmq4 \\\n    libssl-dev \\\n    poppler-utils \\\n    pst-utils \\\n    python3-dateutil \\\n    python3-dev \\\n    python3-feedparser \\\n    python3-lxml \\\n    python3-pip \\\n    python3-pycurl \\\n    python3-pyinotify \\\n    python3-pysolr \\\n    python3-rdflib \\\n    python3-requests \\\n    python3-scrapy \\\n    python3-setuptools \\\n    python3-sparqlwrapper \\\n    python3-wheel \\\n    tesseract-ocr \\\n#    tesseract-ocr-all \\\n    && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*\n\nCOPY ./src/opensemanticetl/requirements.txt /usr/lib/python3/dist-packages/opensemanticetl/requirements.txt\n# install Python PIP dependecies\nRUN pip3 install -r /usr/lib/python3/dist-packages/opensemanticetl/requirements.txt\n\nCOPY ./src/opensemanticetl /usr/lib/python3/dist-packages/opensemanticetl\nCOPY ./src/tesseract-ocr-cache/tesseract_cache /usr/lib/python3/dist-packages/tesseract_cache\nCOPY ./src/tesseract-ocr-cache/tesseract_fake /usr/lib/python3/dist-packages/tesseract_fake\nCOPY ./src/open-semantic-entity-search-api/src/entity_linking /usr/lib/python3/dist-packages/entity_linking\nCOPY ./src/open-semantic-entity-search-api/src/entity_manager /usr/lib/python3/dist-packages/entity_manager\n\nCOPY docker-entrypoint.sh /\nRUN chmod 755 /docker-entrypoint.sh\n\n# add user\nRUN adduser --system --disabled-password opensemanticetl\n\nRUN mkdir /var/cache/tesseract\nRUN chown opensemanticetl /var/cache/tesseract\n\nUSER opensemanticetl\n\n# start Open Semantic ETL celery workers (reading and executing ETL tasks from message queue)\nCMD [\"/docker-entrypoint.sh\"]\n"
  },
  {
    "path": "LICENSE",
    "content": "                    GNU GENERAL PUBLIC LICENSE\n                       Version 3, 29 June 2007\n\n Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>\n Everyone is permitted to copy and distribute verbatim copies\n of this license document, but changing it is not allowed.\n\n                            Preamble\n\n  The GNU General Public License is a free, copyleft license for\nsoftware and other kinds of works.\n\n  The licenses for most software and other practical works are designed\nto take away your freedom to share and change the works.  By contrast,\nthe GNU General Public License is intended to guarantee your freedom to\nshare and change all versions of a program--to make sure it remains free\nsoftware for all its users.  We, the Free Software Foundation, use the\nGNU General Public License for most of our software; it applies also to\nany other work released this way by its authors.  You can apply it to\nyour programs, too.\n\n  When we speak of free software, we are referring to freedom, not\nprice.  Our General Public Licenses are designed to make sure that you\nhave the freedom to distribute copies of free software (and charge for\nthem if you wish), that you receive source code or can get it if you\nwant it, that you can change the software or use pieces of it in new\nfree programs, and that you know you can do these things.\n\n  To protect your rights, we need to prevent others from denying you\nthese rights or asking you to surrender the rights.  Therefore, you have\ncertain responsibilities if you distribute copies of the software, or if\nyou modify it: responsibilities to respect the freedom of others.\n\n  For example, if you distribute copies of such a program, whether\ngratis or for a fee, you must pass on to the recipients the same\nfreedoms that you received.  You must make sure that they, too, receive\nor can get the source code.  And you must show them these terms so they\nknow their rights.\n\n  Developers that use the GNU GPL protect your rights with two steps:\n(1) assert copyright on the software, and (2) offer you this License\ngiving you legal permission to copy, distribute and/or modify it.\n\n  For the developers' and authors' protection, the GPL clearly explains\nthat there is no warranty for this free software.  For both users' and\nauthors' sake, the GPL requires that modified versions be marked as\nchanged, so that their problems will not be attributed erroneously to\nauthors of previous versions.\n\n  Some devices are designed to deny users access to install or run\nmodified versions of the software inside them, although the manufacturer\ncan do so.  This is fundamentally incompatible with the aim of\nprotecting users' freedom to change the software.  The systematic\npattern of such abuse occurs in the area of products for individuals to\nuse, which is precisely where it is most unacceptable.  Therefore, we\nhave designed this version of the GPL to prohibit the practice for those\nproducts.  If such problems arise substantially in other domains, we\nstand ready to extend this provision to those domains in future versions\nof the GPL, as needed to protect the freedom of users.\n\n  Finally, every program is threatened constantly by software patents.\nStates should not allow patents to restrict development and use of\nsoftware on general-purpose computers, but in those that do, we wish to\navoid the special danger that patents applied to a free program could\nmake it effectively proprietary.  
To prevent this, the GPL assures that\npatents cannot be used to render the program non-free.\n\n  The precise terms and conditions for copying, distribution and\nmodification follow.\n\n                       TERMS AND CONDITIONS\n\n  0. Definitions.\n\n  \"This License\" refers to version 3 of the GNU General Public License.\n\n  \"Copyright\" also means copyright-like laws that apply to other kinds of\nworks, such as semiconductor masks.\n\n  \"The Program\" refers to any copyrightable work licensed under this\nLicense.  Each licensee is addressed as \"you\".  \"Licensees\" and\n\"recipients\" may be individuals or organizations.\n\n  To \"modify\" a work means to copy from or adapt all or part of the work\nin a fashion requiring copyright permission, other than the making of an\nexact copy.  The resulting work is called a \"modified version\" of the\nearlier work or a work \"based on\" the earlier work.\n\n  A \"covered work\" means either the unmodified Program or a work based\non the Program.\n\n  To \"propagate\" a work means to do anything with it that, without\npermission, would make you directly or secondarily liable for\ninfringement under applicable copyright law, except executing it on a\ncomputer or modifying a private copy.  Propagation includes copying,\ndistribution (with or without modification), making available to the\npublic, and in some countries other activities as well.\n\n  To \"convey\" a work means any kind of propagation that enables other\nparties to make or receive copies.  Mere interaction with a user through\na computer network, with no transfer of a copy, is not conveying.\n\n  An interactive user interface displays \"Appropriate Legal Notices\"\nto the extent that it includes a convenient and prominently visible\nfeature that (1) displays an appropriate copyright notice, and (2)\ntells the user that there is no warranty for the work (except to the\nextent that warranties are provided), that licensees may convey the\nwork under this License, and how to view a copy of this License.  If\nthe interface presents a list of user commands or options, such as a\nmenu, a prominent item in the list meets this criterion.\n\n  1. Source Code.\n\n  The \"source code\" for a work means the preferred form of the work\nfor making modifications to it.  \"Object code\" means any non-source\nform of a work.\n\n  A \"Standard Interface\" means an interface that either is an official\nstandard defined by a recognized standards body, or, in the case of\ninterfaces specified for a particular programming language, one that\nis widely used among developers working in that language.\n\n  The \"System Libraries\" of an executable work include anything, other\nthan the work as a whole, that (a) is included in the normal form of\npackaging a Major Component, but which is not part of that Major\nComponent, and (b) serves only to enable use of the work with that\nMajor Component, or to implement a Standard Interface for which an\nimplementation is available to the public in source code form.  
A\n\"Major Component\", in this context, means a major essential component\n(kernel, window system, and so on) of the specific operating system\n(if any) on which the executable work runs, or a compiler used to\nproduce the work, or an object code interpreter used to run it.\n\n  The \"Corresponding Source\" for a work in object code form means all\nthe source code needed to generate, install, and (for an executable\nwork) run the object code and to modify the work, including scripts to\ncontrol those activities.  However, it does not include the work's\nSystem Libraries, or general-purpose tools or generally available free\nprograms which are used unmodified in performing those activities but\nwhich are not part of the work.  For example, Corresponding Source\nincludes interface definition files associated with source files for\nthe work, and the source code for shared libraries and dynamically\nlinked subprograms that the work is specifically designed to require,\nsuch as by intimate data communication or control flow between those\nsubprograms and other parts of the work.\n\n  The Corresponding Source need not include anything that users\ncan regenerate automatically from other parts of the Corresponding\nSource.\n\n  The Corresponding Source for a work in source code form is that\nsame work.\n\n  2. Basic Permissions.\n\n  All rights granted under this License are granted for the term of\ncopyright on the Program, and are irrevocable provided the stated\nconditions are met.  This License explicitly affirms your unlimited\npermission to run the unmodified Program.  The output from running a\ncovered work is covered by this License only if the output, given its\ncontent, constitutes a covered work.  This License acknowledges your\nrights of fair use or other equivalent, as provided by copyright law.\n\n  You may make, run and propagate covered works that you do not\nconvey, without conditions so long as your license otherwise remains\nin force.  You may convey covered works to others for the sole purpose\nof having them make modifications exclusively for you, or provide you\nwith facilities for running those works, provided that you comply with\nthe terms of this License in conveying all material for which you do\nnot control copyright.  Those thus making or running the covered works\nfor you must do so exclusively on your behalf, under your direction\nand control, on terms that prohibit them from making any copies of\nyour copyrighted material outside their relationship with you.\n\n  Conveying under any other circumstances is permitted solely under\nthe conditions stated below.  Sublicensing is not allowed; section 10\nmakes it unnecessary.\n\n  3. Protecting Users' Legal Rights From Anti-Circumvention Law.\n\n  No covered work shall be deemed part of an effective technological\nmeasure under any applicable law fulfilling obligations under article\n11 of the WIPO copyright treaty adopted on 20 December 1996, or\nsimilar laws prohibiting or restricting circumvention of such\nmeasures.\n\n  When you convey a covered work, you waive any legal power to forbid\ncircumvention of technological measures to the extent such circumvention\nis effected by exercising rights under this License with respect to\nthe covered work, and you disclaim any intention to limit operation or\nmodification of the work as a means of enforcing, against the work's\nusers, your or third parties' legal rights to forbid circumvention of\ntechnological measures.\n\n  4. 
Conveying Verbatim Copies.\n\n  You may convey verbatim copies of the Program's source code as you\nreceive it, in any medium, provided that you conspicuously and\nappropriately publish on each copy an appropriate copyright notice;\nkeep intact all notices stating that this License and any\nnon-permissive terms added in accord with section 7 apply to the code;\nkeep intact all notices of the absence of any warranty; and give all\nrecipients a copy of this License along with the Program.\n\n  You may charge any price or no price for each copy that you convey,\nand you may offer support or warranty protection for a fee.\n\n  5. Conveying Modified Source Versions.\n\n  You may convey a work based on the Program, or the modifications to\nproduce it from the Program, in the form of source code under the\nterms of section 4, provided that you also meet all of these conditions:\n\n    a) The work must carry prominent notices stating that you modified\n    it, and giving a relevant date.\n\n    b) The work must carry prominent notices stating that it is\n    released under this License and any conditions added under section\n    7.  This requirement modifies the requirement in section 4 to\n    \"keep intact all notices\".\n\n    c) You must license the entire work, as a whole, under this\n    License to anyone who comes into possession of a copy.  This\n    License will therefore apply, along with any applicable section 7\n    additional terms, to the whole of the work, and all its parts,\n    regardless of how they are packaged.  This License gives no\n    permission to license the work in any other way, but it does not\n    invalidate such permission if you have separately received it.\n\n    d) If the work has interactive user interfaces, each must display\n    Appropriate Legal Notices; however, if the Program has interactive\n    interfaces that do not display Appropriate Legal Notices, your\n    work need not make them do so.\n\n  A compilation of a covered work with other separate and independent\nworks, which are not by their nature extensions of the covered work,\nand which are not combined with it such as to form a larger program,\nin or on a volume of a storage or distribution medium, is called an\n\"aggregate\" if the compilation and its resulting copyright are not\nused to limit the access or legal rights of the compilation's users\nbeyond what the individual works permit.  Inclusion of a covered work\nin an aggregate does not cause this License to apply to the other\nparts of the aggregate.\n\n  6. 
Conveying Non-Source Forms.\n\n  You may convey a covered work in object code form under the terms\nof sections 4 and 5, provided that you also convey the\nmachine-readable Corresponding Source under the terms of this License,\nin one of these ways:\n\n    a) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by the\n    Corresponding Source fixed on a durable physical medium\n    customarily used for software interchange.\n\n    b) Convey the object code in, or embodied in, a physical product\n    (including a physical distribution medium), accompanied by a\n    written offer, valid for at least three years and valid for as\n    long as you offer spare parts or customer support for that product\n    model, to give anyone who possesses the object code either (1) a\n    copy of the Corresponding Source for all the software in the\n    product that is covered by this License, on a durable physical\n    medium customarily used for software interchange, for a price no\n    more than your reasonable cost of physically performing this\n    conveying of source, or (2) access to copy the\n    Corresponding Source from a network server at no charge.\n\n    c) Convey individual copies of the object code with a copy of the\n    written offer to provide the Corresponding Source.  This\n    alternative is allowed only occasionally and noncommercially, and\n    only if you received the object code with such an offer, in accord\n    with subsection 6b.\n\n    d) Convey the object code by offering access from a designated\n    place (gratis or for a charge), and offer equivalent access to the\n    Corresponding Source in the same way through the same place at no\n    further charge.  You need not require recipients to copy the\n    Corresponding Source along with the object code.  If the place to\n    copy the object code is a network server, the Corresponding Source\n    may be on a different server (operated by you or a third party)\n    that supports equivalent copying facilities, provided you maintain\n    clear directions next to the object code saying where to find the\n    Corresponding Source.  Regardless of what server hosts the\n    Corresponding Source, you remain obligated to ensure that it is\n    available for as long as needed to satisfy these requirements.\n\n    e) Convey the object code using peer-to-peer transmission, provided\n    you inform other peers where the object code and Corresponding\n    Source of the work are being offered to the general public at no\n    charge under subsection 6d.\n\n  A separable portion of the object code, whose source code is excluded\nfrom the Corresponding Source as a System Library, need not be\nincluded in conveying the object code work.\n\n  A \"User Product\" is either (1) a \"consumer product\", which means any\ntangible personal property which is normally used for personal, family,\nor household purposes, or (2) anything designed or sold for incorporation\ninto a dwelling.  In determining whether a product is a consumer product,\ndoubtful cases shall be resolved in favor of coverage.  For a particular\nproduct received by a particular user, \"normally used\" refers to a\ntypical or common use of that class of product, regardless of the status\nof the particular user or of the way in which the particular user\nactually uses, or expects or is expected to use, the product.  
A product\nis a consumer product regardless of whether the product has substantial\ncommercial, industrial or non-consumer uses, unless such uses represent\nthe only significant mode of use of the product.\n\n  \"Installation Information\" for a User Product means any methods,\nprocedures, authorization keys, or other information required to install\nand execute modified versions of a covered work in that User Product from\na modified version of its Corresponding Source.  The information must\nsuffice to ensure that the continued functioning of the modified object\ncode is in no case prevented or interfered with solely because\nmodification has been made.\n\n  If you convey an object code work under this section in, or with, or\nspecifically for use in, a User Product, and the conveying occurs as\npart of a transaction in which the right of possession and use of the\nUser Product is transferred to the recipient in perpetuity or for a\nfixed term (regardless of how the transaction is characterized), the\nCorresponding Source conveyed under this section must be accompanied\nby the Installation Information.  But this requirement does not apply\nif neither you nor any third party retains the ability to install\nmodified object code on the User Product (for example, the work has\nbeen installed in ROM).\n\n  The requirement to provide Installation Information does not include a\nrequirement to continue to provide support service, warranty, or updates\nfor a work that has been modified or installed by the recipient, or for\nthe User Product in which it has been modified or installed.  Access to a\nnetwork may be denied when the modification itself materially and\nadversely affects the operation of the network or violates the rules and\nprotocols for communication across the network.\n\n  Corresponding Source conveyed, and Installation Information provided,\nin accord with this section must be in a format that is publicly\ndocumented (and with an implementation available to the public in\nsource code form), and must require no special password or key for\nunpacking, reading or copying.\n\n  7. Additional Terms.\n\n  \"Additional permissions\" are terms that supplement the terms of this\nLicense by making exceptions from one or more of its conditions.\nAdditional permissions that are applicable to the entire Program shall\nbe treated as though they were included in this License, to the extent\nthat they are valid under applicable law.  If additional permissions\napply only to part of the Program, that part may be used separately\nunder those permissions, but the entire Program remains governed by\nthis License without regard to the additional permissions.\n\n  When you convey a copy of a covered work, you may at your option\nremove any additional permissions from that copy, or from any part of\nit.  (Additional permissions may be written to require their own\nremoval in certain cases when you modify the work.)  
You may place\nadditional permissions on material, added by you to a covered work,\nfor which you have or can give appropriate copyright permission.\n\n  Notwithstanding any other provision of this License, for material you\nadd to a covered work, you may (if authorized by the copyright holders of\nthat material) supplement the terms of this License with terms:\n\n    a) Disclaiming warranty or limiting liability differently from the\n    terms of sections 15 and 16 of this License; or\n\n    b) Requiring preservation of specified reasonable legal notices or\n    author attributions in that material or in the Appropriate Legal\n    Notices displayed by works containing it; or\n\n    c) Prohibiting misrepresentation of the origin of that material, or\n    requiring that modified versions of such material be marked in\n    reasonable ways as different from the original version; or\n\n    d) Limiting the use for publicity purposes of names of licensors or\n    authors of the material; or\n\n    e) Declining to grant rights under trademark law for use of some\n    trade names, trademarks, or service marks; or\n\n    f) Requiring indemnification of licensors and authors of that\n    material by anyone who conveys the material (or modified versions of\n    it) with contractual assumptions of liability to the recipient, for\n    any liability that these contractual assumptions directly impose on\n    those licensors and authors.\n\n  All other non-permissive additional terms are considered \"further\nrestrictions\" within the meaning of section 10.  If the Program as you\nreceived it, or any part of it, contains a notice stating that it is\ngoverned by this License along with a term that is a further\nrestriction, you may remove that term.  If a license document contains\na further restriction but permits relicensing or conveying under this\nLicense, you may add to a covered work material governed by the terms\nof that license document, provided that the further restriction does\nnot survive such relicensing or conveying.\n\n  If you add terms to a covered work in accord with this section, you\nmust place, in the relevant source files, a statement of the\nadditional terms that apply to those files, or a notice indicating\nwhere to find the applicable terms.\n\n  Additional terms, permissive or non-permissive, may be stated in the\nform of a separately written license, or stated as exceptions;\nthe above requirements apply either way.\n\n  8. Termination.\n\n  You may not propagate or modify a covered work except as expressly\nprovided under this License.  
Any attempt otherwise to propagate or\nmodify it is void, and will automatically terminate your rights under\nthis License (including any patent licenses granted under the third\nparagraph of section 11).\n\n  However, if you cease all violation of this License, then your\nlicense from a particular copyright holder is reinstated (a)\nprovisionally, unless and until the copyright holder explicitly and\nfinally terminates your license, and (b) permanently, if the copyright\nholder fails to notify you of the violation by some reasonable means\nprior to 60 days after the cessation.\n\n  Moreover, your license from a particular copyright holder is\nreinstated permanently if the copyright holder notifies you of the\nviolation by some reasonable means, this is the first time you have\nreceived notice of violation of this License (for any work) from that\ncopyright holder, and you cure the violation prior to 30 days after\nyour receipt of the notice.\n\n  Termination of your rights under this section does not terminate the\nlicenses of parties who have received copies or rights from you under\nthis License.  If your rights have been terminated and not permanently\nreinstated, you do not qualify to receive new licenses for the same\nmaterial under section 10.\n\n  9. Acceptance Not Required for Having Copies.\n\n  You are not required to accept this License in order to receive or\nrun a copy of the Program.  Ancillary propagation of a covered work\noccurring solely as a consequence of using peer-to-peer transmission\nto receive a copy likewise does not require acceptance.  However,\nnothing other than this License grants you permission to propagate or\nmodify any covered work.  These actions infringe copyright if you do\nnot accept this License.  Therefore, by modifying or propagating a\ncovered work, you indicate your acceptance of this License to do so.\n\n  10. Automatic Licensing of Downstream Recipients.\n\n  Each time you convey a covered work, the recipient automatically\nreceives a license from the original licensors, to run, modify and\npropagate that work, subject to this License.  You are not responsible\nfor enforcing compliance by third parties with this License.\n\n  An \"entity transaction\" is a transaction transferring control of an\norganization, or substantially all assets of one, or subdividing an\norganization, or merging organizations.  If propagation of a covered\nwork results from an entity transaction, each party to that\ntransaction who receives a copy of the work also receives whatever\nlicenses to the work the party's predecessor in interest had or could\ngive under the previous paragraph, plus a right to possession of the\nCorresponding Source of the work from the predecessor in interest, if\nthe predecessor has it or can get it with reasonable efforts.\n\n  You may not impose any further restrictions on the exercise of the\nrights granted or affirmed under this License.  For example, you may\nnot impose a license fee, royalty, or other charge for exercise of\nrights granted under this License, and you may not initiate litigation\n(including a cross-claim or counterclaim in a lawsuit) alleging that\nany patent claim is infringed by making, using, selling, offering for\nsale, or importing the Program or any portion of it.\n\n  11. Patents.\n\n  A \"contributor\" is a copyright holder who authorizes use under this\nLicense of the Program or a work on which the Program is based.  
The\nwork thus licensed is called the contributor's \"contributor version\".\n\n  A contributor's \"essential patent claims\" are all patent claims\nowned or controlled by the contributor, whether already acquired or\nhereafter acquired, that would be infringed by some manner, permitted\nby this License, of making, using, or selling its contributor version,\nbut do not include claims that would be infringed only as a\nconsequence of further modification of the contributor version.  For\npurposes of this definition, \"control\" includes the right to grant\npatent sublicenses in a manner consistent with the requirements of\nthis License.\n\n  Each contributor grants you a non-exclusive, worldwide, royalty-free\npatent license under the contributor's essential patent claims, to\nmake, use, sell, offer for sale, import and otherwise run, modify and\npropagate the contents of its contributor version.\n\n  In the following three paragraphs, a \"patent license\" is any express\nagreement or commitment, however denominated, not to enforce a patent\n(such as an express permission to practice a patent or covenant not to\nsue for patent infringement).  To \"grant\" such a patent license to a\nparty means to make such an agreement or commitment not to enforce a\npatent against the party.\n\n  If you convey a covered work, knowingly relying on a patent license,\nand the Corresponding Source of the work is not available for anyone\nto copy, free of charge and under the terms of this License, through a\npublicly available network server or other readily accessible means,\nthen you must either (1) cause the Corresponding Source to be so\navailable, or (2) arrange to deprive yourself of the benefit of the\npatent license for this particular work, or (3) arrange, in a manner\nconsistent with the requirements of this License, to extend the patent\nlicense to downstream recipients.  \"Knowingly relying\" means you have\nactual knowledge that, but for the patent license, your conveying the\ncovered work in a country, or your recipient's use of the covered work\nin a country, would infringe one or more identifiable patents in that\ncountry that you have reason to believe are valid.\n\n  If, pursuant to or in connection with a single transaction or\narrangement, you convey, or propagate by procuring conveyance of, a\ncovered work, and grant a patent license to some of the parties\nreceiving the covered work authorizing them to use, propagate, modify\nor convey a specific copy of the covered work, then the patent license\nyou grant is automatically extended to all recipients of the covered\nwork and works based on it.\n\n  A patent license is \"discriminatory\" if it does not include within\nthe scope of its coverage, prohibits the exercise of, or is\nconditioned on the non-exercise of one or more of the rights that are\nspecifically granted under this License.  
You may not convey a covered\nwork if you are a party to an arrangement with a third party that is\nin the business of distributing software, under which you make payment\nto the third party based on the extent of your activity of conveying\nthe work, and under which the third party grants, to any of the\nparties who would receive the covered work from you, a discriminatory\npatent license (a) in connection with copies of the covered work\nconveyed by you (or copies made from those copies), or (b) primarily\nfor and in connection with specific products or compilations that\ncontain the covered work, unless you entered into that arrangement,\nor that patent license was granted, prior to 28 March 2007.\n\n  Nothing in this License shall be construed as excluding or limiting\nany implied license or other defenses to infringement that may\notherwise be available to you under applicable patent law.\n\n  12. No Surrender of Others' Freedom.\n\n  If conditions are imposed on you (whether by court order, agreement or\notherwise) that contradict the conditions of this License, they do not\nexcuse you from the conditions of this License.  If you cannot convey a\ncovered work so as to satisfy simultaneously your obligations under this\nLicense and any other pertinent obligations, then as a consequence you may\nnot convey it at all.  For example, if you agree to terms that obligate you\nto collect a royalty for further conveying from those to whom you convey\nthe Program, the only way you could satisfy both those terms and this\nLicense would be to refrain entirely from conveying the Program.\n\n  13. Use with the GNU Affero General Public License.\n\n  Notwithstanding any other provision of this License, you have\npermission to link or combine any covered work with a work licensed\nunder version 3 of the GNU Affero General Public License into a single\ncombined work, and to convey the resulting work.  The terms of this\nLicense will continue to apply to the part which is the covered work,\nbut the special requirements of the GNU Affero General Public License,\nsection 13, concerning interaction through a network will apply to the\ncombination as such.\n\n  14. Revised Versions of this License.\n\n  The Free Software Foundation may publish revised and/or new versions of\nthe GNU General Public License from time to time.  Such new versions will\nbe similar in spirit to the present version, but may differ in detail to\naddress new problems or concerns.\n\n  Each version is given a distinguishing version number.  If the\nProgram specifies that a certain numbered version of the GNU General\nPublic License \"or any later version\" applies to it, you have the\noption of following the terms and conditions either of that numbered\nversion or of any later version published by the Free Software\nFoundation.  If the Program does not specify a version number of the\nGNU General Public License, you may choose any version ever published\nby the Free Software Foundation.\n\n  If the Program specifies that a proxy can decide which future\nversions of the GNU General Public License can be used, that proxy's\npublic statement of acceptance of a version permanently authorizes you\nto choose that version for the Program.\n\n  Later license versions may give you additional or different\npermissions.  However, no additional obligations are imposed on any\nauthor or copyright holder as a result of your choosing to follow a\nlater version.\n\n  15. 
Disclaimer of Warranty.\n\n  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY\nAPPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT\nHOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM \"AS IS\" WITHOUT WARRANTY\nOF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,\nTHE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\nPURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM\nIS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF\nALL NECESSARY SERVICING, REPAIR OR CORRECTION.\n\n  16. Limitation of Liability.\n\n  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING\nWILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS\nTHE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY\nGENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE\nUSE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF\nDATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD\nPARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),\nEVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF\nSUCH DAMAGES.\n\n  17. Interpretation of Sections 15 and 16.\n\n  If the disclaimer of warranty and limitation of liability provided\nabove cannot be given local legal effect according to their terms,\nreviewing courts shall apply local law that most closely approximates\nan absolute waiver of all civil liability in connection with the\nProgram, unless a warranty or assumption of liability accompanies a\ncopy of the Program in return for a fee.\n\n                     END OF TERMS AND CONDITIONS\n\n            How to Apply These Terms to Your New Programs\n\n  If you develop a new program, and you want it to be of the greatest\npossible use to the public, the best way to achieve this is to make it\nfree software which everyone can redistribute and change under these terms.\n\n  To do so, attach the following notices to the program.  It is safest\nto attach them to the start of each source file to most effectively\nstate the exclusion of warranty; and each file should have at least\nthe \"copyright\" line and a pointer to where the full notice is found.\n\n    {one line to give the program's name and a brief idea of what it does.}\n    Copyright (C) {year}  {name of author}\n\n    This program is free software: you can redistribute it and/or modify\n    it under the terms of the GNU General Public License as published by\n    the Free Software Foundation, either version 3 of the License, or\n    (at your option) any later version.\n\n    This program is distributed in the hope that it will be useful,\n    but WITHOUT ANY WARRANTY; without even the implied warranty of\n    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n    GNU General Public License for more details.\n\n    You should have received a copy of the GNU General Public License\n    along with this program.  
If not, see <http://www.gnu.org/licenses/>.\n\nAlso add information on how to contact you by electronic and paper mail.\n\n  If the program does terminal interaction, make it output a short\nnotice like this when it starts in an interactive mode:\n\n    {project}  Copyright (C) {year}  {fullname}\n    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.\n    This is free software, and you are welcome to redistribute it\n    under certain conditions; type `show c' for details.\n\nThe hypothetical commands `show w' and `show c' should show the appropriate\nparts of the General Public License.  Of course, your program's commands\nmight be different; for a GUI interface, you would use an \"about box\".\n\n  You should also get your employer (if you work as a programmer) or school,\nif any, to sign a \"copyright disclaimer\" for the program, if necessary.\nFor more information on this, and how to apply and follow the GNU GPL, see\n<http://www.gnu.org/licenses/>.\n\n  The GNU General Public License does not permit incorporating your program\ninto proprietary programs.  If your program is a subroutine library, you\nmay consider it more useful to permit linking proprietary applications with\nthe library.  If this is what you want to do, use the GNU Lesser General\nPublic License instead of this License.  But first, please read\n<http://www.gnu.org/philosophy/why-not-lgpl.html>.\n\n"
  },
  {
    "path": "build-deb",
    "content": "#!/bin/sh\n\nVERSION=`date +%y.%m.%d`\nPACKAGE=open-semantic-etl_${VERSION}.deb\nBUILDDIR=/tmp/open-semantic-etl-$$.deb\n\n\n#\n# Build standard package (preconfigured for Solr)\n#\n\necho \"Building ${PACKAGE} in temp directory ${BUILDDIR}\"\n\nmkdir ${BUILDDIR}\ncp -a DEBIAN ${BUILDDIR}/\ncp -a etc ${BUILDDIR}/\ncp -a usr ${BUILDDIR}/\nmkdir -p ${BUILDDIR}/usr/lib/python3/dist-packages\ncp -a src/* ${BUILDDIR}/usr/lib/python3/dist-packages/\n\nmkdir -p ${BUILDDIR}/var/cache/tesseract\n\nmkdir -p ${BUILDDIR}/var/opensemanticsearch/media/thumbnails\n\n# Build standard package (preconfigured for Solr)\ndpkg -b ${BUILDDIR} ${PACKAGE}\n\n\n#\n# Build alternate package (preconfigured for Elasticsearch)\n#\n\n# change config file and set export plugin to Elasticsearch\nPACKAGE=open-semantic-etl-elasticsearch_${VERSION}.deb\n\necho \"Building ${PACKAGE} in temp directory ${BUILDDIR}\"\n\n# change option \"config['export']\" in ${BUILDDIR}/etc/opensemanticsearch/etl from \"solr\" to \"elasticsearch\" by commenting / uncommenting\n\nsed -r -e \"s/(config\\['export'\\] = 'export_solr')/#\\1/g\" -e \"s/(config\\['index'\\] = 'core1')/#\\1/g\" -e \"s/(#)(config\\['export'\\] = 'export_elasticsearch')/\\2/\"  -e \"s/(#)(config\\['index'\\] = 'opensemanticsearch')/\\2/\" -i ${BUILDDIR}/etc/opensemanticsearch/etl\n\n# todo: delete dependency on pysolr\n\n# Build the alternate package\ndpkg -b ${BUILDDIR} ${PACKAGE}\n"
  },
  {
    "path": "docker-compose.test.yml",
    "content": "sut:\n  build: .\n  command: /usr/lib/python3/dist-packages/opensemanticetl/test/run_tests.sh\n"
  },
  {
    "path": "docker-compose.ubuntu.test.yml",
    "content": "version: '3'\nservices:\n  sut:\n    build:\n      context: .\n      args:\n        FROM: ubuntu:focal\n    command: /usr/lib/python3/dist-packages/opensemanticetl/test/run_tests.sh\n"
  },
  {
    "path": "docker-entrypoint.sh",
    "content": "#! /bin/sh\n\n# docker-entrypoint for opensemanticsearch/open-semantic-etl\n\n# wait for the apps container to finish initializing:\nwhile ! curl -m 1 -sf http://apps >/dev/null 2>&1\ndo\n\tsleep 1\ndone\n\nexec /usr/bin/python3 /usr/lib/python3/dist-packages/opensemanticetl/tasks.py\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/blacklist-url",
    "content": "# Blacklist of URLs\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/blacklist-url-prefix",
    "content": "# Blacklist of URL Prefixes like domains or paths\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/blacklist-url-regex",
    "content": "# Blacklist URLs with text patterns by regular expressions (regex)\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/blacklist-url-suffix",
    "content": "# Blacklist of URL Suffixes like file endings\n.css\n.CSS\n.Css"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_extract_law/blacklist-lawcode-if-no-clause",
    "content": "# Preferred labels of Law codes will be only added to facet \"Law code\",\n# if the following configured (alternate) labels are directly before or after a\n# law clause (f.e. in text \"abc § 123 CC xyz\"), but not if such blacklisted\n# (alternate) label stands alone\n# (f.e. in text \"abc CC xyz\" or in \"CC: mail@domain) because too ambiguous\n\n\n# too ambiguous alternate label from Wikidata entity Q206834 \"Swiss Civil Code\"\nCC\n\n# too ambiguous alternate label from Wikidata entity Q56045 \"Basic Law for the Federal Republic of Germany\"\nGG\n\n# too ambiguous alternate label from Wikidata entity Q187719 \"Corpus Juris Civilis\"\nInstitutes\n\n# too ambiguous alternate label from Wikidata entity Q7101313 \"Oregon Revised Statutes\"\nORS\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype",
    "content": "# Blacklist of contenttypes\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-prefix",
    "content": "# Blacklist of contenttype prefixes\n\n# Open Office / Libreoffice / MS Office\n# Open document format and MS office open xml format is a zip archive with the document as XML, the embedded images and meta data as XML\n# Tika will extract the main content, which - if you do not forensics - is enough in most cases.\n# So we dont want additional handle each single (metadata) file in this archive, so we deactivate the ZIP plugin for that content type\n# Since this is a prefix blacklist, it will stop unzip application/vnd.oasis.opendocument.text, application/vnd.oasis.opendocument.spreadsheet and so on ...\n\napplication/vnd.oasis.opendocument.\napplication/vnd.openxmlformats-officedocument.\napplication/msword\napplication/vnd.ms-word.\napplication/msexcel\napplication/vnd.ms-excel.\napplication/mspowerpoint\napplication/vnd.ms-powerpoint.\n\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-regex",
    "content": "# Blacklist contenttypes with text patterns by regular expressions (regex)\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-suffix",
    "content": "# Blacklist of contenttype suffixes\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype",
    "content": "# Whitelist of contenttypes\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-prefix",
    "content": "# Whitelist of contenttype prefixes\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-regex",
    "content": "# Whitelist contenttypes with text patterns by regular expressions (regex)\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-suffix",
    "content": "# Whitelist of contenttype suffixes\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname",
    "content": "language_s\ncontent_type_ss\ncontent_type_group_ss\nAEB Bracket Value_ss\nAE Setting_ss\nAF Area Height_ss\nAF Area Width_ss\nAF Area X Positions_ss\nAF Area Y Positions_ss\nAF Image Height_ss\nAF Image Width_ss\nAF Point Count_ss\nAF Point Selected_ss\nAF Points in Focus_ss\nAperture Value_ss\nAuto Exposure Bracketing_ss\nAuto ISO_ss\nAuto Rotate_ss\nBase ISO_ss\nBulb Duration_ss\nCamera Info Array_ss\nCamera Serial Number_ss\nCamera Temperature_ss\nCamera Type_ss\nCanon Model ID_ss\nContrast_ss\nComponents Configuration_ss\nCompressed Bits Per Pixel_ss\nCompression_ss\nColor Balance Array_ss\nColor Space_ss\nColor Temperature_ss\nColor Tone_ss\nContent-Encoding_s\nContinuous Drive Mode_ss\nControl Mode_ss\nCustom Functions_ss\nCustom Rendered_ss\ncreated_ss\nCreation-Date_ss\nData BitsPerSample_ss\nData PlanarConfiguration_ss\nData Precision_ss\nData SampleFormat_ss\nData SignificantBitsPerSample_ss\ndate_ss\ndc:format_ss\ndcterms:created_ss\ndcterms:modified_ss\nDimension ImageOrientation_ss\nDimension PixelAspectRatio_ss\nDigital Zoom_ss\nDisplay Aperture_ss\nEasy Shooting Mode_ss\nembeddedResourceType_ss\nExif Version_ss\nexif:DateTimeOriginal_ss\nexif:ExposureTime_ss\nexif:Flash_ss\nexif:FocalLength_ss\nexif:FNumber_ss\nExif Image Height_ss\nExif Image Width_ss\nexif:IsoSpeedRatings_ss\nExposure Bias Value_ss\nExposure Compensation_ss\nExposure Mode_ss\nExposure Time_ss\nF-Number_ss\nF Number_ss\nFile Name_ss\nFile Length_ss\nFile Modified Date_ss\nFile Info Array_ss\nFile Size_ss\nFirmware Version_ss\nFlash_ss\nFlashPix Version_ss\nFlash Activity_ss\nFlash Details_ss\nFlash Exposure Compensation_ss\nFlash Guide Number_ss\nFocal Length_ss\nFlash Mode_ss\nFocal Plane Resolution Unit_ss\nFocal Plane X Resolution_ss\nFocal Plane Y Resolution_ss\nFocal Units per mm_ss\nFocus Continuous_ss\nFocus Distance Lower_ss\nFocus Distance Upper_ss\nFocus Mode_ss\nFocus Type_ss\nheight_ss\nISO Speed Ratings_ss\nIHDR_ss\nImage Height_ss\nImage Number_ss\nImage Size_ss\nImage Width_ss\nImage Type_ss\nInteroperability Index_ss\nInteroperability Version_ss\nIso_ss\nLast-Modified_ss\nLast-Save-Date_ss\nLens Type_ss\nLong Focal Length_ss\nMacro Mode_ss\nManual Flash Output_ss\nMax Aperture_ss\nMax Aperture Value_ss\nMeasured Color Array_ss\nMeasured EV_ss\nmeta:creation-date_ss\nmeta:save-date_ss\nMetering Mode_ss\nMin Aperture_ss\nmodified_ss\nND Filter_ss\nNumber of Components_ss\nNumber of Tables_ss\nOrientation_ss\nOptical Zoom Code_ss\npdf:PDFVersion_ss\npdf:docinfo:created_ss\npdf:docinfo:creator_tool_ss\npdf:docinfo:modified_ss\npdf:docinfo:producer_ss\npdf:encrypted_ss\npdf:charsPerPage_ss\npdf:unmappedUnicodeCharsPerPage_ss\nPhoto Effect_ss\nproducer_ss\nRecord Mode_ss\nRelated Image Height_ss\nRelated Image Width_ss\nResolution Unit_ss\nResolution Units_ss\nSaturation_ss\nsBIT sBIT_RGBAlpha_ss\nScene Capture Type_ss\nSensing Method_ss\nSequence Number_ss\nSerial Number Format_ss\nSlow Shutter_ss\nSharpness_ss\nShort Focal Length_ss\nShutter Speed Value_ss\nSpot Metering Mode_ss\nSRAW Quality_ss\nTarget Aperture_ss\nTarget Exposure Time_ss\ntiff:BitsPerSample_ss\ntiff:ImageLength_ss\ntiff:ImageWidth_ss\ntiff:Make_ss\ntiff:Model_ss\ntiff:Orientation_ss\ntiff:ResolutionUnit_ss\ntiff:XResolution_ss\ntiff:YResolution_ss\nThumbnail Height Pixels_ss\nThumbnail Width Pixels_ss\nThumbnail Image Valid Area_ss\nThumbnail Length_ss\nThumbnail Offset_ss\nTransparency Alpha_ss\nValid AF Point Count_ss\nwidth_ss\nX-Parsed-By_ss\nX-TIKA:parse_time_millis_ss\nX 
Resolution_ss\nxmpTPg:NPages_ss\nxmp:CreatorTool_ss\nYCbCr Positioning_ss\nY Resolution_ss\nZoom Source Width_ss\nZoom Target Width_ss\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix",
    "content": "etl_\nX-TIKA\nAF Point\nChroma \nCompression \nComponent \nDate/Time\nMeasured EV \nPrimary AF Point \nSelf Timer \nUnknown Camera Setting \nUnknown tag \nWhite Balance\naccess_permission:\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix",
    "content": "_i\n_is\n_l\n_ls\n_b\n_bs\n_f\n_fs\n_d\n_ds\n_f\n_fs\n_dt\n_dts\n_uri_ss\n_matchtext_ss\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/whitelist-url",
    "content": "# Whitelist of URLs\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/whitelist-url-prefix",
    "content": "# Whitelist of URL Prefixes like domains or paths\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/whitelist-url-regex",
    "content": "# Whitelist URLs with text patterns by regular expressions (regex)\n"
  },
  {
    "path": "etc/opensemanticsearch/blacklist/whitelist-url-suffix",
    "content": "# Whitelist of URL Suffixes like file endings\n"
  },
  {
    "path": "etc/opensemanticsearch/connector-files",
    "content": "# -*- coding: utf-8 -*-\n\n# Config for opensemanticsearch-index-file\n\n\n# print Debug output\n#config['verbose'] = True\n\n\n# Index files again even if indexed before and modification time of file unchanged\n#config['force'] = True\n\n\n#\n# Mapping filename to URI\n#\n\n# if the users have other path (mountpoint with other path then the servers full path)\n# or protocol (http:// instead of file://)\n# you can map the servers path to the users path\n\n# default: user can access the file system, so /fullpath/filename will be mapped to file:///fullpath/filename\nconfig['mappings'] = { \"/\": \"file:///\" }\n\n\n# If documents access not via filesystem but via website (http)\n# your files in /var/www/documents/ should be mapped to http://www.opensemanticsearch.org/documents/\n#config['mappings'] = { \"/var/www/documents/\": \"http://www.opensemanticsearch.org/documents/\" }\n\n\n#\n# UI Path navigator: Strip parts of path facet\n#\n\n# The path facet is the sidebar component to navigate (sub)paths.\n# If all your different directories are in one path like /documents\n# or even worse the main content dirs are subdirs like /mnt/fileserver/onesubdir and /mnt/fileserver/othersubdirectory\n# you might want that the user can select or navigate the subdirectories directly (which from the content perspective are main dirs)\n# instead of forcing the user first navigate to ./mnt, then to ./fileserver and so on...\n\n# this option wont change the uri (which is the base of this option and can be mapped and stripped above),\n# it will only change/strip/shorten the path facet in the interactive navigation of the user interface\n#config['facet_path_strip_prefix'] = [ \"file:///home/\", \"file://\" ]\n"
  },
  {
    "path": "etc/opensemanticsearch/connector-web",
    "content": "# -*- coding: utf-8 -*-\n\n#\n# Config for opensemanticsearch-index-web-crawl\n#\n\n#\n# common file extensions that are not followed if they occur in links\n#\n\nconfig['webcrawler_deny_extensions'] = [\n    # archives\n    '7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',\n\n    # images\n    'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',\n    'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',\n\n    # audio\n    'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',\n\n    # video\n    '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',\n    'm4a', 'm4v', 'flv', 'webm',\n\n    # office suites (commented, since we want to index office documents)\n    #'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',\n    #'odp', 'pdf',\n\n    # other\n    'css', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'\n]\n\n\n# Uncomment, if you do not want to exclude file extensions\n# Warning: You might not want to download Gigabytes or Terabytes of archives, videos, CD-ROM/DVD ISOs and so on...\n\n#config['webcrawler_deny_extensions'] = []\n"
  },
  {
    "path": "etc/opensemanticsearch/enhancer-rdf",
    "content": "# -*- coding: utf-8 -*-\n\n# Config for RDF metadata server\n\n# URL of the meta data server (RDF)\n# if set to False don't use additional metadata from server (like tags or annotations)\n#\n# Templates:\n# [uri] for URL of annotated page\n# [uri_md5] for MD5 Sum of the URL\n\nconfig['metaserver'] = False\n\n# Use Drupal as meta server\n#config['metaserver'] = [ 'http://localhost/drupal/rdf?uri=[uri]' ]\n\n# Use Semantic Mediawiki as meta server\n#config['metaserver'] = [ 'http://localhost/mediawiki/index.php/Special:ExportRDF?xmlmime=rdf&page=[uri_md5]' ]\n\n# Use tagger app as meta server\nconfig['metaserver'] = [ 'http://localhost/search-apps/annotate/rdf?uri=[uri]' ]\n\n# mapping of RDF properties or RDF classes to facets / columns\nconfig['property2facet'] = {\n 'http://www.wikidata.org/entity/Q5': 'person_ss',\n 'http://www.wikidata.org/entity/Q43229': 'organization_ss',\n 'http://www.wikidata.org/entity/Q178706': 'organization_ss',\n 'http://www.wikidata.org/entity/Q18810687': 'organization_ss',\n 'http://www.wikidata.org/entity/Q2221906': 'location_ss',\n 'http://schema.org/Person': 'person_ss',\n 'http://schema.org/Organization': 'organization_ss',\n 'http://schema.org/Place': 'location_ss',\n 'http://schema.org/location': 'location_ss',\n 'http://schema.org/address': 'location_ss',\n 'http://schema.org/keywords': 'tag_ss',\n 'http://schema.org/Comment': 'comment_txt',\n 'http://semantic-mediawiki.org/swivt/1.0#specialProperty_dat': 'meta_date_dts'\n}\n"
  },
  {
    "path": "etc/opensemanticsearch/etl",
    "content": "# -*- coding: utf-8 -*-\n\n#\n# ETL config for connector(s)\n#\n\n# print debug messages\n#config['verbose'] = True\n\n\n#\n# Languages for language specific index\n#\n# Each document is analyzed without grammar rules in the index fields like content, additionally it can be added/copied to language specific index fields/analyzers\n# Document language is autodetected by default plugin enhance_detect_language_tika_server\n\n# If index support enhanced analytics for specific languages, we can add/copy data to language specific fields/analyzers\n# Set which languages are configured and shall be used in index for language specific analysis/stemming/synonyms\n# Default / if not set all languages that are supported will be analyzed additionally language specific\n#config['languages'] = ['en','de','fr','hu','it','pt','nl','cz','ro','ru','ar','fa']\n\n# force to language specific analysis additional in this language(s) grammar & synonyms, even if language autodetection detects other language\n#config['languages_force'] = ['en','de']\n\n\n# only use language for language specific analysis which are added / uncommented later\n#config['languages'] = []\n\n# add English\n#config['languages'].append('en')\n\n# add German / Deutsch\n#config['languages'].append('de')\n\n# add French / Francais\n#config['languages'].append('fr')\n\n# add Hungarian\n#config['languages'].append('hu')\n\n# add Spanish\n#config['languages'].append('es')\n\n# add Portuguese\n#config['languages'].append('pt')\n\n# add Italian\n#config['languages'].append('it')\n\n# add Czech\n#config['languages'].append('cz')\n\n# add Dutch\n#config['languages'].append('nl')\n\n# add Romanian\n#config['languages'].append('ro')\n\n# add Russian\n#config['languages'].append('ru')\n\n\n\n#\n# Index/storage\n#\n\n#\n# Solr URL and port\n#\n\nconfig['export'] = 'export_solr'\n\n# Solr server\nconfig['solr'] = 'http://localhost:8983/solr/'\n\n# Solr core\nconfig['index'] = 'opensemanticsearch'\n\n\n#\n# Elastic Search\n#\n\n#config['export'] = 'export_elasticsearch'\n\n# Index\n#config['index'] = 'opensemanticsearch'\n\n\n#\n# Tika for text and metadata extraction\n#\n\n# Tika server (with tesseract-ocr-cache)\n# default: http://localhost:9998\n\n#config['tika_server'] = 'http://localhost:9998'\n\n# Tika server with fake OCR cache of tesseract-ocr-cache used if OCR in later ETL tasks\n# default: http://localhost:9999\n\n#config['tika_server_fake_ocr'] = 'http://localhost:9999'\n\n\n#\n# Annotations\n#\n\n# add plugin for annotation/tagging/enrichment of documents\nconfig['plugins'].append('enhance_annotations')\n\n# set alternate URL of annotation server\n#config['metadata_server'] = 'http://localhost/search-apps/annotate/json'\n\n\n#\n# RDF Knowledge Graph\n#\n\n# add RDF Metadata Plugin for granular import of RDF file statements to entities of knowledge graphs\nconfig['plugins'].append('enhance_rdf')\n\n\n#\n# Config for OCR (automatic text recognition of text in images)\n#\n\n# Disable OCR for image files (i.e for more performance and/or because you don't need the text within images or have only photos without photographed text)\n#config['ocr'] = False\n\n# Option to disable OCR of embedded images in PDF by Tika\n# so (if alternate plugin is enabled) OCR will be done only by alternate\n# plugin enhance_pdf_ocr (which else works only as fallback, if Tika exceptions)\n#config['ocr_pdf_tika'] = False\n\n# Use OCR cache\nconfig['ocr_cache'] = '/var/cache/tesseract'\n\n# Option to disable OCR cache\n#config['ocr_cache'] = None\n\n# Do 
OCR for images embedded in PDF documents (i.e. designed images or scanned or photographed documents)\nconfig['plugins'].append('enhance_pdf_ocr')\n\n#OCR language\n\n#If other than english you have to install package tesseract-XXX (tesseract language support) for your language\n#and set ocr_lang to this value (be careful, the tesseract package for english is \"eng\" (not \"en\") german is named \"deu\", not \"de\"!)\n\n# set OCR language to English/default\n#config['ocr_lang'] = 'eng'\n\n# set OCR language to German/Deutsch\n#config['ocr_lang'] = 'deu'\n\n# set multiple OCR languages\nconfig['ocr_lang'] = 'eng+deu'\n\n\n#\n# Regex pattern for extraction\n#\n\n# Enable Regex plugin\nconfig['plugins'].append('enhance_regex')\n\n# Regex config for IBAN extraction\nconfig['regex_lists'].append('/etc/opensemanticsearch/regex/iban.tsv')\n\n\n#\n# Email address and email domain extraction\n#\nconfig['plugins'].append('enhance_extract_email')\n\n\n#\n# Phone number extraction\n#\nconfig['plugins'].append('enhance_extract_phone')\n\n\n#\n# Config for Named Entities Recognition (NER) and Named Entity Linking (NEL)\n#\n\n# Enable Entity Linking / Normalization and dictionary based Named Entities Extraction from thesaurus and ontologies\nconfig['plugins'].append('enhance_entity_linking')\n\n# Enable SpaCy NER plugin\nconfig['plugins'].append('enhance_ner_spacy')\n\n# Spacy NER Machine learning classifier (for which language and with which/how many classes)\n\n# Default classifier if no classifier for specific language\n\n# disable NER for languages where no classifier defined in config['spacy_ner_classifiers']\nconfig['spacy_ner_classifier_default'] = None\n\n# Set default classifier to English (only if you are sure, that all documents you index are english)\n# config['spacy_ner_classifier_default'] = 'en_core_web_sm'\n\n# Set default classifier to German (only if you are sure, that all documents you index are german)\n# config['spacy_ner_classifier_default'] = 'de_core_news_sm'\n\n# Language specific classifiers (mapping to autodetected document language to Spacy classifier / language)\n#\n# You have to download additional language classifiers for example english (en) or german (de) by\n# python3 -m spacy download en\n# python3 -m spacy download de\n# ...\n\nconfig['spacy_ner_classifiers'] = {\n    'da': 'da_core_news_sm',\n    'de': 'de_core_news_sm',\n    'en': 'en_core_web_sm',\n    'es': 'es_core_news_sm',\n    'fr': 'fr_core_news_sm',\n    'it': 'it_core_news_sm',\n    'lt': 'lt_core_news_sm',\n    'nb': 'nb_core_news_sm',\n    'nl': 'nl_core_news_sm',\n    'pl': 'pl_core_news_sm',\n    'pt': 'pt_core_news_sm',\n    'ro': 'ro_core_news_sm',\n}\n\n\n# Enable Stanford NER plugin\n#config['plugins'].append('enhance_ner_stanford')\n\n# Stanford NER Machine learning classifier (for which language and with how many classes, which need more computing time)\n\n# Default classifier if no classifier for specific language\n\n# disable NER for languages where no classifier defined in config['stanford_ner_classifiers']\nconfig['stanford_ner_classifier_default'] = None\n\n# Set default classifier to English (only if you are sure, that all documents you index are english)\n#config['stanford_ner_classifier_default'] = '/usr/share/java/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'\n\n# Set default classifier to German (only if you are sure, that all documents you index are german)\n#config['stanford_ner_classifier_default'] = 
'/usr/share/java/stanford-ner/classifiers/german.conll.germeval2014.hgc_175m_600.crf.ser.gz'\n\n# Language specific classifiers (mapping to autodetected document language)\n# Before you have to download additional language classifiers to the configured path\nconfig['stanford_ner_classifiers'] = {\n    'en': '/usr/share/java/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',\n    'es': '/usr/share/java/stanford-ner/classifiers/spanish.ancora.distsim.s512.crf.ser.gz',\n    'de': '/usr/share/java/stanford-ner/classifiers/german.conll.germeval2014.hgc_175m_600.crf.ser.gz',\n}\n\n# If Stanford NER JAR not in standard path\nconfig['stanford_ner_path_to_jar'] = \"/usr/share/java/stanford-ner/stanford-ner.jar\"\n\n# Stanford NER Java options like RAM settings\nconfig['stanford_ner_java_options'] = '-mx1000m'\n\n\n#\n# Law clauses extraction\n#\n\nconfig['plugins'].append('enhance_extract_law')\n\n\n#\n# Money extraction\n#\n\nconfig['plugins'].append('enhance_extract_money')\n\n\n#\n# Neo4j graph database\n#\n\n# exports named entities and relations to Neo4j graph database\n\n# Enable plugin to export entities and connections to Neo4j graph database\n#config['plugins'].append('export_neo4j')\n\n# Neo4j server\n#config['neo4j_host'] = 'localhost'\n\n# Username & password\n#config['neo4j_user'] = 'neo4j'\n#config['neo4j_password'] = 'neo4j'\n"
  },
  {
    "path": "etc/opensemanticsearch/facets",
    "content": "# Warning: Do not edit here!\n\n# This config file will be overwritten\n# by web admin user interface after config changes\n# and on initialization by /var/lib/opensemanticsearch/manage.py entities\n\n#\n# Default facet config if no facets are configured\n#\nconfig['facets'] = {\n    'author_ss': {'label': 'Author(s)', 'uri': 'http://schema.org/Author', 'facet_limit': '10', 'snippets_limit': '10'},\n    'tag_ss': {'label': 'Tags', 'uri': 'http://schema.org/keywords', 'facet_limit': '10', 'snippets_limit': '10'},\n    'annotation_tag_ss': {'label': 'Tags (Hypothesis)', 'uri': 'http://schema.org/keywords', 'facet_limit': '10', 'snippets_limit': '10'},\n    'person_ss': {'label': 'Persons', 'uri': 'http://schema.org/Person', 'facet_limit': '10', 'snippets_limit': '10'},\n    'organization_ss': {'label': 'Organizations', 'uri': 'http://schema.org/Organization', 'facet_limit': '10', 'snippets_limit': '10'},\n    'location_ss': {'label': 'Locations', 'uri': 'http://schema.org/Place', 'facet_limit': '10', 'snippets_limit': '10'},\n    'language_s': {'label': 'Language', 'uri': 'http://schema.org/inLanguage', 'facet_limit': '10', 'snippets_limit': '10'},\n    'email_ss': {'label': 'Email', 'uri': 'http://schema.org/email', 'facet_limit': '10', 'snippets_limit': '10'},\n    'Message-From_ss': {'label': 'Message from', 'uri': 'http://schema.org/sender', 'facet_limit': '10', 'snippets_limit': '10'},\n    'Message-To_ss': {'label': 'Message to', 'uri': 'http://schema.org/toRecipient', 'facet_limit': '10', 'snippets_limit': '10'},\n    'Message-CC_ss': {'label': 'Message CC', 'uri': 'http://schema.org/ccRecipient', 'facet_limit': '10', 'snippets_limit': '10'},\n    'Message-BCC_ss': {'label': 'Message BCC', 'uri': 'http://schema.org/bccRecipient', 'facet_limit': '10', 'snippets_limit': '10'},\n    'hashtag_ss': {'label': 'Hashtags', 'uri': 'http://schema.org/keywords', 'facet_limit': '10', 'snippets_limit': '10'},\n    'email_domain_ss': {'label': 'Email domain', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'phone_normalized_ss': {'label': 'Phone numbers', 'uri': 'https://schema.org/telephone', 'facet_limit': '10', 'snippets_limit': '10'},\n    'phone_ss': {'label': 'Phone numbers', 'uri': 'https://schema.org/telephone', 'facet_limit': '10', 'snippets_limit': '10'},\n    'money_ss': {'label': 'Money', 'uri': 'http://schema.org/MonetaryAmount', 'facet_limit': '10', 'snippets_limit': '10'},\n    'iban_ss': {'label': 'IBAN', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'law_clause_ss': {'label': 'Law clause', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'law_code_ss': {'label': 'Law code', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'law_code_clause_ss': {'label': 'Law code clause', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'filename_extension_s': {'label': 'Filename extension', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'content_type_group_ss': {'label': 'Content type group', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'content_type_ss': {'label': 'Content type', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},\n    'law_codes_rdf_ss': {'label': '', 'uri': '', 'facet_limit': '0', 'snippets_limit': '0'},\n}\n"
  },
  {
    "path": "etc/opensemanticsearch/filemonitoring/files",
    "content": ""
  },
  {
    "path": "etc/opensemanticsearch/ocr/dictionary.txt",
    "content": ""
  },
  {
    "path": "etc/opensemanticsearch/regex/email.tsv",
    "content": "[\\w\\.-]+@[\\w\\.-]+\temail_ss\n"
  },
  {
    "path": "etc/opensemanticsearch/regex/iban.tsv",
    "content": "\\b[a-zA-Z]{2}(?: ?)[0-9]{2}(?: ?)[a-zA-Z0-9]{4}(?: ?)[0-9]{7}(?: ?)([a-zA-Z0-9]?){0,16}\\b\tiban_ss\n"
  },
  {
    "path": "etc/opensemanticsearch/regex/phone.tsv",
    "content": "[\\+\\(]?[1-9][0-9 .\\-\\(\\)]{8,}[0-9]\tphone_ss\n"
  },
  {
    "path": "etc/opensemanticsearch/task_priorities",
    "content": "# Priorities of document processing in task queue\n\n# The higher the additional priority is, the earlier the document will be processed by task queue.\n\n\n#\n# Priorities in task queue by filename extension\n#\n\n# the higher the additional priority, the earlier file with this file name extension will be processed\n# the lower the additional priority, the later files with this file name extension will be processed\n\nconfig['priorities_filename_extension] = {\n\n  '.pdf': 5,\n  '.doc': 5,\n  '.docx': 5,\n  '.xls': 5,\n  '.xlsx': 5,\n  '.odp': 5,\n  '.ppt': 5,\n  '.pptx': 5,\n  '.eml': 5,\n  '.pst': 4,\n  '.csv': 4,\n  '.tsv': 4,\n  '.txt': 4,\n  '.htm': 3,\n  '.html': 3,\n  '.md': 3,\n  '.jpg': 1,\n  '.jpeg': 1,\n  '.gif': 1,\n  '.png': 1,\n  '.tif': 1,\n  '.mp3': 1,\n  '.mp4': 1,\n  '.wav': 1,\n  '.ini': -3,\n  '.bat': -4,\n  '.apk': -5,\n  '.bin': -5,\n  '.com': -5,\n  '.deb': -5,\n  '.exe': -5,\n  '.msi': -5,\n  '.php': -5,\n  '.cache': -5,\n  '.h': -5,\n  '.pl': -5,\n  '.py': -5,\n  '.pyc': -5,\n  '.js': -5,\n  '.css': -5,\n  '.ova': -5,\n  '.iso': -5,\n\n}\n\n\n#\n# Priorities on parts of filenames\n#\n\n# If a configures string is part of the filename, additional priority is set\n\nconfig['priorities_filename] = {\n\n  'corrupt': 5,\n  'illegal': 5,\n  'important': 5,\n  'relevant': 5,\n  'problem': 5,\n  'urgent': 5,\n  'passwor': 5,\n  'account': 4,\n  'agreement': 4,\n  'bank': 4,\n  'complian': 4,\n  'cost': 4,\n  'contract': 4,\n  'legal': 4,\n  'treaty': 4,\n\n}\n"
  },
  {
    "path": "etc/systemd/system/opensemanticetl-filemonitoring.service",
    "content": "[Unit]\nDescription=Open Semantic ETL filemonitoring\nAfter=network.target\n\n[Service]\nType=simple\nUser=opensemanticetl\nExecStart=/usr/bin/opensemanticsearch-filemonitoring --fromfile /etc/opensemanticsearch/filemonitoring/files\nRestart=always\n\n[Install]\nWantedBy=multi-user.target\n"
  },
  {
    "path": "etc/systemd/system/opensemanticetl.service",
    "content": "[Unit]\nDescription=Open Semantic ETL\nAfter=network.target\n\n[Service]\nType=simple\nUser=opensemanticetl\nEnvironment=OMP_THREAD_LIMIT=1\nExecStart=/usr/bin/etl_tasks\nRestart=always\n\n[Install]\nWantedBy=multi-user.target\n"
  },
  {
    "path": "src/opensemanticetl/__init__.py",
    "content": ""
  },
  {
    "path": "src/opensemanticetl/clean_title.py",
    "content": "import sys\n\n# Replace empty title with useful info from other fields for better usability\n\n\nclass clean_title(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        #\n        # if no title but subject (i.e. emails), use subject as document / result title\n        #\n\n        try:\n            # if no field title exists, but field subject, use it\n            if not 'title_txt' in data:\n                if 'subject_ss' in data:\n                    data['title_txt'] = data['subject_ss']\n\n            else:\n                # if title empty and field subject exists, use subjects value\n                if not data['title_txt']:\n                    if 'subject_ss' in data:\n                        if data['subject_ss']:\n                            data['title_txt'] = data['subject_ss']\n\n        except:\n            sys.stderr.write(\n                \"Error while trying to clean empty title with subject\")\n\n        # if no title yet, use the filename part of URI\n        try:\n            # if no field title exists, but field subject, use it\n            if not 'title_txt' in data:\n\n                # get filename from URI\n                filename = parameters['id'].split('/')[-1]\n\n                data['title_txt'] = filename\n\n        except:\n            sys.stderr.write(\n                \"Error while trying to clean empty title with filename\")\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_annotations.py",
    "content": "import os\nimport requests\nfrom requests.adapters import HTTPAdapter\nfrom requests.packages.urllib3.util.retry import Retry\n\nimport etl_plugin_core\n\n\n# Get tags and annotations from annotation server\nclass enhance_annotations(etl_plugin_core.Plugin):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        # get parameters\n        docid = parameters['id']\n\n        if os.getenv('OPEN_SEMANTIC_ETL_METADATA_SERVER'):\n            server = os.getenv('OPEN_SEMANTIC_ETL_METADATA_SERVER')\n        elif 'metadata_server' in parameters:\n            server = parameters['metadata_server']\n        else:\n            server = 'http://localhost/search-apps/annotate/json'\n\n        adapter = HTTPAdapter(max_retries=Retry(total=10, backoff_factor=1))\n        http = requests.Session()\n        http.mount(\"https://\", adapter)\n        http.mount(\"http://\", adapter)\n\n        response = http.get(server, params={'uri': docid})\n        response.raise_for_status()\n\n        annotations = response.json()\n\n        for facet in annotations:\n            etl_plugin_core.append(data, facet, annotations[facet])\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_contenttype_group.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\n#\n# Map/aggregate content type to content type group\n#\n\n\nclass enhance_contenttype_group(object):\n\n    fieldname = 'content_type_group_ss'\n\n    contenttype_groups = {\n        'application/vnd.ms-excel': 'Spreadsheet',\n        'application/vnd.oasis.opendocument.spreadsheet': 'Spreadsheet',\n        'application/vnd.oasis.opendocument.spreadsheet-template': 'Spreadseheet template',\n        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'Spreadsheet',\n        'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'Spreadsheet template',\n        'text': 'Text document',\n        'application/gzip text': 'Text document',\n        'application/pdf': 'Text document',\n        'application/msword': 'Text document',\n        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'Text document',\n        'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'Text document template',\n        'application/vnd.oasis.opendocument.text': 'Text document',\n        'application/vnd.oasis.opendocument.text-template': 'Text document template',\n        'application/rtf': 'Text document',\n        'application/vnd.ms-powerpoint': 'Presentation',\n        'application/vnd.oasis.opendocument.presentation': 'Presentation',\n        'application/vnd.oasis.opendocument.presentation-template': 'Presentation template',\n        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'Presentation',\n        'application/vnd.openxmlformats-officedocument.presentationml.template': 'Presentation template',\n        'image': 'Image',\n        'audio': 'Audio',\n        'video': 'Video',\n        'application/mp4': 'Video',\n        'application/x-matroska': 'Video',\n        'application/vnd.etsi.asic-e+zip': 'Electronic Signature Container',\n        'Knowledge graph': 'Knowledge graph',\n    }\n\n    suffix_groups = {\n        '.csv': \"Spreadsheet\",\n    }\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        content_types = []\n        if 'content_type_ss' in data:\n            content_types = data['content_type_ss']\n\n        if not isinstance(content_types, list):\n            content_types = [content_types]\n\n        groups = []\n\n        for content_type in content_types:\n\n            # Contenttype to group\n            for mapped_content_type, group in self.contenttype_groups.items():\n                if content_type.startswith(mapped_content_type):\n                    if not group in groups:\n                        groups.append(group)\n\n            # Suffix to group\n            for suffix, group in self.suffix_groups.items():\n                if parameters['id'].upper().endswith(suffix.upper()):\n                    if not group in groups:\n                        groups.append(group)\n\n            if len(groups) > 0:\n                data[self.fieldname] = groups\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_csv.py",
    "content": "import sys\nimport os\nimport csv\nimport urllib.request\nfrom etl import ETL\n\n\n# import each row of CSV file to index\n# write CSV cols to database columns or facets\n\n\nclass enhance_csv(object):\n\n    def __init__(self, verbose=False):\n\n        self.verbose = verbose\n\n        self.config = {}\n        self.titles = False\n        self.cache = False\n\n        self.encoding = 'utf-8'\n\n        self.delimiter = None\n\n        self.start_row = 1\n\n        self.title_row = 0\n\n        self.cols = []\n        self.rows = []\n        self.cols_include = False\n        self.rows_include = False\n\n        self.sniff_dialect = True\n\n        self.quotechar = None\n        self.doublequote = None\n        self.escapechar = None\n\n    def read_parameters(self, parameters, data):\n\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                self.verbose = True\n\n        if 'encoding' in parameters:\n            self.encoding = parameters['encoding']\n        elif 'encoding_s' in data:\n            self.encoding = data['encoding_s']\n\n        if 'delimiter' in parameters:\n            self.delimiter = parameters['delimiter']\n\n        if 'cache' in parameters:\n            self.cache = parameters['cache']\n\n        if 'title_row' in parameters:\n            if parameters['title_row']:\n                self.title_row = parameters['title_row']\n\n        if 'start_row' in parameters:\n            if parameters['start_row']:\n                self.start_row = parameters['start_row']\n\n        if 'sniff_dialect' in parameters:\n            self.sniff_dialect = parameters['sniff_dialect']\n\n        if 'quotechar' in parameters:\n            self.quotechar = parameters['quotechar']\n\n        if 'doublequote' in parameters:\n            self.doublequote = parameters['doublequote']\n\n        if 'escapechar' in parameters:\n            self.escapechar = parameters['escapechar']\n\n        if 'rows' in parameters:\n            self.rows = parameters['rows']\n\n        if 'cols' in parameters:\n            self.cols = parameters['cols']\n\n        if 'rows_include' in parameters:\n            self.rows_include = parameters['rows_include']\n\n        if 'cols_include' in parameters:\n            self.cols_include = parameters['cols_include']\n\n    # Todo:\n\n    #\n    # If existing CSV parameter settings in CSV manager, use them\n    # even if not importing within CSV manager\n    #\n    def add_csv_parameters_from_meta_settings(self, metaserver):\n        pass\n        # get csv settings for this file from csvmnager\n        # json = get csvserver\n\n        # if delimiter in json:\n        #\tparameters['delimiter'] = json['delimiters']\n\n    #\n    # Build CSV dialect\n    #\n\n    # Autodetect and/or construct from parameters\n    def get_csv_dialect(self):\n\n        kwargs = {}\n\n        # automatically detect dialect\n        sniffed_dialect = False\n\n        if self.sniff_dialect:\n            try:\n                if self.verbose:\n                    print(\"Opening {} for guessing CSV dialect\".format(self.filename))\n\n                csvfile = open(self.filename, newline='',\n                               encoding=self.encoding)\n\n                if self.verbose:\n                    print(\"Starting dialect guessing\")\n\n                # sniff dialect in first 32 MB\n                sniffsize = 33554432\n                sniffed_dialect = csv.Sniffer().sniff(csvfile.read(sniffsize))\n\n                if 
self.verbose:\n                    print(\"Sniffed dialect: {}\".format(sniffed_dialect))\n\n            except KeyboardInterrupt:\n                raise KeyboardInterrupt\n\n            except BaseException as e:\n                sys.stderr.write(\n                    \"Exception while CSV format autodetection for {}: {}\".format(self.filename, e))\n\n            finally:\n                csvfile.close()\n\n        if sniffed_dialect:\n            kwargs['dialect'] = sniffed_dialect\n        else:\n            kwargs['dialect'] = 'excel'\n\n        # Overwrite options, if set\n        if self.delimiter:\n            kwargs['delimiter'] = str(self.delimiter)\n\n        if self.quotechar:\n            kwargs['quotechar'] = str(self.quotechar)\n\n        if self.escapechar:\n            kwargs['escapechar'] = str(self.escapechar)\n\n        if self.doublequote:\n            kwargs['doublequote'] = self.doublequote\n\n        return kwargs\n\n    def set_titles(self, row):\n\n        self.titles = []\n        colnumber = 0\n\n        for col in row:\n\n            colnumber += 1\n\n            self.titles.append(col)\n\n        return self.titles\n\n    def export_row_data_to_index(self, data, rownumber):\n\n        parameters = self.config.copy()\n\n        # todo: all content plugins configurated, not only this one\n        parameters['plugins'] = [\n            'enhance_path',\n            'enhance_entity_linking',\n            'enhance_multilingual',\n        ]\n\n        etl = ETL()\n\n        try:\n\n            etl.process(parameters=parameters, data=data)\n\n        # if exception because user interrupted by keyboard, respect this and abbort\n        except KeyboardInterrupt:\n            raise KeyboardInterrupt\n        except BaseException as e:\n            sys.stderr.write(\n                \"Exception adding CSV row {} : {}\".format(rownumber, e))\n\n            if 'raise_pluginexception' in self.config:\n                if self.config['raise_pluginexception']:\n                    raise e\n\n    def import_row(self, row, rownumber, docid):\n\n        colnumber = 0\n\n        data = {}\n\n        data['content_type_ss'] = \"CSV row\"\n\n        data['container_s'] = docid\n\n        data['page_i'] = str(rownumber)\n\n        data['id'] = docid + '#' + str(rownumber)\n\n        for col in row:\n\n            colnumber += 1\n\n            exclude_column = False\n\n            if self.cols_include:\n                if not colnumber in self.cols:\n                    exclude_column = True\n            else:\n                if colnumber in self.cols:\n                    exclude_column = True\n\n            if not exclude_column:\n\n                if self.titles and len(self.titles) >= colnumber:\n                    fieldname = self.titles[colnumber - 1] + \"_t\"\n                else:\n                    fieldname = 'column_' + str(colnumber).zfill(2) + \"_t\"\n\n                data[fieldname] = col\n\n                # if number, save as float value, too\n                try:\n                    if self.titles and len(self.titles) >= colnumber:\n                        fieldname = self.titles[colnumber - 1] + \"_f\"\n                    else:\n                        fieldname = 'column_' + str(colnumber).zfill(2) + \"_f\"\n                    data[fieldname] = float(col)\n                except ValueError:\n                    pass\n\n                self.export_row_data_to_index(data=data, rownumber=rownumber)\n\n        return colnumber\n\n    #\n    # read parameters, 
analyze csv dialect and import row by row\n    #\n\n    def enhance_csv(self, parameters, data):\n\n        self.config = parameters.copy()\n\n        docid = parameters['id']\n\n        #\n        # Read parameters\n        #\n\n        self.read_parameters(parameters, data)\n\n        if 'csvmanager' in parameters:\n            self.add_csv_parameters_from_meta_settings(\n                metaserver=parameters['csvmanager'])\n\n        # Download, if not a file(name) yet but URI reference\n\n        # todo: move to csv manager or downloader plugin that in that case should use etl_web\n        if 'filename' in parameters:\n\n            is_tempfile = False\n\n            self.filename = parameters['filename']\n\n            # if present, remove protocol prefix file://\n            if self.filename.startswith(\"file://\"):\n                self.filename = self.filename.replace(\"file://\", '', 1)\n\n        else:\n\n            # Download URL (the document URI) to a tempfile\n            is_tempfile = True\n            self.filename, headers = urllib.request.urlretrieve(docid)\n\n        #\n        # Get CSV dialect parameters\n        #\n\n        dialect_kwargs = self.get_csv_dialect()\n\n        if self.verbose:\n            print(\"Opening CSV file with encoding {} and dialect {}\".format(\n                self.encoding, dialect_kwargs))\n\n        #\n        # Open and read CSV\n        #\n\n        csvfile = open(self.filename, newline='', encoding=self.encoding)\n\n        reader = csv.reader(csvfile, **dialect_kwargs)\n\n        # increase limits to maximum, since there are often text fields with longer texts\n        csv.field_size_limit(sys.maxsize)\n\n        rownumber = 0\n\n        #\n        # Read CSV row by row\n        #\n\n        for row in reader:\n\n            rownumber += 1\n\n            #\n            # If title row, read column titles\n            #\n            if rownumber == self.title_row:\n\n                if self.verbose:\n                    print(\"Importing titles from row {}\".format(self.title_row))\n\n                self.set_titles(row)\n\n            #\n            # Import data row\n            #\n            if rownumber >= self.start_row:\n\n                exclude_row = False\n\n                if self.rows_include:\n                    if not rownumber in self.rows:\n                        exclude_row = True\n                else:\n                    if rownumber in self.rows:\n                        exclude_row = True\n\n                if exclude_row:\n                    if self.verbose:\n                        print(\"Excluding row {}\".format(rownumber))\n                else:\n\n                    if self.verbose:\n                        print(\"Importing row {}\".format(rownumber))\n\n                    count_columns = self.import_row(\n                        row, rownumber=rownumber, docid=docid)\n\n        #\n        # delete if downloaded tempfile\n        #\n        if not self.cache:\n            if is_tempfile:\n                os.remove(self.filename)\n\n        #\n        # Print stats\n        #\n\n        if self.verbose:\n            print(\"Rows: \" + str(rownumber))\n            print(\"Cols: \" + str(count_columns))\n\n        return rownumber\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        docid = parameters['id']\n\n        # if CSV (file suffix .csv, .tsv or .tab), enhance it (import row by row)\n        if docid.lower().endswith('.csv') or docid.lower().endswith('.tsv') or docid.lower().endswith('.tab'):\n            self.enhance_csv(parameters, data)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_detect_language_tika_server.py",
    "content": "import os\nimport sys\nimport time\nimport requests\n\n# Extract text from filename\n\n\nclass enhance_detect_language_tika_server(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        if os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER'):\n            tika_server = os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER')\n        elif 'tika_server' in parameters:\n            tika_server = parameters['tika_server']\n        else:\n            tika_server = 'http://localhost:9998'\n\n\n        uri = tika_server + '/language/string'\n\n        analyse_fields = ['title_txt', 'content_txt',\n                          'description_txt', 'ocr_t', 'ocr_descew_t']\n\n        text = ''\n        for field in analyse_fields:\n            if field in data:\n                text = \"{}{}\\n\".format(text, data[field])\n\n        if verbose:\n            print(\"Calling Tika server for language detection from {}\".format(uri))\n\n        retries = 0\n        retrytime = 1\n        # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry\n        retrytime_max = 120\n        no_connection = True\n\n        while no_connection:\n            try:\n                if retries > 0:\n                    print(\n                        'Retrying to connect to Tika server in {} second(s).'.format(retrytime))\n                    time.sleep(retrytime)\n                    retrytime = retrytime * 2\n                    if retrytime > retrytime_max:\n                        retrytime = retrytime_max\n\n                r = requests.put(uri, data=text.encode('utf-8'))\n\n                no_connection = False\n\n            except requests.exceptions.ConnectionError as e:\n                retries += 1\n                sys.stderr.write(\n                    \"Connection to Tika server (will retry in {} seconds) failed. Exception: {}\\n\".format(retrytime, e))\n\n        language = r.content.decode('utf-8')\n\n        if verbose:\n            print(\"Detected language: {}\".format(language))\n\n        data['language_s'] = language\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_entity_linking.py",
    "content": "#\n# Named Entity Extraction by Open Semantic Entity Search API dictionary\n#\n\nimport requests\nimport sys\nimport time\n\nfrom entity_linking.entity_linker import Entity_Linker\nimport etl\nimport etl_plugin_core\n\n\n#\n# split a taxonomy entry to separated index fields\n#\ndef taxonomy2fields(taxonomy, field, separator=\"\\t\", subfields_suffix=\"_ss\"):\n\n    result = {}\n\n    # if not multivalued field, convert to used list/array strucutre\n    if not isinstance(taxonomy, list):\n        taxonomy = [taxonomy]\n\n    for taxonomy_entry in taxonomy:\n\n        i = 0\n        path = ''\n        for taxonomy_entry_part in taxonomy_entry.split(separator):\n\n            taxonomy_fieldname = field + '_taxonomy' + str(i) + subfields_suffix\n\n            if not taxonomy_fieldname in result:\n                result[taxonomy_fieldname] = []\n\n            if len(path) > 0:\n                path += separator\n\n            path += taxonomy_entry_part\n\n            result[taxonomy_fieldname].append(path)\n\n            i += 1\n\n    return result\n\n\nclass enhance_entity_linking(etl_plugin_core.Plugin):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        entity_linking_taggers = ['all_labels_ss_tag']\n        if 'entity_linking_taggers' in parameters:\n            entity_linking_taggers = parameters['entity_linking_taggers']\n\n        # add taggers for stemming\n        entity_linking_taggers_document_language_dependent = {}\n        if 'entity_linking_taggers_document_language_dependent' in parameters:\n            entity_linking_taggers_document_language_dependent = parameters[\n                'entity_linking_taggers_document_language_dependent']\n\n        if 'language_s' in data:\n            # is a language specific tagger there for the detected language?\n            if data['language_s'] in entity_linking_taggers_document_language_dependent:\n                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:\n                    if not entity_linking_tagger in entity_linking_taggers:\n                        entity_linking_taggers.append(entity_linking_tagger)\n\n        openrefine_server = False\n        if 'openrefine_server' in parameters:\n            openrefine_server = parameters['openrefine_server']\n\n        taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']\n\n        # collect/copy to be analyzed text from all fields\n        text = etl_plugin_core.get_text(data=data)\n\n        # tag all entities (by different taggers for different analyzers/stemmers)\n        for entity_linking_tagger in entity_linking_taggers:\n\n            results = {}\n\n            retries = 0\n            retrytime = 1\n            # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry\n            retrytime_max = 120\n            no_connection = True\n\n            while no_connection:\n                try:\n                    if retries > 0:\n                        print(\n                            'Retrying to connect to Solr tagger in {} second(s).'.format(retrytime))\n                        time.sleep(retrytime)\n                        retrytime = retrytime * 2\n                        if retrytime > 
retrytime_max:\n                            retrytime = retrytime_max\n\n                    # call REST API\n                    if openrefine_server:\n                        # use REST-API on (remote) HTTP server\n                        params = {'text': text}\n                        r = requests.post(openrefine_server, params=params)\n                        # if bad status code, raise exception\n                        r.raise_for_status()\n\n                        results = r.json()\n\n                    else:\n                        # use local Python library\n                        linker = Entity_Linker()\n                        linker.verbose = verbose\n\n                        results = linker.entities(text=text, taggers=[\n                                                  entity_linking_tagger], additional_result_fields=taxonomy_fields)\n\n                    no_connection = False\n\n                except KeyboardInterrupt:\n                    raise KeyboardInterrupt\n\n                except requests.exceptions.ConnectionError as e:\n\n                    retries += 1\n\n                    if openrefine_server:\n                        sys.stderr.write(\n                            \"Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\\n\".format(retrytime, e))\n                    else:\n                        sys.stderr.write(\n                            \"Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\\n\".format(retrytime, e))\n\n                except requests.exceptions.HTTPError as e:\n                    if e.response.status_code == 503:\n\n                        retries += 1\n\n                        if openrefine_server:\n                            sys.stderr.write(\n                                \"Openrefine server temporary unavailable (HTTP status code 503). Will retry in {} seconds). Exception: {}\\n\".format(retrytime, e))\n                        else:\n                            sys.stderr.write(\n                                \"Solr temporary unavailable (HTTP status code 503). Will retry in {} seconds). 
Exception: {}\\n\".format(retrytime, e))\n\n                    elif e.response.status_code == 400:\n                        no_connection = False\n\n                        # if error because of empty entity index for that tagger because no entities imported yet, no error message / index as fail\n                        empty_entity_index = False\n                        try:\n                            errorstatus = e.response.json()\n                            if errorstatus['error']['msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':\n                                empty_entity_index = True\n                        except:\n                            pass\n\n                        if not empty_entity_index:\n                            etl.error_message(\n                                docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)\n\n                    else:\n                        no_connection = False\n                        etl.error_message(\n                            docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)\n\n                except BaseException as e:\n                    no_connection = False\n                    etl.error_message(\n                        docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)\n\n            if verbose:\n                print(\"Named Entity Linking by Tagger {}: {}\".format(\n                    entity_linking_tagger, results))\n\n            # write entities from result to document facets\n            for match in results:\n                for candidate in results[match]['result']:\n                    if candidate['match']:\n                        for facet in candidate['type']:\n\n                            # use different facet for fuzzy/stemmed matches\n                            if not entity_linking_tagger == 'all_labels_ss_tag':\n                                # do not use another different facet if same stemmer but forced / not document language dependent\n                                entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(\n                                    '_stemming_force_', '_stemming_')\n                                facet = facet + entity_linking_tagger_withoutforceoption + '_ss'\n\n                            etl_plugin_core.append(data, facet, candidate['name'])\n                            etl_plugin_core.append(data, facet + '_uri_ss',\n                                       candidate['id'])\n                            etl_plugin_core.append(data, facet + '_preflabel_and_uri_ss',\n                                       candidate['name'] + ' <' + candidate['id'] + '>')\n\n                            if 'matchtext' in candidate:\n                                for matchtext in candidate['matchtext']:\n                                    etl_plugin_core.append(\n                                        data, facet + '_matchtext_ss', candidate['id'] + \"\\t\" + matchtext)\n\n                            for taxonomy_field in taxonomy_fields:\n                                if taxonomy_field in candidate:\n                                    separated_taxonomy_fields = taxonomy2fields(\n                                        taxonomy=candidate[taxonomy_field], field=facet)\n                                    for separated_taxonomy_field in separated_taxonomy_fields:\n                                        etl_plugin_core.append(\n                                            data, 
separated_taxonomy_field, separated_taxonomy_fields[separated_taxonomy_field])\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_extract_email.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n#\n# extract email addresses\n#\n\nclass enhance_extract_email(object):\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        # collect/copy to be analyzed text from all fields\n        text = etl_plugin_core.get_text(data=data)\n            \n\n        for match in re.finditer('[\\w\\.-]+@[\\w\\.-]+', text, re.IGNORECASE):\n            value = match.group(0)\n            etl_plugin_core.append(data, 'email_ss', value)\n\n\n        # if extracted email addresses from data, do further analysis for separated specialized facets\n        if 'email_ss' in data:\n\n            # extract email adresses of sender (from)\n            for match in re.finditer('From: (.* )?([\\w\\.-]+@[\\w\\.-]+)', text, re.IGNORECASE):\n                value = match.group(2)\n                etl_plugin_core.append(data, 'Message-From_ss', value)\n\n            # extract email adresses (to)\n            for match in re.finditer('To: (.* )?([\\w\\.-]+@[\\w\\.-]+)', text, re.IGNORECASE):\n                value = match.group(2)\n                etl_plugin_core.append(data, 'Message-To_ss', value)\n\n            # extract the domain part from all emailadresses to facet email domains\n            data['email_domain_ss'] = []\n            emails = data['email_ss']\n            if not isinstance(emails, list):\n                emails = [emails]\n\n            for email in emails:\n                domain = email.split('@')[1]\n                etl_plugin_core.append(data, 'email_domain_ss', domain)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_extract_hashtags.py",
    "content": "import etl_plugin_core\n\n# Extract text from filename\nclass enhance_extract_hashtags(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        minimallenght = 3\n\n        # collect/copy to be analyzed text from all fields\n        text = etl_plugin_core.get_text(data=data)\n\n        data['hashtag_ss'] = [word for word in text.split() if (\n            word.startswith(\"#\") and len(word) > minimallenght)]\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_extract_law.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n\n#\n# get taxonomy for aggregated facets / filters\n#\n\n# example: '§ 153 Abs. 1 Satz 2' -> ['§ 153', '§ 153 Absatz 1', '§ 153 Absatz 1 Satz 2']\n\n# todo:\n\ndef get_taxonomy(law_clause, law_code = None):\n\n    law_clauses = [law_clause]\n    \n    return law_clauses\n\n\n#1.a\n#1(2)\n#1 (2)\n\n\n#\n# extract law codes\n#\n\nclass enhance_extract_law(etl_plugin_core.Plugin):\n    \n    def process(self, parameters=None, data=None):\n        \n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        clause_prefixes = [\n            '§',\n            'Article',\n            'Artikel',\n            'Art',\n            'Section',\n            'Sec',\n        ]\n\n        clause_subsections = [\n            'Abschnitt',\n            'Absatz',\n            'Abs',\n            'Sentence',\n            'Satz',\n            'S',\n            'Halbsatz',\n            'Number',\n            'Nummer',\n            'Nr',\n            'Buchstabe',\n        ]\n\n        text = etl_plugin_core.get_text(data)\n\n\n        clauses = []\n\n        rule = '(' + '|'.join(clause_prefixes) + ')\\W*((\\d+\\W\\w(\\W|\\b))|(\\d+\\w?))(\\W?(' + '|'.join(clause_subsections) + ')\\W*(\\d+\\w?|\\w(\\W|\\b)))*'\n        for match in re.finditer(rule, text, re.IGNORECASE):\n            clause = match.group(0)\n\n            clause = clause.strip()\n\n            clauses.append(clause)\n\n            # if \"§123\" normalize to \"§ 123\"\n            if clause[0] == '§' and not clause[1] == ' ':\n                clause = '§ ' + clause[1:]\n\n            etl_plugin_core.append(data, 'law_clause_ss', clause)\n\n        code_matchtexts = etl_plugin_core.get_all_matchtexts(data.get('law_code_ss_matchtext_ss', []))\n        code_matchtexts_with_clause = []\n\n        preflabels = {}\n        if 'law_code_ss_preflabel_and_uri_ss' in data:\n            preflabels = etl_plugin_core.get_preflabels(data['law_code_ss_preflabel_and_uri_ss'])\n\n        if len(clauses)>0 and len(code_matchtexts)>0:\n\n            text = text.replace(\"\\n\", \" \")\n\n            for code_match_id in code_matchtexts:\n\n                #get only matchtext (without ID/URI of matching entity)\n                for code_matchtext in code_matchtexts[code_match_id]:\n    \n                    for clause in clauses:\n                        if clause + \" \" + code_matchtext in text or code_matchtext + \" \" + clause in text:\n                            \n                            code_matchtexts_with_clause.append(code_matchtext)\n                            \n                            # if \"§123\" normalize to \"§ 123\"\n                            if clause[0] == '§' and not clause[1] == ' ':\n                                clause = '§ ' + clause[1:]\n    \n                            law_code_preflabel = code_match_id\n                            if code_match_id in preflabels:\n                                law_code_clause_normalized = clause + \" \" + preflabels[code_match_id]\n                            else:\n                                law_code_clause_normalized = clause + \" \" + code_match_id\n     \n                            etl_plugin_core.append(data, 'law_code_clause_ss', law_code_clause_normalized)\n\n        if len(code_matchtexts)>0:\n            \n            blacklist = []\n            listfile = 
open('/etc/opensemanticsearch/blacklist/enhance_extract_law/blacklist-lawcode-if-no-clause')\n            for line in listfile:\n                line = line.strip()\n                if line and not line.startswith(\"#\"):\n                    blacklist.append(line)\n            listfile.close()\n\n            if not isinstance(data['law_code_ss_matchtext_ss'], list):\n                data['law_code_ss_matchtext_ss'] = [data['law_code_ss_matchtext_ss']]\n\n            blacklisted_code_ids = []\n            for code_match_id in code_matchtexts:\n                for code_matchtext in code_matchtexts[code_match_id]:\n                    if code_matchtext in blacklist:\n                        if code_matchtext not in code_matchtexts_with_clause:\n                            blacklisted_code_ids.append(code_match_id)\n                            data['law_code_ss_matchtext_ss'].remove(code_match_id + \"\\t\" + code_matchtext)\n\n            code_matchtexts = etl_plugin_core.get_all_matchtexts(data.get('law_code_ss_matchtext_ss', []))\n\n            if not isinstance(data['law_code_ss'], list):\n                data['law_code_ss'] = [data['law_code_ss']]\n            if not isinstance(data['law_code_ss_preflabel_and_uri_ss'], list):\n                data['law_code_ss_preflabel_and_uri_ss'] = [data['law_code_ss_preflabel_and_uri_ss']]\n\n            for blacklisted_code_id in blacklisted_code_ids:\n                if blacklisted_code_id not in code_matchtexts:\n                    data['law_code_ss'].remove(preflabels[blacklisted_code_id])\n                    data['law_code_ss_preflabel_and_uri_ss'].remove(preflabels[blacklisted_code_id] + ' <' + blacklisted_code_id + '>')\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_extract_money.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\nfrom numerizer import numerize\n\n#\n# extract money\n#\n\nclass enhance_extract_money(etl_plugin_core.Plugin):\n\n    # todo: all other currency signs from Wikidata\n    currency_signs = ['$', '€']\n\n    def process(self, parameters=None, data=None):\n\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        moneys = set(data.get('money_ss', []))\n\n        text = etl_plugin_core.get_text(data)\n        text = text.replace(\"\\n\", \" \")\n\n        # convert written numbers like \"one\" and \"two million\" to integer like \"1\" and \"2000000\"\n        if 'language_s' in data:\n            if data['language_s'] == \"en\":\n                text = numerize(text)\n\n        currencies_escaped = []\n\n        # currency signs\n        for currency in self.currency_signs:\n            currencies_escaped.append(re.escape(currency))\n\n        # currency labels\n        matched_currency_labels = etl_plugin_core.get_all_matchtexts(data.get('currency_ss_matchtext_ss', []))\n        for currency_id in matched_currency_labels:\n            #get only matchtext (without ID/URI of matching entity)\n            for matchtext in matched_currency_labels[currency_id]:\n                currencies_escaped.append(re.escape(matchtext))\n\n        regex_part_number = '\\d+((\\.|\\,)\\d+)*'\n        regex_part_currencies = '(' + '|'.join(currencies_escaped) + ')'\n\n        rule = regex_part_number + '\\s?' + regex_part_currencies\n        for match in re.finditer(rule, text, re.IGNORECASE):\n            moneys.add(match.group(0))\n\n        rule = regex_part_currencies + '\\s?' + regex_part_number\n        for match in re.finditer(rule, text, re.IGNORECASE):\n            moneys.add(match.group(0))\n\n        data['money_ss'] = list(moneys)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_extract_phone.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n#\n# normalize phone number (remove all non-numeric chars except leading +)\n# so same number is used for aggregations/facet filters, even if written in different formats (with or without space(s) and hyphen(s))\n#\n\ndef normalize_phonenumber(phone):\n    chars = ['+','0','1','2','3','4','5','6','7','8','9']\n    phone_normalized = ''\n    for char in phone:\n        if char in chars:\n            # only first +\n            if char == '+':\n                if not phone_normalized:\n                    phone_normalized = '+'\n            else:\n                phone_normalized += char\n\n    return phone_normalized\n\n\n#\n# extract phone number(s)\n#\n\nclass enhance_extract_phone(object):\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        # collect/copy to be analyzed text from all fields\n        text = etl_plugin_core.get_text(data=data)\n\n        for match in re.finditer('[\\+\\(]?[1-9][0-9 .\\-\\(\\)]{8,}[0-9]', text, re.IGNORECASE):\n            value = match.group(0)\n            etl_plugin_core.append(data, 'phone_ss', value)\n\n\n        # if extracted phone number(s), normalize to format that can be used for aggregation/filters\n\n        if 'phone_ss' in data:\n\n            phones = data['phone_ss']\n            if not isinstance(phones, list):\n                phones = [phones]\n\n            for phone in phones:\n                phone_normalized = normalize_phonenumber(phone)\n                etl_plugin_core.append(data, 'phone_normalized_ss', phone_normalized)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_extract_text_tika_server.py",
    "content": "import os\nimport tempfile\nimport sys\nimport time\nimport requests\n\n\ndef in_parsers(parser, parsers):\n\n    for value in parsers:\n        if isinstance(value, list):\n            for subvalue in value:\n                if subvalue == parser:\n                    return True\n        else:\n            if value == parser:\n                return True\n\n    return False\n\n\n# Extract text from file(name)\nclass enhance_extract_text_tika_server(object):\n\n    mapping = {\n        'Content-Type': 'content_type_ss',\n        'dc:creator': 'author_ss',\n        'Content-Encoding': 'Content-Encoding_ss',\n        'dc:title': 'title_txt',\n        'dc:subject': 'subject_ss',\n    }\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = parameters['filename']\n\n        tika_log_path = tempfile.mkdtemp(prefix=\"tika-python-\")\n        os.environ['TIKA_LOG_PATH'] = tika_log_path\n\n        os.environ['TIKA_CLIENT_ONLY'] = 'True'\n\n        import tika\n        from tika import parser\n\n        tika.TikaClientOnly = True\n\n        headers = {}\n\n        do_ocr = parameters.get('ocr', False)\n        \n        do_ocr_pdf_tika = parameters.get('ocr_pdf_tika', True)\n        do_ocr_pdf = False\n        if 'plugins' in parameters:\n            if 'enhance_pdf_ocr' in parameters['plugins'] and do_ocr_pdf_tika:\n                do_ocr_pdf = True\n\n        # if only OCR for PDF enabled (enhance_pdf_ocr as fallback and OCR by tika enabled) but not OCR for image files,\n        # run OCR only if file ending .pdf so disabled OCR for other file types\n        if do_ocr_pdf and not do_ocr:\n            contenttype = data.get('content_type_ss', None)\n            if isinstance(contenttype, list):\n                contenttype = contenttype[0]\n\n            if contenttype == 'application/pdf' or filename.lower().endswith('.pdf'):\n                do_ocr_pdf = True\n            else:\n                do_ocr_pdf = False\n\n        if 'ocr_lang' in parameters:\n            headers['X-Tika-OCRLanguage'] = parameters['ocr_lang']\n        \n        if do_ocr or do_ocr_pdf:\n\n            if os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER'):\n                tika_server = os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER')\n            elif 'tika_server' in parameters:\n                tika_server = parameters['tika_server']\n            else:\n                tika_server = 'http://localhost:9998'\n\n            # OCR embedded images in PDF, if not disabled or has to be done by other plugin\n            if do_ocr_pdf:\n                headers['X-Tika-PDFextractInlineImages'] = 'true'\n            else:\n                headers['X-Tika-PDFextractInlineImages'] = 'false'\n\n            # set OCR status in indexed document\n            data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = True\n            # OCR is enabled, so was done by this Tika call, no images left to OCR\n            data['etl_count_images_yet_no_ocr_i'] = 0\n        \n        else:\n            # OCR (yet) disabled, so use the Tika instance using the fake tesseract so we only get OCR results if in cache\n            # else we get OCR status [Image (No OCR yet)] in content, so we know that there are images to OCR for later steps\n\n            if 
os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER_FAKECACHE'):\n                tika_server = os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER_FAKECACHE')\n            elif 'tika_server_fake_ocr' in parameters:\n                tika_server = parameters['tika_server_fake_ocr']\n            else:\n                tika_server = 'http://localhost:9999'\n\n            headers['X-Tika-PDFextractInlineImages'] = 'true'\n\n            # set OCR status in indexed document, so next stage knows that yet no OCR\n            data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = False\n\n        #\n        # Parse on Apache Tika Server by python-tika\n        #\n        if verbose:\n            print(\"Parsing by Tika Server on {} with additional headers {}\".format(tika_server, headers))\n\n        retries = 0\n        retrytime = 1\n        # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry\n        retrytime_max = 120\n        no_connection = True\n\n        while no_connection:\n            try:\n                if retries > 0:\n                    print(\n                        'Retrying to connect to Tika server in {} second(s).'.format(retrytime))\n                    time.sleep(retrytime)\n                    retrytime = retrytime * 2\n                    if retrytime > retrytime_max:\n                        retrytime = retrytime_max\n\n                parsed = parser.from_file(\n                    filename=filename,\n                    serverEndpoint=tika_server,\n                    headers=headers,\n                    requestOptions={'timeout': 60000})\n\n                no_connection = False\n\n            except requests.exceptions.ConnectionError as e:\n                retries += 1\n                sys.stderr.write(\n                    \"Connection to Tika server (will retry in {} seconds) failed. 
Exception: {}\\n\".format(retrytime, e))\n\n        if parsed['content']:\n            data['content_txt'] = parsed['content']\n\n        tika_exception = False\n        for tika_field in parsed[\"metadata\"]:\n\n            # there is a field name with exceptions, so copy fieldname to failed plugins\n            if 'exception' in tika_field.lower():\n                tika_exception = True\n                parameters['etl_tika_exception'] = True\n                if 'etl_error_plugins_ss' not in data:\n                    data['etl_error_plugins_ss'] = []\n                data['etl_error_plugins_ss'].append(tika_field)\n\n            # copy Tika fields to (mapped) data fields\n            if tika_field in self.mapping:\n                data[self.mapping[tika_field]] = parsed['metadata'][tika_field]\n            else:\n                data[tika_field + '_ss'] = parsed['metadata'][tika_field]\n\n        #\n        # anaylze and (re)set OCR status to prevent (re)process unnecessary tasks of later stage(s)\n        #\n        contenttype = data.get('content_type_ss', None)\n        if isinstance(contenttype, list):\n            contenttype = contenttype[0]\n\n        ocr_status_known = False\n\n        # file was PDF and OCR for PDF enabled, so we know status\n        if do_ocr_pdf:\n            ocr_status_known = True\n\n        # all OCR cases enabled, so we know status\n        if do_ocr and do_ocr_pdf:\n            ocr_status_known = True\n\n        # if no kind of OCR done now, we know status because fake tesseract wrapper\n        if not do_ocr and not do_ocr_pdf:\n            ocr_status_known = True\n        \n        # if OCR for images done but content type is PDF and OCR of PDF by Tika is disabled\n        # (because using other plugin for that) we do not know status for PDF,\n        # since Tika runned without inline OCR for PDF\n        if do_ocr and not do_ocr_pdf:\n            if not contenttype == 'application/pdf':\n                ocr_status_known = True\n\n        if ocr_status_known:\n            \n            # Tika made an tesseract OCR call (if OCR (yet) off, by fake Tesseract CLI wrapper)\n            # so there is really something to OCR?\n            if not in_parsers('org.apache.tika.parser.ocr.TesseractOCRParser', data['X-TIKA:Parsed-By_ss']):\n                # since Tika did not call (fake or cached) tesseract (wrapper), nothing to OCR in this file,\n    \n                if verbose:\n                    print('Tika OCR parser not used, so nothing to OCR in later stages, too')\n                \n                # so set all OCR plugin status and OCR configs to done,\n                # so filter_file_not_modifield in later stage task will prevent reprocessing\n                # because of only this yet not runned plugins or OCR configs\n                data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = True\n                data['etl_count_images_yet_no_ocr_i'] = 0\n    \n                if not tika_exception:\n                    parameters['etl_nothing_for_ocr'] = True\n                    data['etl_enhance_ocr_descew_b'] = True\n                    data['etl_enhance_pdf_ocr_b'] = True\n    \n            else:\n                # OCR parser used by Tika, so there was something to OCR\n    \n                # If in this case the fake tesseract wrapper could get all results from cache,\n                # no additional Tika-Server run with OCR enabled needed\n                # So set Tika-Server OCR status of tika-server to done\n    \n                if not 
do_ocr and 'content_txt' in data:\n    \n                    if verbose:\n                        print(\"Tika OCR parser was used, so there is something to OCR\")\n    \n                    # how many images yet not OCRd because no result from cache\n                    # so we got fake OCR result \"[Image (no OCR yet)]\"\n                    count_images_yet_no_ocr = data['content_txt'].count('[Image (no OCR yet)]')\n                    data['etl_count_images_yet_no_ocr_i'] = count_images_yet_no_ocr\n    \n                    # got all Tika-Server Tesseract OCR results from cache,\n                    # so no additional OCR tasks for later stage\n                    if count_images_yet_no_ocr == 0:\n                        if verbose:\n                            print('But could get all OCR results in this stage from OCR cache')\n                        # therefore set status like OCR related config\n                        # yet runned, so on next stage filter_file_not_modified\n                        # wont process document again only because of OCR\n                        # (but not reset status of other plugins,\n                        # since maybe additional image in changed file)\n                        data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = True\n                        data['etl_count_images_yet_no_ocr_i'] = 0\n\n                        # if not a (maybe changed) PDF, set enhance_pdf_ocr to done, too,\n                        # so no reprocessing because this additional plugin on later stage\n                        if not contenttype == 'application/pdf':\n                            data['etl_enhance_pdf_ocr_b'] = True\n\n        tika_log_file = tika_log_path + os.path.sep + 'tika.log'\n        if os.path.isfile(tika_log_file):\n            os.remove(tika_log_file)\n\n        os.rmdir(tika_log_path)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_file_mtime.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport os.path\nimport datetime\n\n#\n# Add file modification time\n#\n\n\nclass enhance_file_mtime(object):\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = parameters['filename']\n\n        # get modification time from file\n        file_mtime = os.path.getmtime(filename)\n\n        # convert mtime to Lucene format\n        file_mtime_masked = datetime.datetime.fromtimestamp(\n            file_mtime).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n\n        if verbose:\n            print(\"File modification time: {}\".format(file_mtime_masked))\n\n        data['file_modified_dt'] = file_mtime_masked\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_file_size.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport os.path\n\n#\n# add file size\n#\n\n\nclass enhance_file_size(object):\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = parameters['filename']\n\n        # get filesize\n        file_size = os.path.getsize(filename)\n\n        if verbose:\n            print(\"File size: {}\".format(file_size))\n\n        data['file_size_i'] = str(file_size)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_html.py",
    "content": "#\n# Extracts text within configured HTML tags / XML tags\n#\n\nfrom lxml import etree\n\n\nclass enhance_html(object):\n\n    def elements2data(self, element, data, path=None, recursive=True):\n\n        if self.verbose:\n            print(\"Extracting element {}\".format(element.tag))\n\n        if path:\n            path += \"/\" + element.tag\n        else:\n            path = element.tag\n\n        fieldname = path + '_ss'\n\n        text = element.text\n\n        if text:\n            text = text.strip()\n\n        if text:\n            if fieldname in data:\n                data[fieldname].append(text)\n            else:\n                data[fieldname] = [text]\n\n        if recursive:\n            for child in element:\n                data = self.elements2data(\n                    element=child, path=path, data=data, recursive=True)\n\n        return data\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        self.verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                self.verbose = True\n\n        filename = parameters['filename']\n\n        if 'content_type_ss' in data:\n            mimetype = data['content_type_ss']\n        else:\n            mimetype = parameters['content_type_ss']\n\n        # if connector returns a list, use only first value (which is the only entry of the list)\n        if isinstance(mimetype, list):\n            mimetype = mimetype[0]\n\n        if mimetype.startswith('application/xhtml+xml'):\n\n            html_extract_tags = []\n            if 'html_extract_tags' in parameters:\n                html_extract_tags = parameters['html_extract_tags']\n\n            html_extract_tags_and_children = []\n            if 'html_extract_tags_and_children' in parameters:\n                html_extract_tags_and_children = parameters['html_extract_tags_and_children']\n\n            parser = etree.HTMLParser()\n\n            et = etree.parse(filename, parser)\n\n            for xpath in html_extract_tags:\n                for el in et.xpath(xpath):\n                    self.elements2data(element=el, data=data, recursive=False)\n\n            for xpath in html_extract_tags_and_children:\n                for el in et.xpath(xpath):\n                    self.elements2data(element=el, data=data)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_mapping_id.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\n\n#\n# Map paths or domains\n#\n\nclass enhance_mapping_id(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        if 'mappings' in parameters:\n            parameters['id'] = mapping(\n                value=parameters['id'], mappings=parameters['mappings'])\n\n        return parameters, data\n\n\n# Change value with best/deepest mapping\ndef mapping(value, mappings=None):\n    if mapping is None:\n        mappings = {}\n\n    max_match_len = -1\n\n    # check all mappings for matching and use the best\n    for map_from, map_to in mappings.items():\n\n        # map from matching value?\n        if value.startswith(map_from):\n\n            # if from string longer (deeper path), this is the better matching\n            match_len = len(map_from)\n\n            if match_len > max_match_len:\n                max_match_len = match_len\n                best_match_map_from = map_from\n                best_match_map_to = map_to\n\n    # if there is a match, replace first occurance of value with mapping\n    if max_match_len >= 0:\n        value = value.replace(best_match_map_from, best_match_map_to, 1)\n\n    return value\n\n\n# Change mapped value to origin value\ndef mapping_reverse(value, mappings=None):\n    if mapping is None:\n        mappings = {}\n\n    max_match_len = -1\n\n    # check all mappings for matching and use the best\n    for map_from, map_to in mappings.items():\n\n        # map from matching value?\n        if value.startswith(map_to):\n\n            # if from string longer (deeper path), this is the better matching\n            match_len = len(map_to)\n\n            if match_len > max_match_len:\n                max_match_len = match_len\n                best_match_map_from = map_from\n                best_match_map_to = map_to\n\n    # if there is a match, replace first occurance of value with reverse mapping\n    if max_match_len >= 0:\n        value = value.replace(best_match_map_to, best_match_map_from, 1)\n\n    return value\n"
  },
  {
    "path": "src/opensemanticetl/enhance_mimetype.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport magic\n\n\n#\n# Get MimeType (Which kind of file is this?)\n#\nclass enhance_mimetype(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = parameters['filename']\n\n        mimetype = None\n\n        m = magic.open(magic.MAGIC_MIME)\n        m.load()\n        mimetype = m.file(filename)\n        m.close()\n\n        if verbose:\n            print(\"Detected MimeType: {}\".format(mimetype))\n\n        data['content_type_magic_s'] = mimetype\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_multilingual.py",
    "content": "#\n# Multilinguality\n#\n# Copy content language specific dynamic fields for language specific analysis like stemming, grammar or synonyms\n#\n# Language has been detected before by plugin enhance_detect_language using Apache Tika / OpenNLP\n#\n\n\nclass enhance_multilingual(object):\n\n    verbose = False\n\n    # languages that are defined in index schema for language specific analysis and used if autodetected as documents language\n    languages = ['en', 'fr', 'de', 'es', 'hu', 'pt',\n                 'nl', 'ro', 'ru', 'it', 'cz', 'ar', 'fa']\n    languages_hunspell = ['hu']\n\n    # languages for language specific analysis even if not the autodetected document language\n    languages_force = []\n    languages_force_hunspell = []\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        if 'verbose' in parameters:\n            self.verbose = parameters['verbose']\n\n        if 'languages' in parameters:\n            self.languages = parameters['languages']\n\n        if 'languages_hunspell' in parameters:\n            self.languages_hunspell = parameters['languages_hunspell']\n\n        if 'languages_force' in parameters:\n            self.languages_force = parameters['languages_force']\n\n        if 'languages_force_hunspell' in parameters:\n            self.languages_force_hunspell = parameters['languages_force_hunspell']\n\n        if 'languages_exclude_fields' in parameters:\n            self.exclude_fields = parameters['languages_exclude_fields']\n\n        if 'languages_exclude_fields_map' in parameters:\n            self.exclude_fields_map = parameters['languages_exclude_fields_map']\n\n        language = data.get('language_s', None)\n\n        #\n        # exclude fields like technical metadata\n        #\n    \n        exclude_prefix = []\n    \n        listfile = open('/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix')\n        for line in listfile:\n            line = line.strip()\n            if line and not line.startswith(\"#\"):\n                exclude_prefix.append(line)\n        listfile.close()\n    \n        # suffixes of non-text fields like nubers\n        exclude_suffix = []\n    \n        listfile = open('/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix')\n        for line in listfile:\n            line = line.strip()\n            if line and not line.startswith(\"#\"):\n                exclude_suffix.append(line)\n        listfile.close()\n    \n        # full fieldnames\n        exclude_fields = []\n        listfile = open('/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname')\n        for line in listfile:\n            line = line.strip()\n            if line and not line.startswith(\"#\"):\n                exclude_fields.append(line)\n        listfile.close()\n    \n        exclude_fields_map = {}\n\n        language_fields = ['_text_']\n        language_specific_data = {}\n\n        # language specific analysis for recognized language of document\n        # if language support of detected language in index schema\n        if language in self.languages:\n            language_fields.append(\"text_txt_\" + language)\n\n        if language in self.languages_hunspell:\n            language_fields.append(\"text_txt_hunspell_\" + language)\n\n        # fields for language specific analysis by forced languages even if other language or false recognized language\n        
for language_force in self.languages_force:\n\n            language_field = \"text_txt_\" + language_force\n\n            if not language_field in language_fields:\n                language_fields.append(language_field)\n\n        for language_force in self.languages_force_hunspell:\n\n            language_field = \"text_txt_hunspell_\" + language_force\n\n            if not language_field in language_fields:\n                language_fields.append(language_field)\n\n        # copy each data field to language specific field with suffix _txt_$language\n        for fieldname in data:\n\n            exclude = False\n\n            # do not copy excluded fields\n            for exclude_field in exclude_fields:\n                if fieldname == exclude_field:\n                    exclude = True\n\n            for prefix in exclude_prefix:\n                if fieldname.startswith(prefix):\n                    exclude = True\n\n            for suffix in exclude_suffix:\n                if fieldname.endswith(suffix):\n                    exclude = True\n\n            if not exclude and data[fieldname]:\n\n                # copy field to default field with added suffixes for language dependent stemming/analysis\n                for language_field in language_fields:\n\n                    excluded_by_mapping = False\n\n                    if language_field in exclude_fields_map:\n                        if fieldname in exclude_fields_map[language_field]:\n                            excluded_by_mapping = True\n                            if self.verbose:\n                                print(\"Multilinguality: Excluding field {} to be copied to {} by config of exclude_field_map\".format(\n                                    fieldname, language_field))\n\n                    if not excluded_by_mapping:\n                        if self.verbose:\n                            print(\"Multilinguality: Add {} to {}\".format(\n                                fieldname, language_field))\n\n                        if not language_field in language_specific_data:\n                            language_specific_data[language_field] = []\n\n                        if isinstance(data[fieldname], list):\n                            language_specific_data[language_field].extend(\n                                data[fieldname])\n                        else:\n                            language_specific_data[language_field].append(\n                                data[fieldname])\n\n        # append language specific fields to data\n        for key in language_specific_data:\n            data[key] = language_specific_data[key]\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_ner_spacy.py",
    "content": "import etl\nimport requests\nimport json\nimport os\nimport sys\nimport time\n\n#\n# SpaCy Named Entity Recognizer (NER)\n#\n\n# Appends classified (Persons, Locations, Organizations) entities (names/words) to mapped facets/fields\n\nclass enhance_ner_spacy(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        if 'spacy_ner_mapping' in parameters:\n            mapping = parameters['spacy_ner_mapping']\n        else:\n            mapping = {\n                'ORG': 'organization_ss',\n                'NORP': 'organization_ss',\n                'orgName': 'organization_ss',\n                'ORGANIZATION': 'organization_ss',\n                'PER': 'person_ss',\n                'PERSON': 'person_ss',\n                'persName': 'person_ss',\n                'GPE': 'location_ss',\n                'LOC': 'location_ss',\n                'placeName': 'location_ss',\n                'FACILITY': 'location_ss',\n                'PRODUCT': 'product_ss',\n                'EVENT': 'event_ss',\n                'LAW': 'law_ss',\n                'DATE': 'date_ss',\n                'TIME': 'time_ss',\n                'MONEY': 'money_ss',\n                'WORK_OF_ART': 'work_of_art_ss',\n            }\n\n        # default classifier\n        classifier = 'en_core_web_sm'\n\n        if 'spacy_ner_classifier_default' in parameters:\n            classifier = parameters['spacy_ner_classifier_default']\n\n        # set language specific classifier, if configured and document language detected\n        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:\n            # is a language specific classifier there for the detected language?\n            if data['language_s'] in parameters['spacy_ner_classifiers']:\n                classifier = parameters['spacy_ner_classifiers'][data['language_s']]\n\n        # if standard classifier configured to None and no classifier for detected language, exit the plugin\n        if not classifier:\n            return parameters, data\n\n        if verbose:\n            print(\"Using SpaCY NER language / classifier: {}\".format(classifier))\n\n        analyse_fields = ['title_txt', 'content_txt',\n                          'description_txt', 'ocr_t']\n\n        text = ''\n        for field in analyse_fields:\n            if field in data:\n                text = \"{}{}\\n\".format(text, data[field])\n\n        # classify/tag with class each word of the content\n\n        url = \"http://localhost:8080/ent\"\n        if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):\n            url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/ent'\n\n        headers = {'content-type': 'application/json'}\n        d = {'text': text, 'model': classifier}\n\n        retries = 0\n        retrytime = 1\n        # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry\n        retrytime_max = 120\n        no_connection = True\n\n        while no_connection:\n            try:\n                if retries > 0:\n                    print(\n                        'Retrying to connect to Spacy services in {} second(s).'.format(retrytime))\n                    time.sleep(retrytime)\n                    retrytime = retrytime * 2\n                    if retrytime 
> retrytime_max:\n                        retrytime = retrytime_max\n\n                response = requests.post(\n                    url, data=json.dumps(d), headers=headers)\n                # if bad status code, raise exception\n                response.raise_for_status()\n\n                no_connection = False\n\n            except requests.exceptions.ConnectionError as e:\n                retries += 1\n                sys.stderr.write(\n                    \"Connection to Spacy services (will retry in {} seconds) failed. Exception: {}\\n\".format(retrytime, e))\n\n        r = response.json()\n\n        for ent in r:\n\n            entity_class = ent['label']\n            # get entity string from returned start and end value\n            entity = text[int(ent['start']): int(ent['end'])]\n\n            # strip whitespaces from begin and end\n            entity = entity.strip()\n\n            # after strip exclude empty entities\n            if not entity:\n                continue\n\n            # if class of entity is mapped to a facet/field, append the entity to this facet/field\n\n            if entity_class in mapping:\n\n                if verbose:\n                    print(\"NER classified word(s)/name {} to {}. Appending to mapped facet {}\".format(\n                        entity, entity_class, mapping[entity_class]))\n\n                etl.append(data, mapping[entity_class], entity)\n\n            else:\n                if verbose:\n                    print(\"Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}\".format(entity_class, entity))\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_ner_stanford.py",
    "content": "import etl\nfrom nltk.tag.stanford import StanfordNERTagger\n\n\n#\n# Stanford Named Entitiy Recognizer (NER)\n#\n\n# Appends classified (Persons, Locations, Organizations) entities (names/words) to mapped facets/fields\n\nclass enhance_ner_stanford(object):\n\n    # compound words of same class to multi word entities (result is a split by class changes instead of split on single words/tokens)\n    def multi_word_entities(self, entities):\n\n        multi_word_entities = []\n        multi_word_entity = \"\"\n        last_entity_class = \"\"\n\n        i = 0\n\n        for entity, entity_class in entities:\n\n            i += 1\n\n            class_change = False\n\n            # new entity class different from last words which had been joined?\n            if last_entity_class:\n                if entity_class != last_entity_class:\n                    class_change = True\n\n            # if new class add last values to dictionary and begin new multi word entity\n            if class_change:\n                multi_word_entities.append(\n                    (multi_word_entity, last_entity_class))\n                multi_word_entity = \"\"\n\n            # add new word to multi word entity\n            if multi_word_entity:\n                multi_word_entity += \" \" + entity\n            else:\n                multi_word_entity = entity\n\n            # if last entity, no next class change, so add now\n            if i == len(entities):\n                multi_word_entities.append((multi_word_entity, entity_class))\n\n            last_entity_class = entity_class\n\n        return multi_word_entities\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        if 'stanford_ner_mapping' in parameters:\n            mapping = parameters['stanford_ner_mapping']\n        else:\n            # todo: extend mapping for models with more classes like dates\n            mapping = {\n                'PERSON': 'person_ss',\n                'LOCATION': 'location_ss',\n                'ORGANIZATION': 'organization_ss',\n                'I-ORG': 'organization_ss',\n                'I-PER': 'person_ss',\n                'I-LOC': 'location_ss',\n                'ORG': 'organization_ss',\n                'PER': 'person_ss',\n                'LOC': 'location_ss',\n                'PERS': 'person_ss',\n                'LUG': 'location_ss',\n                'MONEY': 'money_ss',\n            }\n\n        # default classifier\n        classifier = 'english.all.3class.distsim.crf.ser.gz'\n\n        if 'stanford_ner_classifier_default' in parameters:\n            classifier = parameters['stanford_ner_classifier_default']\n\n        # set language specific classifier, if configured and document language detected\n        if 'stanford_ner_classifiers' in parameters and 'language_s' in data:\n            # is a language speciic cassifier there for the detected language?\n            if data['language_s'] in parameters['stanford_ner_classifiers']:\n                classifier = parameters['stanford_ner_classifiers'][data['language_s']]\n\n        # if standard classifier configured to None and no classifier for detected language, exit the plugin\n        if not classifier:\n            return parameters, data\n\n        kwargs = {}\n\n        if 
'stanford_ner_java_options' in parameters:\n            kwargs['java_options'] = parameters['stanford_ner_java_options']\n\n        if 'stanford_ner_path_to_jar' in parameters:\n            kwargs['path_to_jar'] = parameters['stanford_ner_path_to_jar']\n\n        analyse_fields = ['title_txt', 'content_txt',\n                          'description_txt', 'ocr_t', 'ocr_descew_t']\n\n        text = ''\n        for field in analyse_fields:\n            if field in data:\n                text = \"{}{}\\n\".format(text, data[field])\n\n        # classify/tag with class each word of the content\n        st = StanfordNERTagger(classifier, encoding='utf8',\n                               verbose=verbose, **kwargs)\n        entities = st.tag(text.split())\n\n        # compound words of same class to multi word entities (result is a split by class changes instead of split on single words/tokens)\n        entities = self.multi_word_entities(entities)\n\n        # if class of entity is mapped to a facet/field, append the entity to this facet/field\n        for entity, entity_class in entities:\n\n            if entity_class in mapping:\n\n                if verbose:\n                    print(\"NER classified word(s)/name {} to {}. Appending to mapped facet {}\".format(\n                        entity, entity_class, mapping[entity_class]))\n\n                etl.append(data, mapping[entity_class], entity)\n\n            else:\n                if verbose:\n                    print(\"Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}\".format(entity_class, entity))\n\n        # mark the document, that it was analyzed by this plugin yet\n        data['enhance_ner_stanford_b'] = \"true\"\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_ocr.py",
    "content": "from tesseract_cache import tesseract_cache\n\n\n#\n# If image add ocr text\n#\nclass enhance_ocr(object):\n\n    # how to find uris which are not enriched yet?\n    # (if not enhanced on indexing but later)\n\n    # this plugin needs to read the field id as a\n    # parameters to enrich unenriched docs\n    fields = ['id', 'content_type']\n\n    # query to find documents, that were not enriched by this plugin yet\n    # (since we marked documents which were OCRd with ocr_b = true\n    query = \"content_type: image/* AND NOT enhance_ocr_b:true\"\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = parameters.get('verbose', False)\n\n        filename = parameters['filename']\n\n        if 'content_type_ss' in data:\n            mimetype = data['content_type_ss']\n        else:\n            mimetype = parameters['content_type_ss']\n\n        # if connector returns a list, use only first\n        # value (which is the only entry of the list)\n        if isinstance(mimetype, list):\n            mimetype = mimetype[0]\n\n        lang = parameters.get('ocr_lang', 'eng')\n\n        if \"image\" in mimetype.lower():\n            if verbose:\n                print(\"Mimetype seems image ({}), starting OCR\"\n                      .format(mimetype))\n\n            ocr_txt = tesseract_cache.get_ocr_text(filename=filename, lang=lang, cache_dir=parameters.get(\"ocr_cache\"))\n\n            if ocr_txt:\n                data['ocr_t'] = ocr_txt\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_path.py",
    "content": "import os.path\n\n#\n# Build and add path facets from filename\n#\n\nclass enhance_path(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        docid = parameters['id']\n\n        filename_extension = os.path.splitext(docid)[1][1:].lower()\n        if filename_extension:\n            data['filename_extension_s'] = filename_extension\n\n        if 'facet_path_strip_prefix' in parameters:\n            facet_path_strip_prefix = parameters['facet_path_strip_prefix']\n        else:\n            facet_path_strip_prefix = ['file://', 'http://', 'https://']\n\n        # if begins with unwanted path prefix strip it\n        if facet_path_strip_prefix:\n            for prefix in facet_path_strip_prefix:\n                if docid.startswith(prefix):\n                    docid = docid.replace(prefix, '', 1)\n                    break\n\n        # replace backslash (i.e. windows filenames) with unix path seperator\n        docid = docid.replace(\"\\\\\", '/')\n\n        # replace # (i.e. uri) with unix path seperator\n        docid = docid.replace(\"#\", '/')\n\n        # if more than one /\n        docid = docid.replace(\"//\", '/')\n\n        # split paths\n        path = docid.split('/')\n\n        # it's only a domain\n        if (len(path) == 1) or (len(path) == 2 and docid.endswith('/')):\n            data['path0_s'] = path[0]\n\n        else:\n            # it's a path\n\n            # if leading / on unix paths, split leads to first element empty, so delete it\n            if not path[0]:\n                del path[0]\n\n            i = 0\n            for subpath in path:\n\n                if i == len(path) - 1:\n                    # last element, so basename/pure filename without path\n                    if subpath:  # if not ending / so empty last part after split\n                        data['path_basename_s'] = subpath\n                else:\n                    # not last path element (=filename), so part of path, not the filename at the end\n                    data['path' + str(i) + '_s'] = subpath\n                    i += 1\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_pdf_ocr.py",
    "content": "import os.path\nimport sys\nimport subprocess\nimport hashlib\nimport tempfile\nimport json\n\nimport etl_plugin_core\nfrom tesseract_cache import tesseract_cache\n\n\n# Extract text from all extracted images from pdf\n# if splitpages is off, return one txt instead of page based list of texts\n\ndef pdfimages2text(filename, lang='eng', verbose=False,\n                   pdf_ocr=True,\n                   cache=None):\n    ocr_txt = {}\n    if cache is not None:\n        try:\n            return load_cache(filename, cache, lang, pdf_ocr)\n        except (FileNotFoundError, KeyError):\n            if verbose:\n                print('Not in PDF OCR cache, starting OCR for {}'.format(filename))\n\n    ocr_temp_dirname = tempfile.mkdtemp(prefix=\"opensemanticetl_pdf_ocr_\")\n\n    # Extract all images of the pdf to tempdir with commandline tool\n    # \"pdfimages\" from poppler pdf toolbox\n    # -j = export as JPEG\n    # -p = write page name in image filename\n    result = subprocess.call(\n        ['pdfimages', '-p', '-j', filename,\n         ocr_temp_dirname + os.path.sep + 'image'])\n\n    if result != 0:\n        sys.stderr.write(\n            \"Error: Extracting images from PDF failed for {} {}\"\n            .format(filename, result))\n        return {}, {}\n\n    images = os.listdir(ocr_temp_dirname)\n    images.sort()\n\n    for image in images:\n\n        imagefilename = ocr_temp_dirname + os.path.sep + image\n\n        if pdf_ocr:\n\n            try:\n                result = tesseract_cache.get_ocr_text(filename=imagefilename, lang=lang, cache_dir=cache)\n\n                if result:\n                    # extract page number from extracted image\n                    # filename (image-pagenumber-imagenumber.jpg)\n                    pagenumber = int(image.split('-')[1])\n\n                    append_page(ocr_txt, pagenumber, result)\n            except BaseException as e:\n                sys.stderr.write(\"Exception while OCR of PDF: {} - \"\n                                 \"maybe corrupt image: {} - exception: {}\\n\"\n                                 .format(filename, imagefilename, e))\n\n        os.remove(imagefilename)\n\n    os.rmdir(ocr_temp_dirname)\n    return ocr_txt\n\n\ndef load_cache(filename, cache, lang='eng',\n               pdf_ocr=True):\n    pdffile = open(filename, 'rb')\n    md5hash = hashlib.md5(pdffile.read()).hexdigest()\n    pdffile.close()\n    ocr_cache_filename = cache + os.path.sep + \\\n        \"{}-{}.json\".format(lang, md5hash)\n    with open(ocr_cache_filename) as f:\n        dct = json.load(f)\n        ocr_txt = None\n        if pdf_ocr:\n            ocr_txt = dict(enumerate(dct[\"ocr_txt\"], 1))\n        return ocr_txt\n\n\ndef append_page(dct, n, page):\n    if n in dct:\n        dct[n] += '\\n' + page\n    else:\n        dct[n] = page\n\n#\n# Process plugin\n#\n# check if content type PDF, if so start enrich pdf process for OCR\n#\n\nclass enhance_pdf_ocr(etl_plugin_core.Plugin):\n\n    # process plugin, if one of the filters matches\n    filter_filename_suffixes = ['.pdf']\n    filter_mimetype_prefixes = ['application/pdf']\n\n    # how to find uris which are not enriched yet?\n    # (if not enhanced on indexing but later)\n\n    # this plugin needs to read the field id as a parameters\n    # to enrich unenriched docs\n    fields = ['id', 'content_type']\n\n    # query to find documents, that were not enriched by this plugin yet\n    # (since we marked documents which were OCRd with ocr_b = true\n    query = 
(\"(content_type:application/pdf*) \"\n             \"AND NOT (etl_enhance_pdf_ocr_b:true)\")\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = parameters.get('verbose', False)\n\n        # no further processing, if plugin filters like for content type do not match\n        if self.filter(parameters, data):\n            return parameters, data\n    \n        filename = parameters['filename']\n\n        # is OCR of embedded images by Tika enabled or disabled by config?\n        ocr_pdf_tika = parameters.get('ocr_pdf_tika', True)\n\n        # was there a Tika exception?\n        tika_exception = parameters.get('etl_tika_exception', False)\n        if 'etl_error_plugins_ss' in data:\n            if 'enhance_extract_text_tika_server' in data['etl_error_plugins_ss']:\n                tika_exception = True\n\n\n        # OCR is done by Apache Tika plugin\n        # If standard OCR by Tika is disabled or Tika Exception, do it here\n        pdf_ocr = False\n\n        # Do not run if no images (detected by Tika plugin)\n        nothing_for_ocr = parameters.get('etl_nothing_for_ocr', False)\n\n        if nothing_for_ocr:\n\n            if verbose:\n                print('Not running OCR for PDF, since no image(s) detected by Apache Tika')\n            \n            pdf_ocr = False\n\n        elif tika_exception or ocr_pdf_tika == False:\n            pdf_ocr = True\n\n        if pdf_ocr:\n    \n            if verbose:\n                print('Mimetype is PDF or file ending is .pdf, running OCR of embedded images')\n\n                if not ocr_pdf_tika:\n                    print ('OCR of embedded images in PDF by Apache Tika is disabled, so doing OCR for PDF by plugin enhance_pdf_ocr')\n                elif tika_exception:\n                    print ('Because of Apache Tika exception, adding / trying fallback OCR for PDF by plugin enhance_pdf_ocr')\n\n            lang = parameters.get('ocr_lang', 'eng')\n    \n            ocr_txt = {}\n\n            try:\n                ocr_txt = pdfimages2text(\n                    filename=filename, lang=lang, verbose=verbose,\n                    pdf_ocr=pdf_ocr,\n                    cache=parameters.get(\"ocr_cache\"))\n            except BaseException as e:\n                sys.stderr.write(\n                    \"Exception while OCR the PDF {} - {}\\n\".format(filename, e))\n        \n            parameters['enhance_pdf_ocr'] = ocr_txt\n\n            # create text field ocr_t with all OCR results of all pages\n            pages_content = [value for (key, value) in sorted(ocr_txt.items())]\n            data['ocr_t'] = \"\\n\".join(pages_content)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_pdf_page.py",
    "content": "import os\nimport sys\nimport subprocess\nimport tempfile\nimport hashlib\n\nimport etl_plugin_core\nfrom etl import ETL\n\n#\n# by split to pages (so we have links to pages instead of documents) and get text from OCR from previous running plugin enhance_pdf_ocr and run plugins for splitting results into paragraphs and sentences\n#\n\n\nclass enhance_pdf_page(etl_plugin_core.Plugin):\n\n    # process plugin, if one of the filters matches\n    filter_filename_suffixes = ['.pdf']\n    filter_mimetype_prefixes = ['application/pdf']\n\n    # how to find uris which are not enriched yet?\n    # (if not enhanced on indexing but later)\n\n    # this plugin needs to read the field id as a parameters to enrich unenriched docs\n    fields = ['id', 'content_type']\n\n    # query to find documents, that were not enriched by this plugin yet\n    # (since we marked documents which were OCRd with ocr_b = true\n    query = \"content_type: application\\/pdf* AND NOT enhance_pdf_page_b:true\"\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        # no further processing, if plugin filters like for content type do not match\n        if self.filter(parameters, data):\n            return parameters, data\n\n        if verbose:\n            print('Mimetype or filename suffix is PDF, extracting single pages for segmentation')\n\n        if 'id' in data:\n            docid = data['id']\n        else:\n            docid = parameters['id']\n\n        filename = parameters['filename']\n\n        # defaults, if pdfinfo will not detect them\n        pages = 1\n        title = 'No title'\n        author = None\n\n        # get pagecount with pdfinfo command line tool\n        pdfinfo = subprocess.check_output(\n            ['pdfinfo', '-enc', 'UTF-8', filename])\n\n        # decode\n        pdfinfo = pdfinfo.decode(encoding='UTF-8')\n\n        # get the count of pages from pdfinfo result\n        # its a text with a line per parameter\n        for line in pdfinfo.splitlines():\n            line = line.strip()\n            # we want only the line with the pagecount\n            if line.startswith('Pages:'):\n                pages = int(line.split()[1])\n\n            if line.startswith('Title:'):\n                title = line.replace(\"Title:\", '', 1)\n                title = title.strip()\n\n            if line.startswith('Author:'):\n                author = line.replace(\"Author:\", '', 1)\n                author = author.strip()\n\n        etl = ETL()\n\n        # export and index each page\n        for pagenumber in range(1, pages + 1):\n\n            if verbose:\n                print(\"Extracting PDF page {} of {}\".format(pagenumber, pages))\n            # generate temporary filename\n            md5hash = hashlib.md5(filename.encode('utf-8')).hexdigest()\n            temp_filename = tempfile.gettempdir() + os.path.sep + \\\n                \"opensemanticetl_pdftotext_\" + md5hash + \"_\" + str(pagenumber)\n\n            # call pdftotext to write the text of page into tempfile\n            try:\n                result = subprocess.check_call(['pdftotext', '-enc', 'UTF-8', '-f', str(\n                    pagenumber), '-l', str(pagenumber), filename, temp_filename])\n            except BaseException as e:\n                sys.stderr.write(\n        
            \"Exception extracting text from PDF page {}: {}\\n\".format(pagenumber, e))\n\n            # read text from tempfile\n            f = open(temp_filename, \"r\", encoding=\"utf-8\")\n            text = f.read()\n            os.remove(temp_filename)\n\n            partdocid = docid + '#page=' + str(pagenumber)\n\n            partparameters = parameters.copy()\n            partparameters['plugins'] = ['enhance_path', 'enhance_detect_language_tika_server',\n                                         'enhance_entity_linking', 'enhance_multilingual']\n\n            if 'enhance_ner_spacy' in parameters['plugins']:\n                partparameters['plugins'].append('enhance_ner_spacy')\n            if 'enhance_ner_stanford' in parameters['plugins']:\n                partparameters['plugins'].append('enhance_ner_stanford')\n\n            pagedata = {}\n            pagedata['id'] = partdocid\n\n            pagedata['page_i'] = pagenumber\n            pagedata['pages_i'] = pages\n            pagedata['container_s'] = docid\n            pagedata['title_txt'] = title\n\n            if author:\n                pagedata['author_ss'] = author\n\n            pagedata['content_type_group_ss'] = \"Page\"\n            pagedata['content_type_ss'] = \"PDF page\"\n            pagedata['content_txt'] = text\n\n            if verbose:\n                print(\"Indexing extracted page {}\".format(pagenumber))\n\n            # index page\n            try:\n                partparameters, pagedata = etl.process(\n                    partparameters, pagedata)\n\n            except BaseException as e:\n                sys.stderr.write(\n                    \"Exception adding PDF page {} : {}\".format(pagenumber, e))\n\n        data['pages_i'] = pages\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_pdf_page_preview.py",
    "content": "import sys\nimport subprocess\nfrom pathlib import Path\nimport hashlib\n\nimport etl_plugin_core\n\n\n# generate single page PDF for each page of the full PDF for preview so client has not to load full pdf for previewing a page\n\nclass enhance_pdf_page_preview(etl_plugin_core.Plugin):\n    \n    # process plugin, if one of the filters matches\n    filter_filename_suffixes = ['.pdf']\n    filter_mimetype_prefixes = ['application/pdf']\n\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        # no further processing, if plugin filters like for content type do not match\n        if self.filter(parameters, data):\n            return parameters, data\n\n        if verbose:\n            print('Mimetype or filename suffix is PDF, extracting single pages for preview')\n\n        if 'id' in data:\n            docid = data['id']\n        else:\n            docid = parameters['id']\n\n        filename = parameters['filename']\n\n        thumbnail_dir = '/var/opensemanticsearch/media/thumbnails'\n\n        # generate thumbnail directory\n        md5hash = hashlib.md5(docid.encode('utf-8')).hexdigest()\n\n        if not thumbnail_dir.endswith('/'):\n            thumbnail_dir += '/'\n\n        thumbnail_subdir = md5hash\n\n        Path(thumbnail_dir + thumbnail_subdir).mkdir(parents=True, exist_ok=True)\n\n        if verbose:\n            print(\"Generating single page PDF for previews from {} for {} to {}\".format(\n                filename, docid, thumbnail_dir + thumbnail_subdir))\n\n        # call pdftk burst\n        try:\n            result = subprocess.check_call(\n                ['pdftk', filename, 'burst', 'output', thumbnail_dir + thumbnail_subdir + '/%d.pdf'])\n            data['etl_thumbnails_s'] = thumbnail_subdir\n        except BaseException as e:\n            sys.stderr.write(\n                \"Exception while genarating single page PDFs by pdftk burst\\n\")\n\n        return parameters, data\n    "
  },
  {
    "path": "src/opensemanticetl/enhance_pst.py",
    "content": "import sys\nimport hashlib\nimport tempfile\nimport os\nimport shutil\nimport subprocess\n\nimport etl_plugin_core\nfrom etl_file import Connector_File\n\n#\n# Extract emails from Outlook PST file\n#\n\nclass enhance_pst(etl_plugin_core.Plugin):\n    # process plugin, if one of the filters matches\n    filter_filename_suffixes = ['.pst']\n    filter_mimetype_prefixes = ['application/vnd.ms-outlook-pst']\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        # no further processing, if plugin filters like for content type do not match\n        if self.filter(parameters, data):\n            return parameters, data\n\n        if verbose:\n            print(\"Mimetype or file ending seems Outlook PST file, starting extraction of emails\")\n\n\n        pstfilename = parameters['filename']\n\n        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs\n\n        if 'tmp' in parameters:\n            system_temp_dirname = parameters['tmp']\n            if not os.path.exists(system_temp_dirname):\n                os.mkdir(system_temp_dirname)\n        else:\n            system_temp_dirname = tempfile.gettempdir()\n\n        h = hashlib.md5(parameters['id'].encode('UTF-8'))\n        temp_dirname = system_temp_dirname + os.path.sep + \\\n            \"opensemanticetl_enhancer_pst_\" + \\\n            str(os.getpid()) + \"_\" + h.hexdigest()\n\n        if not os.path.exists(temp_dirname):\n            os.mkdir(temp_dirname)\n\n        # start external PST extractor / converter\n        result = subprocess.call(\n            ['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])\n\n        if not result == 0:\n            sys.stderr.write(\n                \"Error: readpst failed for {}\".format(pstfilename))\n\n        # prepare document processing\n        connector = Connector_File()\n        connector.verbose = verbose\n        connector.config = parameters.copy()\n\n        # only set container if not yet set by a ZIP or PST before (if this PST is inside another ZIP or PST)\n        if not 'container' in connector.config:\n            connector.config['container'] = pstfilename\n\n        for dirName, subdirList, fileList in os.walk(temp_dirname):\n\n            if verbose:\n                print('Scanning directory: %s' % dirName)\n\n            for fileName in fileList:\n                if verbose:\n                    print('Scanning file: %s' % fileName)\n\n                try:\n                    # replace temp dirname from indexed id\n                    contained_dirname = dirName.replace(temp_dirname, '', 1)\n\n                    # build a virtual filename pointing to original PST file\n\n                    if contained_dirname:\n                        contained_dirname = contained_dirname + os.path.sep\n                    else:\n                        contained_dirname = os.path.sep\n\n                    connector.config['id'] = parameters['id'] + \\\n                        contained_dirname + fileName\n\n                    contained_filename = dirName + os.path.sep + fileName\n\n                    # E-mails filenames are pure number\n                    # Attachment file names are number-filename\n                    # if temp_filename without 
- in filename, its a mail file\n                    # rename to suffix .eml so Tika will extract more metadata like from and to\n                    if not '-' in fileName:\n                        os.rename(contained_filename,\n                                  contained_filename + '.eml')\n                        contained_filename += '.eml'\n                        connector.config['id'] += '.eml'\n\n                    try:\n                        connector.index_file(filename=contained_filename)\n\n                    except KeyboardInterrupt:\n                        raise KeyboardInterrupt\n\n                    except BaseException as e:\n                        sys.stderr.write(\"Exception while indexing contained content {} from {} : {}\\n\".format(\n                            fileName, connector.config['container'], e.args[0]))\n\n                    os.remove(contained_filename)\n\n                except BaseException as e:\n                    sys.stderr.write(\n                        \"Exception while indexing file {} : {}\\n\".format(fileName, e.args[0]))\n\n        shutil.rmtree(temp_dirname)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_rdf.py",
    "content": "import sys\nimport logging\nimport rdflib\n\nimport etl_plugin_core\n\n# define used ontologies / standards / properties\nskos = rdflib.Namespace('http://www.w3.org/2004/02/skos/core#')\nowl = rdflib.Namespace('http://www.w3.org/2002/07/owl#')\n\nimport etl\nfrom etl import ETL\n\n\n# Import RDF graph file granular, not only as a whole single file:\n# for every entity (subject) own document with properties (predicates) as facets and its objects as values\n\nclass enhance_rdf(etl_plugin_core.Plugin):\n\n    def __init__(self, verbose=False):\n\n        self.verbose = verbose\n\n        self.labelProperties = (rdflib.term.URIRef(u'http://www.w3.org/2004/02/skos/core#prefLabel'), rdflib.term.URIRef(u'http://www.w3.org/2000/01/rdf-schema#label'),\n                                rdflib.term.URIRef(u'http://www.w3.org/2004/02/skos/core#altLabel'), rdflib.term.URIRef(u'http://www.w3.org/2004/02/skos/core#hiddenLabel'))\n\n\n    #\n    # get all labels, alternate labels / synonyms for the URI/subject, if not there, use subject (=URI) as default\n    #\n\n    def get_labels(self, subject):\n\n        labels = []\n\n        # append RDFS.label\n\n        # get all labels for this obj\n        for label in self.graph.objects(subject=subject, predicate=rdflib.RDFS.label):\n            labels.append(str(label))\n\n        #\n        # append SKOS labels\n        #\n\n        # append SKOS prefLabel\n        skos = rdflib.Namespace('http://www.w3.org/2004/02/skos/core#')\n        for label in self.graph.objects(subject=subject, predicate=skos['prefLabel']):\n            labels.append(str(label))\n\n        # append SKOS altLabels\n        for label in self.graph.objects(subject=subject, predicate=skos['altLabel']):\n            labels.append(str(label))\n\n        # append SKOS hiddenLabels\n        for label in self.graph.objects(subject=subject, predicate=skos['hiddenLabel']):\n            labels.append(str(label))\n\n        return labels\n\n    #\n    # Get indexable full text(s) / label(s) instead of URI references\n    #\n\n    def get_values(self, obj):\n\n        values = []\n\n        # since we want full text search we want not to use ID/URI but all labels for indexing\n        # if type not literal but URI reference, add label(s)\n\n        if type(obj) == rdflib.URIRef:\n\n            # get labels of this object, therefore it is the subject parameter for getlabels()\n            values = self.get_labels(subject=obj)\n\n            if not values:\n\n                if self.verbose:\n                    print(\"No label for this object, using URI {}\".format(obj))\n\n                values = str(obj)\n\n        elif type(obj) == rdflib.term.Literal:\n            values = str(obj)\n\n        # if no values or labels, use the object / URI\n        if not values:\n            if self.verbose:\n                print(\"No label or URI for this object, using object {}\".format(obj))\n                print(\"Data type of RDF object: {}\".format(type(obj)))\n\n            values = str(obj)\n\n        return values\n\n    # best/preferred label as title\n    def get_preferred_label(self, subject, lang='en'):\n\n        preferred_label = self.graph.preferredLabel(\n            subject=subject, lang=lang, labelProperties=self.labelProperties)\n\n        # if no label in preferred language, try with english, if not preferred lang is english yet)\n        if not preferred_label and not lang == 'en':\n\n            preferred_label = self.graph.preferredLabel(\n                
subject=subject, lang='en', labelProperties=self.labelProperties)\n\n        # use label from some other language\n        if not preferred_label:\n\n            preferred_label = self.graph.preferredLabel(\n                subject=subject, labelProperties=self.labelProperties)\n\n        # if no label, use URI\n        if preferred_label:\n            # since return is tuple with type and label take only the label\n            preferred_label = preferred_label[0][1]\n        else:\n            preferred_label = subject\n\n        return str(preferred_label)\n\n    #\n    # ETL knowledge graph to full text search index\n    #\n\n    # Index each entity / subject with all its properties/predicates as facets and objects (dereference URIs by their labels) as values\n\n    def etl_graph(self, parameters):\n\n        if self.verbose:\n            print(\"Graph has {} triples.\".format(len(self.graph)))\n\n        count_triple = 0\n        count_subjects = 0\n\n        part_parameters = {}\n        part_parameters['plugins'] = []\n        part_parameters['export'] = parameters['export']\n\n        property2facet = {}\n        if 'property2facet' in parameters:\n            property2facet = parameters['property2facet']\n\n        etl_processor = ETL()\n        etl_processor.verbose = self.verbose\n\n        class_properties = []\n        class_properties.append(rdflib.term.URIRef(\n            u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))\n        class_properties.append(rdflib.term.URIRef(\n            u'http://www.wikidata.org/prop/direct/P31'))\n        # since there can be multiple triples/values for same property in/from different graphs or graph describes existing other file/document,\n        # do not overwrite document but add value to existent document & values of the facet/field/property\n        part_parameters['add'] = True\n\n        # use SPARQL query with distinct to get subjects only once\n        res = self.graph.query(\n                \"\"\"SELECT DISTINCT ?subject\n\t\t\tWHERE {\n\t\t\t?subject ?predicate ?object .\n\t\t\t}\"\"\")\n\n        for row in res:\n\n            count_subjects += 1\n\n            if self.verbose:\n                print(\"Importing entity / subject {}\".format(count_subjects))\n\n            # get subject of the concept from first column\n            subj = row[0]\n\n            if self.verbose:\n                print(\"Processing RDF subject {}\".format(subj))\n\n            part_data = {}\n\n            part_data['content_type_group_ss'] = 'Knowledge graph'\n            # subject as URI/ID\n            part_parameters['id'] = str(subj)\n\n            preferred_label = self.get_preferred_label(subject=subj)\n            part_data['title_txt'] = preferred_label\n\n            count_subject_triple = 0\n\n            # get all triples for this subject\n            for pred, obj in self.graph.predicate_objects(subject=subj):\n\n                count_triple += 1\n                count_subject_triple += 1\n\n                if self.verbose:\n                    print(\"Importing subjects triple {}\".format(\n                        count_subject_triple))\n                    print(\"Predicate / property: {}\".format(pred))\n                    print(\"Object / value: {}\".format(obj))\n\n                try:\n\n                    # if class add preferredlabel of this entity to facet of its class (RDF rdf:type or Wikidata \"instance of\" (Property:P31)),\n                    # so its name (label) will be available in entities view and as filter for 
faceted search\n\n                    if pred in class_properties:\n                        class_facet = str(obj)\n                        # map class to facet, if mapping for class exist\n                        if class_facet in property2facet:\n                            class_facet = property2facet[class_facet]\n                            if class_facet in parameters['facets']:\n                                part_data['content_type_ss'] = 'Knowledge graph class {}'.format(\n                                    parameters['facets'][class_facet]['label'])\n                                etl.append(data=part_data, facet=class_facet, values=preferred_label)\n\n                    #\n                    # Predicate/property to facet/field\n                    #\n\n                    # set Solr datatype strings so facets not available yet in Solr schema can be inserted automatically (dynamic fields) with right datatype\n\n                    facet = str(pred) + '_ss'\n                    facet_uri = facet + '_uri_ss'\n                    facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'\n\n                    if self.verbose:\n                        print(\"Facet: {}\".format(facet))\n\n                    #\n                    # get values or labels of this object\n                    #\n\n                    values = self.get_values(obj=obj)\n                    if self.verbose:\n                        print(\"Values: {}\".format(values))\n\n                    # insert or append value (object of triple) to data\n                    etl.append(data=part_data, facet=facet, values=values)\n\n                    # if object is reference/URI append URI\n                    if type(obj) == rdflib.URIRef:\n\n                        uri = str(obj)\n\n                        etl.append(data=part_data, facet=facet_uri, values=uri)\n\n                        # append mixed field with preferred label and URI of the object for disambiguation of different Entities/IDs/URIs with same names/labels in faceted search\n                        preferredlabel_and_uri = \"{} <{}>\".format(\n                            self.get_preferred_label(subject=obj), str(obj))\n\n                    else:\n                        preferredlabel_and_uri = self.get_preferred_label(\n                            subject=obj)\n\n                    etl.append(\n                        data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)\n\n                except KeyboardInterrupt:\n                    raise KeyboardInterrupt\n\n                except BaseException as e:\n                    sys.stderr.write(\"Exception while triple {} of subject {}: {}\\n\".format(\n                        count_subject_triple, subj, e))\n\n            # index subject\n            etl_processor.process(part_parameters, part_data)\n\n    def etl_graph_file(self, docid, filename, parameters=None):\n        if parameters is None:\n            parameters = {}\n\n        self.graph = rdflib.Graph()\n        self.graph.parse(filename)\n\n        self.etl_graph(parameters=parameters)\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                self.verbose = True\n\n        # get parameters\n        docid = parameters['id']\n        filename = parameters['filename']\n\n        
mimetype = ''\n        if 'content_type_ss' in data:\n            mimetype = data['content_type_ss']\n        elif 'content_type_ss' in parameters:\n            mimetype = parameters['content_type_ss']\n\n        # if connector returns a list, use only first value (which is the only entry of the list)\n        if isinstance(mimetype, list):\n            mimetype = mimetype[0]\n\n        # todo: add other formats like turtle\n        # if mimetype is graph, call graph import\n        if mimetype.lower() == \"application/rdf+xml\":\n\n            self.etl_graph_file(docid, filename, parameters=parameters)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_rdf_annotations_by_http_request.py",
    "content": "import os\nimport sys\nimport hashlib\nimport urllib\nimport rdflib\nfrom rdflib import URIRef\n\n# Do templating of metaserver url for id\n\n\ndef metaserver_url(metaserver, docid):\n\n    metaurl = metaserver\n\n    metaurl = metaurl.replace('[uri]', urllib.parse.quote_plus(docid))\n\n    h = hashlib.md5(docid.encode(\"utf-8\"))\n    metaurl = metaurl.replace(\n        '[uri_md5]', urllib.parse.quote_plus(h.hexdigest()))\n\n    return metaurl\n\n\n# get the modification date of meta data\n# todo: check all metaservers, not only the last one and return latest date\n\ndef getmeta_modified(metaservers, docid, verbose=False):\n\n    if isinstance(metaservers, str):\n        metaserver = metaservers\n    else:\n        for server in metaservers:\n            metaserver = server\n\n    metaurl = metaserver_url(metaserver, docid)\n\n    moddate = False\n\n    if verbose:\n        print(\"Getting Meta from {}\".format(metaurl))\n\n    try:\n        g = rdflib.Graph()\n        result = g.parse(metaurl)\n\n        # if semantic mediawiki modification date field, take this as date\n\n        for subj, pred, obj in g.triples((None, URIRef(\"http://semantic-mediawiki.org/swivt/1.0#wikiPageModificationDate\"), None)):\n\n            # todo only if later than previos, if more than one (f.e. more than one metaserver)\n            moddate = str(obj)\n\n        if verbose:\n            print(\"Extracted modification date: {}\".format(moddate))\n\n        if verbose:\n            if not moddate:\n                print(\"No modification date for metadata\")\n\n    except BaseException as e:\n        sys.stderr.write(\n            \"Exception while getting metadata modification time: {}\\n\".format(e.args[0]))\n\n    return moddate\n\n\n# Get tagging and annotation from metadata server\ndef getmeta_rdf_from_server(metaserver, data, property2facet, docid, verbose=False):\n\n    moddate = False\n\n    metaurl = metaserver_url(metaserver, docid)\n\n    if verbose:\n        print(\"Getting Meta from {}\".format(metaurl))\n\n    g = rdflib.Graph()\n    result = g.parse(metaurl)\n\n    # Print infos\n    if verbose:\n        print(\"Meta graph has {} statements.\".format(len(g)))\n        for subj, pred, obj in g:\n\n            try:\n                print(\"{} : {}\".format(pred, obj.toPython))\n            except BaseException as e:\n                sys.stderr.write(\n                    \"Exception while printing triple: {}\\n\".format(e.args[0]))\n\n    # make solr iteral for each rdf tripple contained in configurated properties\n    for facet in property2facet:\n\n        # if this predicat is configured as facet, add literal with pred as facetname and object as value\n        try:\n            if verbose:\n                print('Checking Facet {}'.format(facet))\n\n            facetRef = URIRef(facet)\n\n            for subj, pred, obj in g.triples((None, facetRef, None)):\n                try:\n\n                    # add the facet with object as value\n                    solr_facet = property2facet[facet]\n\n                    if verbose:\n                        print(\"Adding Solr facet {} with the object {}\".format(\n                            solr_facet, obj))\n\n                    if solr_facet in data:\n                        data[solr_facet].append(obj.toPython())\n                    else:\n                        data[solr_facet] = [obj.toPython()]\n\n                except BaseException as e:\n                    sys.stderr.write(\n                        \"Exception while 
checking predicate {}{}\\n\".format(pred, e.args[0]))\n\n        except BaseException as e:\n            sys.stderr.write(\n                \"Exception while checking a part of metadata graph: {}\\n\".format(e.args[0]))\n\n    # if semantic mediawiki modification date field, take this as date\n    moddateRef = URIRef(\n        \"http://semantic-mediawiki.org/swivt/1.0#wikiPageModificationDate\")\n    if (None, moddateRef, None) in g:\n        for subj, pred, obj in g.triples((None, moddateRef, None)):\n            moddate = obj.toPython()\n\n        # todo: transform date format to date and in exporter date to Solr date string format\n        #data['meta_modified_dt'] = str(moddate)\n\n        if verbose:\n            print(\"Extracted modification date: {}\".format(moddate))\n\n    elif verbose:\n        print(\"No semantic mediawiki modification date\")\n\n    return data\n\n\n# Get tagging and annotation from metadata server\n\nclass enhance_rdf_annotations_by_http_request(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        # get parameters\n        docid = parameters['id']\n\n        metaserver = parameters['metaserver']\n        if os.getenv('OPEN_SEMANTIC_ETL_METADATA_SERVER'):\n            metaserver = os.getenv('OPEN_SEMANTIC_ETL_METADATA_SERVER')\n\n        property2facet = parameters['property2facet']\n\n        if isinstance(metaserver, str):\n            # get metadata\n            metaserver=[metaserver]\n \n        for server in metaserver:\n            # get and add metadata\n            data = getmeta_rdf_from_server(\n                metaserver=server, data=data, property2facet=property2facet, docid=docid, verbose=verbose)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_regex.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n\ndef regex2facet(data, text, regex, group, facet, verbose=False):\n\n    if verbose:\n        print(\"Checking regex {} for facet {}\".format(regex, facet))\n\n    matches = re.finditer(regex, text, re.IGNORECASE)\n\n    if matches:\n        for match in matches:\n\n            try:\n                value = match.group(group)\n                if verbose:\n                    print(\"Found regex {} with value {} for facet {}\".format(\n                        regex, value, facet))\n\n                etl_plugin_core.append(data, facet, value)\n\n            except BaseException as e:\n                print(\"Exception while adding value {} from regex {} and group {} to facet {}:\".format(\n                    value, regex, group, facet))\n                print(e.args[0])\n\n\n# opens a tab with regexes and facets\ndef readregexesfromfile(data, text, filename, verbose=False):\n    listfile = open(filename)\n\n    # search all the lines\n    for line in listfile:\n        try:\n            line = line.strip()\n\n            # ignore empty lines and comment lines (starting with #)\n            if line and not line.startswith(\"#\"):\n                facet = 'tag_ss'\n                columns = line.split(\"\\t\")\n\n                regex = columns[0]\n\n                if len(columns) > 1:\n                    facet = columns[1]\n\n                if len(columns) > 2:\n                    group = int(columns[2])\n                else:\n                    group = 0\n\n                regex2facet(data=data, text=text, regex=regex,\n                            group=group, facet=facet, verbose=verbose)\n\n        except BaseException as e:\n            print(\"Exception while checking line {} of regexlist {}:\".format(\n                line, filename))\n            print(e.args[0])\n\n    listfile.close()\n\n\n#\n# add to configured facet, if entry in list is in text\n#\n\nclass enhance_regex(object):\n    def process(self, parameters=None, data=None):\n\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        regexlists = {}\n\n        if 'regex_lists' in parameters:\n            regexlists = parameters['regex_lists']\n\n        # collect/copy to be analyzed text from all fields\n        text = etl_plugin_core.get_text(data=data)\n\n        for regexlistfile in regexlists:\n\n            try:\n\n                readregexesfromfile(data=data, text=text,\n                                    filename=regexlistfile, verbose=verbose)\n\n            except BaseException as e:\n                print(\"Exception while checking regex list {}:\".format(regexlistfile))\n                print(e.args[0])\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_sentence_segmentation.py",
    "content": "import json\nimport os\nimport requests\nimport sys\nimport time\n\nfrom etl import ETL\n\n#\n# split text to sentences\n#\n\n\nclass enhance_sentence_segmentation(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        if 'id' in data:\n            docid = data['id']\n        else:\n            docid = parameters['id']\n\n        # default classifier\n        classifier = 'en_core_web_sm'\n\n        if 'spacy_ner_classifier_default' in parameters:\n            classifier = parameters['spacy_ner_classifier_default']\n\n        # set language specific classifier, if configured and document language detected\n        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:\n            # is a language speciic cassifier there for the detected language?\n            if data['language_s'] in parameters['spacy_ner_classifiers']:\n                classifier = parameters['spacy_ner_classifiers'][data['language_s']]\n\n                analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']\n\n        text = ''\n        for field in analyse_fields:\n            if field in data:\n                text = \"{}{}\\n\".format(text, data[field])\n\n        # extract sentences from text\n        url = \"http://localhost:8080/sents\"\n        if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):\n            url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/sents'\n\n        headers = {'content-type': 'application/json'}\n        d = {'text': text, 'model': classifier}\n\n        retries = 0\n        retrytime = 1\n        # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry\n        retrytime_max = 120\n        no_connection = True\n\n        while no_connection:\n            try:\n                if retries > 0:\n                    print(\n                        'Retrying to connect to Spacy services in {} second(s).'.format(retrytime))\n                    time.sleep(retrytime)\n                    retrytime = retrytime * 2\n                    if retrytime > retrytime_max:\n                        retrytime = retrytime_max\n\n                response = requests.post(url, data=json.dumps(d), headers=headers)\n\n                # if bad status code, raise exception\n                response.raise_for_status()\n\n                no_connection = False\n\n            except requests.exceptions.ConnectionError as e:\n                retries += 1\n                sys.stderr.write(\n                    \"Connection to Spacy services (will retry in {} seconds) failed. 
Exception: {}\\n\".format(retrytime, e))\n\n        sentences = response.json()\n\n        etl = ETL()\n\n        sentencenumber = 0\n\n        for sentence in sentences:\n\n            sentencenumber += 1\n\n            partdocid = docid + '#sentence' + str(sentencenumber)\n\n            partparameters = parameters.copy()\n            partparameters['plugins'] = ['enhance_path', 'enhance_detect_language_tika_server',\n                                         'enhance_entity_linking', 'enhance_multilingual']\n\n            if 'enhance_ner_spacy' in parameters['plugins']:\n                partparameters['plugins'].append('enhance_ner_spacy')\n            if 'enhance_ner_stanford' in parameters['plugins']:\n                partparameters['plugins'].append('enhance_ner_stanford')\n\n            sentencedata = {}\n            sentencedata['id'] = partdocid\n\n            sentencedata['container_s'] = docid\n\n            if 'author_ss' in data:\n                sentencedata['author_ss'] = data['author_ss']\n\n            sentencedata['content_type_group_ss'] = \"Sentence\"\n            sentencedata['content_type_ss'] = \"Sentence\"\n            sentencedata['content_txt'] = sentence\n\n            # index sentence\n            try:\n                partparameters, sentencedata = etl.process(\n                    partparameters, sentencedata)\n\n            except BaseException as e:\n                sys.stderr.write(\n                    \"Exception adding sentence {} : {}\".format(sentencenumber, e))\n\n        data['sentences_i'] = sentencenumber\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_warc.py",
    "content": "import hashlib\nimport tempfile\nimport os\nimport sys\nimport shutil\nimport time\n\nfrom warcio.archiveiterator import ArchiveIterator\n\nimport etl_plugin_core\nfrom etl_file import Connector_File\n\n\nclass enhance_warc(etl_plugin_core.Plugin):\n\n    # process plugin, if one of the filters matches\n    filter_filename_suffixes = ['.warc', '.warc.gz']\n    filter_mimetype_prefixes = ['application/warc']\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        # no further processing, if plugin filters like for content type do not match\n        if self.filter(parameters, data):\n            return parameters, data\n\n        warcfilename = parameters['filename']\n\n        # create temp dir where to unwarc the archive\n        if 'tmp' in parameters:\n            system_temp_dirname = parameters['tmp']\n            if not os.path.exists(system_temp_dirname):\n                os.mkdir(system_temp_dirname)\n        else:\n            system_temp_dirname = tempfile.gettempdir()\n\n        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs\n        h = hashlib.md5(parameters['id'].encode('UTF-8'))\n        temp_dirname = system_temp_dirname + os.path.sep + \\\n            \"opensemanticetl_enhancer_warc_\" + h.hexdigest()\n\n        if os.path.exists(temp_dirname) == False:\n            os.mkdir(temp_dirname)\n\n        # prepare document processing\n        connector = Connector_File()\n        connector.verbose = verbose\n        connector.config = parameters.copy()\n\n        # only set container if not yet set by a zip before (if this zip is inside another zip)\n        if not 'container' in connector.config:\n            connector.config['container'] = warcfilename\n\n        i = 0\n\n        with open(warcfilename, 'rb') as stream:\n            for record in ArchiveIterator(stream):\n                i += 1\n\n                if record.rec_type == 'response':\n\n                    print(record.rec_headers)\n\n                    # write WARC record content to tempfile\n                    tempfilename = temp_dirname + \\\n                        os.path.sep + 'warcrecord' + str(i)\n                    tmpfile = open(tempfilename, 'wb')\n                    tmpfile.write(record.content_stream().read())\n                    tmpfile.close()\n\n                    # set last modification time of the file to WARC-Date\n                    try:\n                        last_modified = time.mktime(time.strptime(\n                            record.rec_headers.get_header('WARC-Date'), '%Y-%m-%dT%H:%M:%SZ'))\n                        os.utime(tempfilename, (last_modified, last_modified))\n                    except BaseException as e:\n                        sys.stderr.write(\"Exception while reading filedate to warc content {} from {} : {}\\n\".format(\n                            tempfilename, connector.config['container'], e))\n\n                    # set id (URL and WARC Record ID)\n                    uri = record.rec_headers.get_header('WARC-Target-URI')\n                    if not uri.endswith('/'):\n                        uri += '/'\n                    connector.config['id'] = uri + record.rec_headers.get_header('WARC-Record-ID')\n\n                    
# index the extracted file\n                    try:\n\n                        connector.index_file(filename=tempfilename)\n\n                    except KeyboardInterrupt:\n                        raise KeyboardInterrupt\n\n                    except BaseException as e:\n                        sys.stderr.write(\"Exception while indexing warc content {} from {} : {}\\n\".format(\n                            tempfilename, connector.config['container'], e))\n\n                    os.remove(tempfilename)\n\n        shutil.rmtree(temp_dirname)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_xml.py",
    "content": "import xml.etree.ElementTree as ElementTree\nimport os.path\nimport sys\n\n\nclass enhance_xml(object):\n\n    def elements2data(self, element, data, path=\"xml\"):\n\n        path += \"/\" + element.tag\n\n        fieldname = path + '_ss'\n\n        text = element.text.strip()\n\n        if text:\n            if fieldname in data:\n                data[fieldname].append(text)\n            else:\n                data[fieldname] = [text]\n\n        for child in element:\n            data = self.elements2data(element=child, path=path, data=data)\n\n        return data\n\n    # get xml filename by mapping configuration\n    def get_xml_filename(self, filename, mapping):\n\n        dirname = os.path.dirname(filename)\n        basename = os.path.basename(filename)\n\n        xmlfilename = mapping\n\n        xmlfilename = xmlfilename.replace('%DIRNAME%', dirname)\n        xmlfilename = xmlfilename.replace('%BASENAME%', dirname)\n\n        if not os.path.isfile(xmlfilename):\n            xmlfilename = False\n\n        return xmlfilename\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = parameters['filename']\n\n        mapping = parameters['xml_sidecar_file_mapping']\n\n        #\n        # is there a xml sidecar file?\n        #\n\n        xmlfilename = self.get_xml_filename(filename, mapping)\n\n        if verbose:\n\n            if xmlfilename:\n\n                print('XML sidecar file: {}'.format(xmlfilename))\n\n            else:\n                print(\"No xml sidecar file\")\n\n        #\n        # read meta data from the XML sidecar file\n        #\n        if xmlfilename:\n\n            if verbose:\n                print(\"Reading XML sidecar file: {}\".format(xmlfilename))\n            try:\n\n                # Parse the XML file\n                parser = ElementTree.XMLParser()\n                et = ElementTree.parse(xmlfilename, parser)\n                root = et.getroot()\n\n                for child in root:\n                    self.elements2data(element=child, path=root.tag, data=data)\n\n            except BaseException as e:\n                sys.stderr.write(\n                    \"Exception while parsing XML {} {}\".format(xmlfilename, e))\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_xmp.py",
    "content": "import xml.etree.ElementTree as ElementTree\nimport os.path\nimport sys\n\n\n#\n# is there a xmp sidecar file?\n#\n\ndef get_xmp_filename(filename):\n\n    xmpfilename = False\n\n    # some xmp sidecar filenames are based on the original filename without extensions like .jpg or .jpeg\n    filenamewithoutextension = '.' . join(filename.split('.')[:-1])\n\n    # check if a xmp sidecar file exists\n    if os.path.isfile(filename + \".xmp\"):\n        xmpfilename = filename + \".xmp\"\n    elif os.path.isfile(filename + \".XMP\"):\n        xmpfilename = filename + \".XMP\"\n    elif os.path.isfile(filenamewithoutextension + \".xmp\"):\n        xmpfilename = filenamewithoutextension + \".xmp\"\n    elif os.path.isfile(filenamewithoutextension + \".XMP\"):\n        xmpfilename = filenamewithoutextension + \".XMP\"\n\n    return xmpfilename\n\n\n# Build path facets from filename\n\nclass enhance_xmp(object):\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = parameters['filename']\n\n        #\n        # is there a xmp sidecar file?\n        #\n        xmpfilename = get_xmp_filename(filename)\n\n        if not xmpfilename:\n            if verbose:\n                print(\"No xmp sidecar file\")\n\n        #\n        # read meta data of the xmp sidecar file (= xml + rdf)\n        #\n        if xmpfilename:\n\n            creator = False\n            headline = False\n            creator = False\n            location = False\n            tags = []\n\n            if verbose:\n                print(\"Reading xmp sidecar file {}\".format(xmpfilename))\n            try:\n\n                # Parse the xmp file with utf 8 encoding\n                parser = ElementTree.XMLParser(encoding=\"utf-8\")\n                et = ElementTree.parse(xmpfilename, parser)\n                root = et.getroot()\n\n                # get author\n                try:\n                    creator = root.findtext(\n                        \".//{http://purl.org/dc/elements/1.1/}creator\")\n\n                    if creator:\n                        data['author_ss'] = creator\n\n                except BaseException as e:\n                    sys.stderr.write(\"Exception while parsing creator from xmp {} {}\".format(\n                        xmpfilename, e.args[0]))\n\n                # get headline\n                try:\n                    headline = root.findtext(\n                        \".//{http://ns.adobe.com/photoshop/1.0/}Headline\")\n\n                    if headline:\n                        data['title_txt'] = headline\n\n                except BaseException as e:\n                    sys.stderr.write(\"Exception while parsing headline from xmp {} {}\".format(\n                        xmpfilename, e.args[0]))\n\n                # get location\n                try:\n                    location = root.findtext(\n                        \".//{http://iptc.org/std/Iptc4xmpCore/1.0/xmlns/}Location\")\n\n                    if location:\n\n                        if 'locations_ss' in data:\n                            data['locations_ss'].append(location)\n                        else:\n                            data['locations_ss'] = [location]\n\n                except BaseException as e:\n                    sys.stderr.write(\"Exception 
while parsing location from xmp {} {}\".format(\n                        xmpfilename, e.args[0]))\n\n                # get tags (named \"subject\")\n                try:\n                    for tag in root.findall(\".//{http://purl.org/dc/elements/1.1/}subject/{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Bag/{http://www.w3.org/1999/02/22-rdf-syntax-ns#}li\"):\n                        try:\n                            if 'tag_ss' in data:\n                                data['tag_ss'].append(tag.text)\n                            else:\n                                data['tag_ss'] = [tag.text]\n\n                        except BaseException as e:\n                            sys.stderr.write(\"Exception while parsing a tag from xmp {} {}\".format(\n                                xmpfilename, e.args[0]))\n                except BaseException as e:\n                    sys.stderr.write(\"Exception while parsing tags from xmp {} {}\".format(\n                        xmpfilename, e.args[0]))\n\n            except BaseException as e:\n                sys.stderr.write(\"Exception while parsing xmp {} {}\".format(\n                    xmpfilename, e.args[0]))\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/enhance_zip.py",
    "content": "import zipfile\nimport sys\nimport hashlib\nimport tempfile\nimport os\nimport shutil\nfrom etl_file import Connector_File\n\n\nclass enhance_zip(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = parameters['filename']\n\n        # if the processed file was extracted from a zip (parameter container was set), write container setting in data, so the link of the id/content can be set to the zip file\n        if 'container' in parameters:\n            if not 'container_s' in data:\n                data['container_s'] = parameters['container']\n\n        # if this file is a zip file, unzip it\n        if zipfile.is_zipfile(filename):\n            self.unzip_and_index_files(\n                zipfilename=filename, parameters=parameters, verbose=verbose)\n\n        return parameters, data\n\n    # unzip all content and index each file with literal filename of the zip file in field container\n    def unzip_and_index_files(self, zipfilename, parameters=None, verbose=False):\n        if parameters is None:\n            parameters = {}\n\n        # create temp dir where to unzip the archive\n\n        if 'tmp' in parameters:\n            system_temp_dirname = parameters['tmp']\n            if not os.path.exists(system_temp_dirname):\n                os.mkdir(system_temp_dirname)\n        else:\n            system_temp_dirname = tempfile.gettempdir()\n\n        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs\n        h = hashlib.md5(parameters['id'].encode('UTF-8'))\n        temp_dirname = system_temp_dirname + os.path.sep + \\\n            \"opensemanticetl_enhancer_zip_\" + h.hexdigest()\n\n        if os.path.exists(temp_dirname) == False:\n            os.mkdir(temp_dirname)\n\n        # unzip the files\n        my_zip = zipfile.ZipFile(zipfilename)\n        my_zip.extractall(temp_dirname)\n        my_zip.close()\n\n        # prepare document processing\n        connector = Connector_File()\n        connector.verbose = verbose\n        connector.config = parameters.copy()\n\n        # only set container if not yet set by a zip before (if this zip is inside another zip)\n        if not 'container' in connector.config:\n            connector.config['container'] = zipfilename\n\n        # walk trough all unzipped directories / files and index all files\n        for dirName, subdirList, fileList in os.walk(temp_dirname):\n\n            if verbose:\n                print('Scanning directory: %s' % dirName)\n\n            for fileName in fileList:\n                if verbose:\n                    print('Scanning file: %s' % fileName)\n\n                try:\n                    # replace temp dirname from indexed id\n                    zipped_dirname = dirName.replace(temp_dirname, '', 1)\n\n                    # build a virtual filename pointing to original zip file\n\n                    if zipped_dirname:\n                        zipped_dirname = zipped_dirname + os.path.sep\n                    else:\n                        zipped_dirname = os.path.sep\n\n                    connector.config['id'] = parameters['id'] + \\\n                        zipped_dirname + fileName\n\n                    unziped_filename = dirName + os.path.sep + 
fileName\n\n                    try:\n\n                        connector.index_file(filename=unziped_filename)\n\n                    except KeyboardInterrupt:\n                        raise KeyboardInterrupt\n\n                    except BaseException as e:\n                        sys.stderr.write(\"Exception while indexing zipped content {} from {} : {}\\n\".format(\n                            fileName, connector.config['container'], e))\n\n                    os.remove(unziped_filename)\n\n                except BaseException as e:\n                    sys.stderr.write(\n                        \"Exception while indexing file {} : {}\\n\".format(fileName, e))\n\n        shutil.rmtree(temp_dirname)\n"
  },
  {
    "path": "src/opensemanticetl/etl.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport datetime\nimport importlib\nimport os\nimport sys\n\nimport filter_blacklist\n\n#\n# Extract Transform Load (ETL):\n#\n\n# Runs the configured plugins with parameters from configs it reads\n#\n# Then exports data like content, data enrichment or analytics results generated by the plugins\n# to index or database\n\n\nclass ETL(object):\n\n    def __init__(self, plugins=(), verbose=False):\n\n        self.verbose = verbose\n\n        self.config = {}\n\n        self.config['plugins'] = list(plugins)\n\n        self.set_configdefaults()\n\n    def set_configdefaults(self):\n\n        #\n        # Standard config\n        #\n        # Do not edit config here! Overwrite options in /etc/opensemanticsearch/etl or connector configs\n        #\n\n        self.config['plugins'] = ['enhance_extract_text_tika_server',\n                                  'enhance_detect_language_tika_server']\n        self.config['export'] = 'export_solr'\n        self.config['regex_lists'] = []\n\n        self.config['raise_pluginexception'] = False\n\n    def init_exporter(self):\n\n        exporter = self.config['export']\n\n        module = importlib.import_module(exporter)\n        objectreference = getattr(module, exporter)\n        self.exporter = objectreference(self.config)\n\n    def read_configfile(self, configfile):\n        result = False\n\n        if os.path.isfile(configfile):\n            config = self.config\n            file = open(configfile, \"r\")\n            exec(file.read(), locals())\n            file.close()\n            self.config = config\n\n            result = True\n\n        # if another exporter\n        self.init_exporter()\n\n    def is_plugin_blacklisted_for_contenttype(self, plugin, parameters, data):\n\n        blacklisted = False\n\n        # is there a content type yet?\n        if 'content_type_ss' in data:\n            content_types = data['content_type_ss']\n        elif 'content_type_ss' in parameters:\n            content_types = parameters['content_type_ss']\n        else:\n            content_types = None\n\n        # if content type check the plugins' blacklists\n        if content_types:\n\n            if not isinstance(content_types, list):\n                content_types = [content_types]\n\n            for content_type in content_types:\n                # Do not try to blacklist by content type if none was determined.\n                if not content_type:\n                    continue\n\n                # directory where the plugins' blacklist are\n                blacklistdir = '/etc/opensemanticsearch/blacklist/' + plugin + '/'\n\n                filename = blacklistdir + 'blacklist-contenttype'\n                if os.path.isfile(filename):\n                    if filter_blacklist.is_in_list(filename=filename, value=content_type):\n                        blacklisted = True\n\n                if not blacklisted:\n                    filename = blacklistdir + 'blacklist-contenttype-prefix'\n                    if os.path.isfile(filename):\n                        if filter_blacklist.is_in_list(filename=filename, value=content_type, match=\"prefix\"):\n                            blacklisted = True\n\n                if not blacklisted:\n                    filename = blacklistdir + 'blacklist-contenttype-suffix'\n                    if os.path.isfile(filename):\n                        if filter_blacklist.is_in_list(filename=filename, value=content_type, match=\"suffix\"):\n                          
  blacklisted = True\n\n                if not blacklisted:\n                    filename = blacklistdir + 'blacklist-contenttype-regex'\n                    if os.path.isfile(filename):\n                        if filter_blacklist.is_in_list(filename=filename, value=content_type, match=\"regex\"):\n                            blacklisted = True\n\n                # check whitelists for plugin, if blacklisted but should not\n                if blacklisted:\n                    filename = blacklistdir + 'whitelist-contenttype'\n                    if os.path.isfile(filename):\n                        if filter_blacklist.is_in_list(filename=filename, value=content_type):\n                            blacklisted = False\n\n                if blacklisted:\n                    filename = blacklistdir + 'whitelist-contenttype-prefix'\n                    if os.path.isfile(filename):\n                        if filter_blacklist.is_in_list(filename=filename, value=content_type, match=\"prefix\"):\n                            blacklisted = False\n\n                if blacklisted:\n                    filename = blacklistdir + 'whitelist-contenttype-suffix'\n                    if os.path.isfile(filename):\n                        if filter_blacklist.is_in_list(filename=filename, value=content_type, match=\"suffix\"):\n                            blacklisted = False\n\n                if blacklisted:\n                    filename = blacklistdir + 'whitelist-contenttype-regex'\n                    if os.path.isfile(filename):\n                        if filter_blacklist.is_in_list(filename=filename, value=content_type, match=\"regex\"):\n                            blacklisted = False\n\n        return blacklisted\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n        \n        time_start = datetime.datetime.now()\n\n        if 'plugins' in parameters:\n            plugins = sort_plugins(parameters['plugins'])\n        else:\n            plugins = sort_plugins(self.config['plugins'])\n\n        data['etl_error_plugins_ss'] = []\n        data['etl_error_txt'] = []\n\n        for plugin in plugins:\n\n            data['etl_error_' + plugin + '_txt'] = []\n\n            # if content_type / plugin combination blacklisted, continue with next plugin\n            if self.is_plugin_blacklisted_for_contenttype(plugin, parameters, data):\n\n                if self.verbose:\n                    print(\n                        \"Not starting plugin {} because this plugin is blacklisted for the contenttype\".format(plugin))\n\n                # mark plugin as blacklisted\n                data['etl_' + plugin + '_blacklisted_b'] = True\n\n                continue\n\n            # start plugin\n            if self.verbose:\n                print(\"Starting plugin {}\".format(plugin))\n\n            time_plugin_start = datetime.datetime.now()\n            try:\n                module = importlib.import_module(plugin)\n\n                objectreference = getattr(module, plugin, False)\n\n                # if object oriented programming, run instance of object and call its \"process\" function\n                if objectreference:\n                    enhancer = objectreference()\n\n                    parameters, data = enhancer.process(\n                        parameters=parameters, data=data)\n\n                else:  # else call \"process\"-function\n                    functionreference = 
getattr(module, 'process', False)\n\n                    if functionreference:\n\n                        parameters, data = functionreference(parameters, data)\n\n                    else:\n                        sys.stderr.write(\n                            \"Exception while data enrichment with plugin {}: Module implements neither object \\\"{}\\\" nor function \\\"process\\\"\\n\".format(plugin, plugin))\n\n            # if exception because user interrupted processing by keyboard, respect this and abort\n            except KeyboardInterrupt:\n                raise KeyboardInterrupt\n\n            # else don't break because of a failing plugin\n            # (maybe other plugins or data extraction will succeed),\n            # only write an error message\n            except BaseException as e:\n\n                error_message(\n                    docid=parameters['id'], data=data, plugin=plugin, e=e)\n\n                if self.config['raise_pluginexception']:\n                    raise\n\n            time_plugin_end = datetime.datetime.now()\n            time_plugin_delta = time_plugin_end - time_plugin_start\n            data['etl_' + plugin + '_time_millis_i'] = int(time_plugin_delta.total_seconds() * 1000)\n\n            # mark plugin as run\n            data['etl_' + plugin + '_b'] = True\n\n            # Abort plugin chain if plugin set parameters['break'] to True\n            # (used for example by blacklist or exclusion plugins)\n            abort = parameters.get('break', False)\n\n            if abort:\n                break\n\n        time_end = datetime.datetime.now()\n        time_delta = time_end - time_start\n        data['etl_time_millis_i'] = int(time_delta.total_seconds() * 1000)\n\n        # if processing aborted (f.e. by blacklist filter or file modification time did not change)\n        abort = parameters.get('break', False)\n        if not abort:\n\n            if 'export' in parameters:\n                exporter = parameters['export']\n            else:\n                exporter = self.config['export']\n\n            if exporter:\n                # export results (data) to db/storage/index\n                module = importlib.import_module(exporter)\n                objectreference = getattr(module, exporter)\n                self.exporter = objectreference(self.config)\n\n                try:\n\n                    parameters, data = self.exporter.process(\n                        parameters=parameters, data=data)\n\n                # if exception because user interrupted processing by keyboard, respect this and abort\n                except KeyboardInterrupt:\n                    raise KeyboardInterrupt\n                except BaseException as e:\n                    sys.stderr.write(\n                        \"Error while exporting to index or database: {}\\n\".format(parameters['id']))\n                    raise e\n\n        return parameters, data\n\n    def commit(self):\n\n        if self.verbose:\n            print(\"Committing cached or open transactions to index\")\n\n        self.exporter.commit()\n\n\n# append values (i.e. 
from an enhancer) to data structure\ndef append(data, facet, values):\n\n    # if facet there yet, append/extend the values, else set values to facet\n    if facet in data:\n\n        # if new value(s) single value instead of list convert to list\n        if not isinstance(values, list):\n            values = [values]\n\n        # if facet in data single value instead of list convert to list\n        if not isinstance(data[facet], list):\n            data[facet] = [data[facet]]\n\n        # add new values to this list\n        data[facet].extend(values)\n\n        # dedupe data in facet\n        data[facet] = list(set(data[facet]))\n\n        # if only one value, it has not to be a list\n        if len(data[facet]) == 1:\n            data[facet] = data[facet][0]\n\n    else:\n        data[facet] = values\n\n\n# Append errors to data/index and print error message\n# so we have a log and can see something went wrong within search engine and / or filter for that\n\ndef error_message(docid, data, plugin, e):\n\n    try:\n\n        errormessage = \"{}\".format(e)\n\n        # add error status and message to data to be indexed\n\n        if 'etl_error_txt' in data:\n            data['etl_error_txt'].append(errormessage)\n        else:\n            data['etl_error_txt'] = [errormessage]\n\n        if 'etl_error_plugins_ss' in data:\n            data['etl_error_plugins_ss'].append(plugin)\n        else:\n            data['etl_error_plugins_ss'] = [plugin]\n\n        data['etl_error_' + plugin + '_txt'] = errormessage\n\n        sys.stderr.write(\n            \"Exception while data enrichment of {} with plugin {}: {}\\n\".format(docid, plugin, e))\n\n    except:\n\n        sys.stderr.write(\n            \"Exception while generating error message for exception while processing plugin {} for file {}\\n\".format(\n                plugin, docid))\n\n\n#\n# sort added plugins because of dependencies\n#\n\ndef sort_plugins(plugins):\n\n    # OCR has to be done before language detection, since content maybe only scanned text within images\n    if \"enhance_detect_language_tika_server\" in plugins and \"enhance_pdf_ocr\" in plugins:\n        if plugins.index(\"enhance_pdf_ocr\") > plugins.index(\"enhance_detect_language_tika_server\"):\n            # remove after\n            plugins.remove(\"enhance_pdf_ocr\")\n            # add before\n            plugins.insert(plugins.index(\n                \"enhance_detect_language_tika_server\"), \"enhance_pdf_ocr\")\n\n    # manual annotations should be found by fulltext search too\n    # (automatic entity linking does by including the text or synonym)\n    # so read before generating the default search fields like _text_ or text_txt_languageX by enhance_multilingual\n    if \"enhance_rdf_annotations_by_http_request\" in plugins and \"enhance_multilingual\" in plugins:\n        if plugins.index(\"enhance_rdf_annotations_by_http_request\") > plugins.index(\"enhance_multilingual\"):\n            # remove after\n            plugins.remove(\n                \"enhance_rdf_annotations_by_http_request\")\n            # add before\n            plugins.insert(plugins.index(\n                \"enhance_multilingual\"), \"enhance_rdf_annotations_by_http_request\")\n\n    if \"enhance_annotations\" in plugins and \"enhance_multilingual\" in plugins:\n        if plugins.index(\"enhance_annotations\") > plugins.index(\"enhance_multilingual\"):\n            # remove after\n            plugins.remove(\n                \"enhance_annotations\")\n            # add before\n            
plugins.insert(plugins.index(\n                \"enhance_multilingual\"), \"enhance_annotations\")\n\n    return plugins\n"
  },
  {
    "path": "src/opensemanticetl/etl_delete.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport importlib\n\nfrom etl import ETL\nimport enhance_mapping_id\n\nclass Delete(ETL):\n    def __init__(self, verbose=False, quiet=True):\n\n        ETL.__init__(self, verbose=verbose)\n\n        self.quiet = quiet\n\n        self.set_configdefaults()\n\n        self.read_configfiles()\n\n        # read on what DB or search server software our index is\n        export = self.config['export']\n\n        # call delete function of the configured exporter\n        module = importlib.import_module(export)\n        objectreference = getattr(module, export)\n        self.connector = objectreference()\n\n    def set_configdefaults(self):\n        #\n        # Standard config\n        #\n        # Do not edit config here! Overwrite options in /etc/etl/ or /etc/opensemanticsearch/connector-files\n        #\n\n        ETL.set_configdefaults(self)\n\n        self.config['force'] = False\n\n    def read_configfiles(self):\n        #\n        # include configs\n        #\n\n        # Windows style filenames\n        self.read_configfile('conf\\\\opensemanticsearch-etl')\n        self.read_configfile('conf\\\\opensemanticsearch-connector-files')\n\n        # Linux style filenames\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/connector-files')\n\n    def delete(self, uri):\n\n        if 'mappings' in self.config:\n            uri = enhance_mapping_id.mapping(value=uri, mappings=self.config['mappings'])\n        \n        if self.verbose:\n            print(\"Deleting from index {}\".format(uri))\n\n        self.connector.delete(parameters=self.config, docid=uri)\n\n    def empty(self):\n\n        if self.verbose:\n            print(\"Deleting all documents from index\")\n\n        self.connector.delete(parameters=self.config, query=\"*:*\")\n\n\n#\n# Read command line arguments and start\n#\n\n# if running (not imported to use its functions), run main function\nif __name__ == \"__main__\":\n\n    from optparse import OptionParser\n\n    # get uri or filename from args\n\n    parser = OptionParser(\"etl-delete [options] URI(s)\")\n    parser.add_option(\"-e\", \"--empty\", dest=\"empty\", action=\"store_true\",\n                      default=False,\n                      help=\"Empty the index (delete all documents in index)\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\", action=\"store_true\",\n                      default=None, help=\"Print debug messages\")\n    parser.add_option(\"-c\", \"--config\", dest=\"config\", default=False,\n                      help=\"Config file\")\n\n    (options, args) = parser.parse_args()\n\n    if not options.empty and len(args) < 1:\n        parser.error(\"No URI given\")\n\n    connector = Delete()\n\n    connector.read_configfile('/etc/etl/config')\n\n    # add optional config parameters\n    if options.config:\n        connector.read_configfile(options.config)\n\n    if options.verbose == False or options.verbose == True:\n        connector.verbose = options.verbose\n\n    if options.empty:\n        print(\n            \"This will delete the whole index, are you sure ? Then enter \\\"yes\\\"\")\n        descision = input()\n        if descision == \"yes\" or \"Yes\" or \"YES\":\n            connector.empty()\n\n    # index each filename\n    for uri in args:\n        connector.delete(uri)\n"
  },
  {
    "path": "src/opensemanticetl/etl_enrich.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nfrom etl import ETL\nimport pysolr\nimport export_solr\nimport importlib\nimport threading\n\n# Todo: Abstraction of querying data to a function of output plugin\n# so this will work for other index or database than Solr, too\n\n# Todo: Yet problem, if you run only enrichment plugins (i.e. OCR) without container plugins (i.e. mailbox extractor or ZIP archive extractor), if you want to run this plugin with a container plugin:\n# because the container is marked as done in first run (but was not extract because container/extraction plugin not active) so next run with this continer_plugins/extractors wont enrich them anymore\n\n# Since we use enrichment of queries only for OCR after indexing in Open Semantic Desktop Search where we know we call the OCR plugin with all container plugins, fixing or maybe making this perfect later (maybe better classification/management of container plugins or defining in plugins, if they need access to file)\n\n\nclass ETL_Enrich(ETL):\n\n    def __init__(self, plugins=(), verbose=False):\n\n        ETL.__init__(self, plugins=list(plugins), verbose=verbose)\n\n        self.read_configfile('/etc/etl/config')\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-rdf')\n\n        self.fields = self.getfieldnames_from_plugins()\n\n        # init exporter\t(todo: exporter as extended PySolr)\n        self.export_solr = export_solr.export_solr()\n\n        # init PySolr\n        solr_uri = self.config['solr']\n        if not solr_uri.endswith('/'):\n            solr_uri += '/'\n        solr_uri += self.config['index']\n\n        self.solr = pysolr.Solr(solr_uri)\n\n        self.threads_max = None\n\n        # if not set explicit, autodetection of count of CPUs for amount of threads\n        if not self.threads_max:\n            import multiprocessing\n            self.threads_max = multiprocessing.cpu_count()\n            if self.verbose:\n                print(\"Setting threads to count of CPUs: \" +\n                      str(self.threads_max))\n\n        self.rows_per_step = 100\n        if self.rows_per_step < self.threads_max * 2:\n            self.rows_per_step = self.threads_max * 2\n\n        self.work_in_progress = []\n        self.delete_from_work_in_progress_lock = threading.Lock()\n\n        self.delete_from_work_in_progress_after_commit = []\n        self.work_in_progress_lock = threading.Lock()\n\n        self.e_job_done = threading.Event()\n\n    #\n    # get all the fields needed by all plugins for analysis\n    #\n\n    def getfieldnames_from_plugins(self):\n\n        # the field id is needed for every enrichment\n        fields = ['id']\n\n        # read all fieldnames, the plugins need to analyze\n        for plugin in self.config['plugins']:\n\n            module = importlib.import_module(plugin)\n            objectreference = getattr(module, plugin, False)\n            if objectreference:\n                modulefields = getattr(objectreference, 'fields', False)\n            if modulefields:\n                for field in modulefields:\n                    # only if not there yet from other plugin\n                    if not field in fields:\n                        fields.append(field)\n\n        return fields\n\n    #\n    # Start ETL process / run of all set plugins\n    #\n\n    def enrich_document(self, docid):\n\n        try:\n\n            if self.verbose:\n                print(\"Enriching 
{}\".format(docid))\n\n            parameters = self.config.copy()\n\n            #\n            # read data from analyzed fields and add to parameters\n            #\n\n            # id is the only field, so we do not have to get it again from the index or database\n            if len(self.fields) == 1:\n                parameters['id'] = docid\n\n            # if more than the id is needed, add the fields from DB/index to the parameters, since that data is analysed by the plugins\n            else:\n\n                data = self.export_solr.get_data(\n                    docid=docid, fields=self.fields)\n\n                # add the to be analysed data of the first and only result to the ETL/Enrichment parameters\n                parameters.update(data)\n\n            filename = docid\n            # if present, delete the protocol prefix file://\n            if filename.startswith(\"file://\"):\n                filename = filename.replace(\"file://\", '', 1)\n\n            parameters['filename'] = filename\n\n            parameters['verbose'] = self.verbose\n\n            if self.verbose:\n                print(\"Parameters:\")\n                print(parameters)\n\n            # set markers that the document was enriched by these plugins\n            data = {}\n            for plugin in self.config['plugins']:\n                data['etl_' + plugin + '_b'] = True\n\n            # start ETL / Enrichment process\n            parameters, data = self.process(parameters=parameters, data=data)\n\n        finally:\n\n            # remove blacklisting/locking for this document, since the enrichment process is now done\n            self.work_in_progress_lock.acquire()\n\n            if docid in self.work_in_progress:\n                self.delete_from_work_in_progress_lock.acquire()\n\n                self.delete_from_work_in_progress_after_commit.append(docid)\n\n                self.delete_from_work_in_progress_lock.release()\n\n            self.work_in_progress_lock.release()\n\n            # set event, so the main thread wakes up and knows the next job/thread can be started\n            self.e_job_done.set()\n\n    #\n    # get query from plugin and start enrichment process for this query\n    #\n\n    # not usable for plugin chains (i.e. extract containers like ZIP files and then OCR their contents)! 
Use enrich_query with a compounded query instead.\n    def enrich(self):\n\n        for plugin in self.config['plugins']:\n\n            query = \"*:* AND NOT (etl_{}_b:true)\".format(plugin)\n\n            # check if plugin has own more special query and if so, use this\n            module = importlib.import_module(plugin)\n            objectreference = getattr(module, plugin, False)\n            if objectreference:\n                query = getattr(objectreference, 'query', query)\n\n            if self.verbose:\n                print(\"Data enrichment query: {}\".format(query))\n\n            # enrich\n            self.enrich_query(query)\n\n    # get ids from query\n    # get fields from plugins\n    # run enrichment chain with this parameters\n    def enrich_query(self, query):\n\n        counter = 0\n\n        solrparameters = {\n            'fl': 'id',\n            'rows': self.rows_per_step,\n        }\n\n        # have to proceed all documents matching this query:\n\n        # - all yet not enriched (query for content type AND not plugin_b: true) results of enriched content type (plugin query)\n        # OR\n        # - container file type like ZIP archive or PST mailbox\n        # - AND not yet enriched by all set plugins\n        #\n        # - but both cases not if no file but content of a container file (for subfiles the enrichment plugins will be runned by the run of the container plugin)\n\n        # query matching i.e. the contenttype of to be enriched files (for example if doing OCR we do only have to process images, not all documents)\n        running_plugin_query = '(' + query + ')'\n\n        # not, if all set plugins yet runned on this document\n        all_plugin_query = []\n        for plugin in self.config['plugins']:\n            all_plugin_query.append(\"(etl_{}_b:true)\".format(plugin))\n        all_plugin_query = '(' + ' AND '.join(all_plugin_query) + ')'\n\n        # matching container content types like archive files\n        # since our content types like f.e. 
images can be in containers like for example ZIP archives, so we should enrich them too)\n        container_query = '(content_type:application\\/zip OR id:*.zip OR content_type:application\\/vnd.ms-outlook-pst OR id:*.pst)'\n        # Todo for more performance:\n        # distinct container_s from results with query for only needed content types instead of working trough all containers\n\n        query = running_plugin_query + \\\n            ' OR (' + container_query + ' AND NOT ' + all_plugin_query + ')'\n\n        # not try to enrich (not existent in filesystem) subfiles inside container files like ZIP archives or extracted mail attachments\n        # since for subfiles the enrichment plugins will be runned by the run of the container plugin)\n        query = '(' + query + ') AND NOT (container_s:*)'\n\n        if self.verbose:\n            print(\"Enrichment of matches the following query:\")\n            print(query)\n\n        results = self.solr.search(query, **solrparameters)\n\n        while len(results) > 0:\n\n            for result in results:\n\n                docid = result['id']\n\n                if self.threads_max == 1:\n\n                    # no threading, do it directly in this process\n                    self.enrich_document(docid=docid)\n                    counter += 1\n\n                else:\n\n                    #\n                    # Manage threading\n                    #\n\n                    # If doc id blacklistet (work in progress in running threads) don't start thread for docid,\n                    # since new search result can include some same documents again which not yet ready enriched because work goes on in a thread from step before.\n                    # So continue with next result.\n                    if docid in self.work_in_progress:\n                        continue\n\n                    # wait for a job done if yet maximum threads (+1 because do not count this main thread) running\n                    while threading.active_count() >= self.threads_max + 1:\n                        # wait for event that signals that a thread/job finished (set in enrich_document() at end)\n                        # use a timeout if racing condition (setting in finished job before clearing this event here)\n                        self.e_job_done.wait(1)\n\n                    # blacklist id of document work in progres\n                    self.work_in_progress_lock.acquire()\n                    self.work_in_progress.append(docid)\n                    self.work_in_progress_lock.release()\n\n                    # start enrichment of this document in new thread\n                    thread = threading.Thread(\n                        target=self.enrich_document, args=(docid, ))\n                    self.e_job_done.clear()\n                    thread.start()\n\n                    counter += 1\n\n            # do commit, so next query wont find documents again, which were processed but not available for searcher yet\n            self.commit()\n\n            #\n            # delete done IDs from blacklist\n            #\n\n            self.delete_from_work_in_progress_lock.acquire()\n\n            while len(self.delete_from_work_in_progress_after_commit) > 0:\n                docid = self.delete_from_work_in_progress_after_commit.pop()\n\n                self.work_in_progress_lock.acquire()\n                self.work_in_progress.remove(docid)\n                self.work_in_progress_lock.release()\n\n            
self.delete_from_work_in_progress_lock.release()\n\n            #\n            # If this is the last step (fewer search results than a step manages), wait for all threads to be done\n            # before starting a new search / next step (which would only find the documents again that are work in progress in running threads and not done/marked as ready yet), to prevent unnecessary search load\n            #\n\n            if len(results) < self.rows_per_step:\n                # wait until all started threads are done before continuing (commit and end)\n\n                while threading.active_count() > 1:\n                    # wait for event that signals that a thread/job finished (set in enrich_document() at end)\n                    # use a timeout in case of a race condition (event set in finished job before clearing this event here)\n                    self.e_job_done.wait(1)\n                    self.e_job_done.clear()\n\n                # commit last results, so the very last enrichments do not have to wait for the next autocommit time (if set up)\n                self.commit()\n\n            # are there (more) not yet enriched documents in the search index for a next step?\n            results = self.solr.search(query, **solrparameters)\n\n        # if self.verbose:\n        print(\"Enriched {} documents\".format(counter))\n\n\n# todo: export to Solr by update\n\nif __name__ == \"__main__\":\n\n    # get uri or filename from args\n    from optparse import OptionParser\n\n    parser = OptionParser(\"etl-enrich [options] --plugins pluginname\")\n\n    parser.add_option(\"-p\", \"--plugins\", dest=\"plugins\",\n                      default=False, help=\"Plugins (comma separated)\")\n    parser.add_option(\"-c\", \"--config\", dest=\"config\",\n                      default=False, help=\"Config file\")\n    parser.add_option(\"-q\", \"--query\", dest=\"query\",\n                      default=False, help=\"Query\")\n    parser.add_option(\"-o\", \"--outputfile\", dest=\"outputfile\", default=False,\n                      help=\"Output file (if exporter set to a file format)\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=None, help=\"Print debug messages\")\n    (options, args) = parser.parse_args()\n\n    etl = ETL_Enrich()\n\n    if options.config:\n        etl.read_configfile(options.config)\n        etl.fields = etl.getfieldnames_from_plugins()\n\n    # only overwrite verbose setting if explicitly given on command line\n    if options.verbose is not None:\n        etl.verbose = options.verbose\n\n    # set (or if config overwrite) plugin config\n    if options.plugins:\n        etl.config['plugins'] = options.plugins.split(',')\n        etl.fields = etl.getfieldnames_from_plugins()\n\n    if options.outputfile:\n        etl.config['outputfile'] = options.outputfile\n\n    # if query, enrich IDs matching this query\n    if options.query:\n\n        etl.enrich_query(options.query)\n\n    # if no query and no IDs as arguments, use default query from plugins\n    elif len(args) == 0:\n\n        etl.enrich()\n\n    # if no query but IDs\n    else:\n\n        for uri in args:\n\n            # todo: if not local file, download to temp file if a plugin needs parameter filename\n\n            etl.enrich_document(docid=uri)\n\n        etl.commit()\n"
  },
  {
    "path": "src/opensemanticetl/etl_file.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport os.path\nimport sys\n\nfrom etl import ETL\n\n\nclass Connector_File(ETL):\n\n    def __init__(self, verbose=False, quiet=True):\n\n        ETL.__init__(self, verbose=verbose)\n\n        self.quiet = quiet\n\n        self.set_configdefaults()\n\n        self.read_configfiles()\n\n    def set_configdefaults(self):\n        #\n        # Standard config\n        #\n        # Do not edit config here!\n        # Overwrite options in /etc/etl/\n        # or /etc/opensemanticsearch/connector-files\n        #\n\n        ETL.set_configdefaults(self)\n\n        self.config['force'] = False\n\n        # filename to URI mapping\n        self.config['mappings'] = {\"/\": \"file:///\"}\n\n        self.config['facet_path_strip_prefix'] = [\n            \"file://\",\n            \"http://www.\", \"https://www.\", \"http://\", \"https://\"]\n\n        self.config['plugins'] = [\n            'enhance_mapping_id',\n            'filter_blacklist',\n            'filter_file_not_modified',\n            'enhance_extract_text_tika_server',\n            'enhance_detect_language_tika_server',\n            'enhance_contenttype_group',\n            'enhance_pst',\n            'enhance_csv',\n            'enhance_file_mtime',\n            'enhance_path',\n            'enhance_extract_hashtags',\n            'enhance_warc',\n            'enhance_zip',\n            'clean_title',\n            'enhance_multilingual',\n        ]\n\n        self.config['blacklist'] = [\n            \"/etc/opensemanticsearch/blacklist/blacklist-url\"]\n        self.config['blacklist_prefix'] = [\n            \"/etc/opensemanticsearch/blacklist/blacklist-url-prefix\"]\n        self.config['blacklist_suffix'] = [\n            \"/etc/opensemanticsearch/blacklist/blacklist-url-suffix\"]\n        self.config['blacklist_regex'] = [\n            \"/etc/opensemanticsearch/blacklist/blacklist-url-regex\"]\n        self.config['whitelist'] = [\n            \"/etc/opensemanticsearch/blacklist/whitelist-url\"]\n        self.config['whitelist_prefix'] = [\n            \"/etc/opensemanticsearch/blacklist/whitelist-url-prefix\"]\n        self.config['whitelist_suffix'] = [\n            \"/etc/opensemanticsearch/blacklist/whitelist-url-suffix\"]\n        self.config['whitelist_regex'] = [\n            \"/etc/opensemanticsearch/blacklist/whitelist-url-regex\"]\n\n    def read_configfiles(self):\n        #\n        # include configs\n        #\n\n        # Windows style filenames\n        self.read_configfile('conf\\\\opensemanticsearch-etl')\n        self.read_configfile('conf\\\\opensemanticsearch-enhancer-rdf')\n        self.read_configfile('conf\\\\opensemanticsearch-connector-files')\n\n        # Linux style filenames\n        self.read_configfile('/etc/etl/config')\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/etl-webadmin')\n        self.read_configfile('/etc/opensemanticsearch/etl-custom')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-rdf')\n        self.read_configfile('/etc/opensemanticsearch/facets')\n        self.read_configfile('/etc/opensemanticsearch/connector-files')\n        self.read_configfile('/etc/opensemanticsearch/connector-files-custom')\n\n    # clean filename (convert filename given as URI to filesystem)\n    def clean_filename(self, filename):\n\n        # if exist delete prefix file://\n\n        if filename.startswith(\"file://\"):\n            filename = 
filename.replace(\"file://\", '', 1)\n\n        return filename\n\n    # index directory or file\n    def index(self, filename):\n\n        # clean filename (convert filename given as URI to filesystem)\n        filename = self.clean_filename(filename)\n\n        # if singe file start to index it\n        if os.path.isfile(filename):\n\n            self.index_file(filename=filename)\n\n            result = True\n\n        # if directory walkthrough\n        elif os.path.isdir(filename):\n\n            self.index_dir(rootDir=filename)\n\n            result = True\n\n        # else error message\n        else:\n\n            result = False\n\n            sys.stderr.write(\n                \"No such file or directory: {}\\n\".format(filename))\n\n        return result\n\n    # walk through all subdirectories and call index_file for each file\n    def index_dir(self, rootDir, followlinks=False):\n\n        for dirName, subdirList, fileList in os.walk(rootDir,\n                                                     followlinks=followlinks):\n\n            if self.verbose:\n                print(\"Scanning directory: {}\".format(dirName))\n\n            for fileName in fileList:\n                if self.verbose:\n                    print(\"Scanning file: {}\".format(fileName))\n\n                try:\n\n                    fullname = dirName\n                    if not fullname.endswith(os.path.sep):\n                        fullname += os.path.sep\n                    fullname += fileName\n\n                    self.index_file(filename=fullname)\n\n                except KeyboardInterrupt:\n                    raise KeyboardInterrupt\n                except BaseException as e:\n                    try:\n                        sys.stderr.write(\n                            \"Exception while processing file {}{}{} : {}\\n\"\n                            .format(dirName, os.path.sep, fileName, e))\n                    except BaseException:\n                        sys.stderr.write(\n                            \"Exception while processing a file and exception \"\n                            \"while printing error message (maybe problem with\"\n                            \" encoding of filename on console or converting \"\n                            \"the exception to string?)\\n\")\n\n    # Index a file\n    def index_file(self, filename, additional_plugins=()):\n\n        # clean filename (convert filename given as URI to filesystem)\n        filename = self.clean_filename(filename)\n\n        # fresh parameters / chain for each file (so processing one file will\n        # not change config/parameters for next, if directory or multiple\n        # files, which would happen if given by reference)\n        parameters = self.config.copy()\n        if additional_plugins:\n            parameters['plugins'].extend(additional_plugins)\n\n        if self.verbose:\n            parameters['verbose'] = True\n\n        data = {}\n\n        # add this connector name to ETL status\n        data['etl_file_b'] = True\n\n        if 'id' not in parameters:\n            parameters['id'] = filename\n\n        parameters['filename'] = filename\n\n        parameters, data = self.process(parameters=parameters, data=data)\n\n        return parameters, data\n\n#\n# Read command line arguments and start\n#\n\n\n# if running (not imported to use its functions), run main function\nif __name__ == \"__main__\":\n\n    from argparse import ArgumentParser\n\n    # get uri or filename and (optional) parameters from args\n\n    
def key_val(s):\n        # split only on the first \"=\" so values may contain \"=\" themselves\n        return s.split(\"=\", 1)\n\n    parser = ArgumentParser(\"etl-file\")\n    parser.add_argument(\"-q\", \"--quiet\",\n                        action=\"store_true\",\n                        default=None,\n                        help=\"Don\\'t print status (filenames) while indexing\")\n    parser.add_argument(\"-v\", \"--verbose\", dest=\"verbose\",\n                        action=\"store_true\",\n                        default=None, help=\"Print debug messages\")\n    parser.add_argument(\"-f\", \"--force\", dest=\"force\", action=\"store_true\",\n                        default=None,\n                        help=\"Force (re)indexing, even if no changes\")\n    parser.add_argument(\"-c\", \"--config\",\n                        help=\"Config file\")\n    parser.add_argument(\"-p\", \"--plugins\",\n                        type=lambda s: s.split(\",\"),\n                        help=\"Plugin chain to use instead of configured \"\n                        \"plugins (comma separated and in order)\")\n    parser.add_argument(\"-a\", \"--additional-plugins\",\n                        dest=\"additional_plugins\",\n                        type=lambda s: s.split(\",\"),\n                        help=\"Plugins to add to default/configured plugins\"\n                        \" (comma separated and in order)\")\n    parser.add_argument(\"-w\", \"--outputfile\", dest=\"outputfile\",\n                        help=\"Output file\")\n    parser.add_argument(\"--param\", action=\"append\",\n                        type=key_val,\n                        help=\"Set a config parameter (key=value). \"\n                        \"Can be specified multiple times\")\n    parser.add_argument(\"args\", nargs=\"+\", help=\"Input files\")\n\n    options = {key: val for key, val in vars(parser.parse_args()).items()\n               if val is not None}\n\n    args = options.pop(\"args\")\n\n    connector = Connector_File()\n\n    # add optional config parameters\n    config = options.pop(\"config\", None)\n    if config:\n        connector.read_configfile(config)\n\n    plugins = options.pop(\"plugins\", []) + \\\n        options.pop(\"additional_plugins\", [])\n\n    # set (or if config overwrite) plugin config\n    if plugins:\n        connector.config['plugins'] = plugins\n\n    connector.config.update(dict(options.pop(\"param\", {})))\n\n    connector.config.update(options)\n\n    # index each filename\n    for filename in args:\n        connector.index(filename)\n\n    # commit changes, if not yet done automatically by index timer\n    connector.commit()\n\n    # after the file(s) have been processed with the basic/first stage config:\n    # if plugins or config options are configured for a later stage, reprocess with the additional config\n    additional_plugins_later = connector.config.get('additional_plugins_later', [])\n\n    additional_plugins_later_config = connector.config.get('additional_plugins_later_config', {})\n\n    if len(additional_plugins_later) > 0 or len(additional_plugins_later_config) > 0:\n\n        if connector.config.get('verbose'):\n            print(\"There are options configured for later stage, so (re)processing with additional plugins {} and/or config {}\"\n                  .format(additional_plugins_later, additional_plugins_later_config))\n\n        for option in additional_plugins_later_config:\n            connector.config[option] = additional_plugins_later_config[option]\n\n        # extend the plugin chain once (not once per config option, and also if only plugins were configured)\n        connector.config['plugins'].extend(additional_plugins_later)\n\n        # index each 
filename\n        for filename in args:\n            connector.index(filename=filename)\n\n        # commit changes, if not yet done automatically by index timer\n        connector.commit()\n"
  },
  {
    "path": "src/opensemanticetl/etl_filedirectory.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nfrom etl_file import Connector_File\n\n#\n# Parallel processing of files by adding each file to celery tasks\n#\n\n\nclass Connector_Filedirectory(Connector_File):\n\n    def __init__(self, verbose=False, quiet=False):\n\n        Connector_File.__init__(self, verbose=verbose)\n\n        self.quiet = quiet\n\n        # apply filters before adding to queue, so filtered or yet indexed files not added to queue\n        # adding to queue by plugin export_queue_files\n\n        # exporter to index filenames before text extraction and other later tasks\n        # will run before adding tasks to queue by export_queue_files\n        # so reseted plugin status will be in index before started ETL tasks apply not modified filter\n        export_to_index = self.config['export']\n\n        self.config['plugins'] = [\n            'enhance_mapping_id',\n            'filter_blacklist',\n            'filter_file_not_modified',\n            'enhance_file_mtime',\n            'enhance_path',\n            'enhance_entity_linking',\n            'enhance_multilingual',\n            export_to_index,\n            'export_queue_files',\n        ]\n\n#\n# Read command line arguments and start\n#\n\n\n# if running (not imported to use its functions), run main function\nif __name__ == \"__main__\":\n\n    from optparse import OptionParser\n\n    # get uri or filename from args\n\n    parser = OptionParser(\"etl-filedirectory [options] filename\")\n    parser.add_option(\"-q\", \"--quiet\", dest=\"quiet\", action=\"store_true\",\n                      default=None, help=\"Don\\'t print status (filenames) while indexing\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=None, help=\"Print debug messages\")\n\n    (options, args) = parser.parse_args()\n\n    if len(args) < 1:\n        parser.error(\"No filename given\")\n\n    connector = Connector_Filedirectory()\n\n    if options.verbose == False or options.verbose == True:\n        connector.verbose = options.verbose\n\n    if options.quiet == False or options.quiet == True:\n        connector.quiet = options.quiet\n\n    # index each filename\n    for filename in args:\n        connector.index(filename)\n"
  },
  {
    "path": "src/opensemanticetl/etl_filemonitoring.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nfrom argparse import ArgumentParser\n\nimport pyinotify\n\nfrom tasks import index_file\nfrom tasks import delete\n\nfrom etl import ETL\nfrom enhance_mapping_id import mapping\n\nfrom move_indexed_file import move_files, move_dir\n\n\nclass EventHandler(pyinotify.ProcessEvent):\n\n    def __init__(self):\n        super().__init__()\n        self.verbose = False\n        self.config = {}\n\n    def process_IN_CLOSE_WRITE(self, event):\n        if self.verbose:\n            print(\"Close_write: {}\".format(event.pathname))\n\n        self.index_file(filename=event.pathname)\n\n    def process_IN_MOVED_TO(self, event):\n        if self.verbose:\n            print(\"Move: {} -> {}\".format(event.src_pathname, event.pathname))\n\n        if event.dir:\n            self.move_dir(src=event.src_pathname, dest=event.pathname)\n        else:\n            self.move_file(src=event.src_pathname, dest=event.pathname)\n\n    def process_IN_DELETE(self, event):\n\n        if self.verbose:\n            print(\"Delete {}:\".format(event.pathname))\n\n        self.delete_file(filename=event.pathname)\n\n    #\n    # write to queue\n    #\n\n    def move_file(self, src, dest):\n        if self.verbose:\n            print(\"Moving file from {} to {}\".format(src, dest))\n        solr_uri = self.config[\"solr\"] + self.config[\"index\"]\n        if not solr_uri.endswith(\"/\"):\n            solr_uri += \"/\"\n        move_files(solr_uri, moves={src: dest}, prefix=\"file://\")\n\n    def move_dir(self, src, dest):\n        if self.verbose:\n            print(\"Moving dir from {} to {}\".format(src, dest))\n        solr_uri = self.config[\"solr\"] + self.config[\"index\"]\n        if not solr_uri.endswith(\"/\"):\n            solr_uri += \"/\"\n        move_dir(solr_uri, src=src, dest=dest, prefix=\"file://\")\n\n    def index_file(self, filename):\n        if self.verbose:\n            print(\"Indexing file {}\".format(filename))\n\n        index_file.apply_async(\n            kwargs={'filename': filename}, queue='open_semantic_etl_tasks', priority=5)\n\n    def delete_file(self, filename):\n        uri = filename\n        if 'mappings' in self.config:\n            uri = mapping(value=uri, mappings=self.config['mappings'])\n\n        if self.verbose:\n            print(\"Deleting from index filename {} with URL {}\".format(\n                filename, uri))\n\n        delete.apply_async(kwargs={'uri': uri}, queue='open_semantic_etl_tasks', priority=6)\n\n\nclass Filemonitor(ETL):\n\n    def __init__(self, verbose=False):\n\n        ETL.__init__(self, verbose=verbose)\n\n        self.verbose = verbose\n\n        self.read_configfiles()\n\n        # Watched events\n        #\n        # We need IN_MOVE_SELF to track moved folder paths\n        # pyinotify-internally. 
If omitted, the os instructions\n        # mv /docs/src /docs/dest; touch /docs/dest/doc.pdf\n        # will produce a IN_MOVED_TO pathname=/docs/dest/ followed by\n        # IN_CLOSE_WRITE pathname=/docs/src/doc.pdf\n        # where we would like a IN_CLOSE_WRITE pathname=/docs/dest/doc.pdf\n        self.mask = (\n            pyinotify.IN_DELETE\n            | pyinotify.IN_CLOSE_WRITE\n            | pyinotify.IN_MOVED_TO\n            | pyinotify.IN_MOVED_FROM\n            | pyinotify.IN_MOVE_SELF\n        )\n\n        self.watchmanager = pyinotify.WatchManager()  # Watch Manager\n\n        self.handler = EventHandler()\n\n        self.notifier = pyinotify.Notifier(self.watchmanager, self.handler)\n\n    def read_configfiles(self):\n        #\n        # include configs\n        #\n\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/connector-files')\n\n    def add_watch(self, filename):\n\n        self.watchmanager.add_watch(\n            filename, self.mask, rec=True, auto_add=True)\n\n    @staticmethod\n    def add_watches_from_file(filename):\n        listfile = open(filename)\n        for line in listfile:\n            filename = line.strip()\n            # ignore empty lines and comment lines (starting with #)\n            if filename and not filename.startswith(\"#\"):\n\n                filemonitor.add_watch(filename)\n\n    def watch(self):\n\n        self.handler.config = self.config\n        self.handler.verbose = self.verbose\n\n        self.notifier.loop()\n\n\n# parse command line options\nparser = ArgumentParser(description=\"etl-filemonitor\")\nparser.add_argument(\"-v\", \"--verbose\", dest=\"verbose\",\n                    action=\"store_true\", default=False,\n                    help=\"Print debug messages\")\nparser.add_argument(\"-f\", \"--fromfile\", dest=\"fromfile\",\n                    default=None, help=\"File names config\")\nparser.add_argument(\"watchfiles\", nargs=\"*\",\n                    default=(), help=\"Files / directories to watch\")\nargs = parser.parse_args()\n\n\nfilemonitor = Filemonitor(verbose=args.verbose)\n\n\n# add watches for every file/dir given as command line parameter\nfor _filename in args.watchfiles:\n    filemonitor.add_watch(_filename)\n\n\n# add watches for every file/dir in list file\nif args.fromfile is not None:\n    filemonitor.add_watches_from_file(args.fromfile)\n\n\n# start watching\nfilemonitor.watch()\n"
  },
  {
    "path": "src/opensemanticetl/etl_hypothesis.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\n#\n# Import annotations from Hypothesis - https://hypothes.is\n#\n\nimport requests\nimport json\nimport sys\n\nfrom etl import ETL\n\nfrom etl_web import Connector_Web\n\nimport export_solr\n\n\nclass Connector_Hypothesis(ETL):\n\n    verbose = False\n\n    documents = True\n\n    token = None\n\n    api = 'https://hypothes.is/api/'\n\n    # how many annotations to download at once / per page\n    limit = 10\n\n    # initialize Open Semantic ETL\n    etl = ETL()\n    etl.read_configfile('/etc/etl/config')\n    etl.read_configfile('/etc/opensemanticsearch/etl')\n    etl.read_configfile('/etc/opensemanticsearch/hypothesis')\n    etl.verbose = verbose\n\n    exporter = export_solr.export_solr()\n\n    #\n    # index the annotated document, if not yet in index\n    #\n\n    def etl_document(self, uri):\n\n        result = True\n        doc_mtime = self.exporter.get_lastmodified(docid=uri)\n\n        if doc_mtime:\n\n            if self.verbose:\n                print(\n                    \"Annotated document in search index. No new indexing of {}\".format(uri))\n\n        else:\n            # Download and Index the new or updated uri\n\n            if self.verbose:\n                print(\n                    \"Annotated document not in search index. Start indexing of {}\".format(uri))\n\n            try:\n                etl = Connector_Web()\n                etl.index(uri=uri)\n            except KeyboardInterrupt:\n                raise KeyboardInterrupt\n            except BaseException as e:\n                sys.stderr.write(\n                    \"Exception while getting {} : {}\".format(uri, e))\n                result = False\n        return result\n\n    #\n    # import an annotation\n    #\n\n    def etl_annotation(self, annotation):\n\n        parameters = {}\n        parameters['plugins'] = ['enhance_multilingual']\n\n        # since there can be multiple annotations for same URI,\n        # do not overwrite but add value to existent values of the facet/field/property\n        parameters['add'] = True\n        data = {}\n\n        # id/uri of the annotated document, not the annotation id\n        parameters['id'] = annotation['uri']\n\n        # first index / etl the webpage / document that has been annotated if not yet in index\n        if self.documents:\n            result = self.etl_document(uri=annotation['uri'])\n        if not result:\n            data['etl_error_hypothesis_ss'] = \"Error while indexing the document that has been annotated\"\n\n        # annotation id\n        data['annotation_id_ss'] = annotation['id']\n\n        data['annotation_text_txt'] = annotation['text']\n\n        tags = []\n        if 'tags' in annotation:\n\n            if self.verbose:\n                print(\"Tags: {}\".format(annotation['tags']))\n\n            for tag in annotation['tags']:\n                tags.append(tag)\n        data['annotation_tag_ss'] = tags\n\n        # write annotation to database or index\n        self.etl.process(parameters=parameters, data=data)\n\n    #\n    # import all annotations since last imported annotation\n    #\n\n    def etl_annotations(self, last_update=\"\", user=None, group=None, tag=None, uri=None):\n\n        newest_update = last_update\n\n        if not self.api.endswith('/'):\n            self.api = self.api + '/'\n\n        searchurl = '{}search?limit={}&sort=updated&order=desc'.format(\n            self.api, self.limit)\n\n        if user:\n            searchurl += 
\"&user={}\".format(user)\n\n        if group:\n            searchurl += \"&group={}\".format(group)\n\n        if tag:\n            searchurl += \"&tag={}\".format(tag)\n\n        if uri:\n            searchurl += \"&uri={}\".format(uri)\n\n        # Authorization\n        headers = {'user-agent': 'Open Semantic Search'}\n\n        if self.token:\n            headers['Authorization'] = 'Bearer ' + self.token\n\n        # stats\n        stat_downloaded_annotations = 0\n        stat_imported_annotations = 0\n        stat_pages = 0\n\n        offset = 0\n        last_page = False\n\n        while not last_page:\n\n            searchurl_paged = searchurl + \"&offset={}\".format(offset)\n\n            # Call API / download annotations\n            if self.verbose:\n                print(\"Calling hypothesis API {}\".format(searchurl_paged))\n\n            request = requests.get(searchurl_paged, headers=headers)\n\n            result = json.loads(request.content.decode('utf-8'))\n\n            stat_pages += 1\n\n            if len(result['rows']) < self.limit:\n                last_page = True\n\n            # import annotations\n            for annotation in result['rows']:\n\n                stat_downloaded_annotations += 1\n\n                if annotation['updated'] > last_update:\n\n                    if self.verbose:\n                        print(\n                            \"Importing new annotation {}annotations/{}\".format(self.api, annotation['id']))\n                        print(annotation['text'])\n\n                    stat_imported_annotations += 1\n\n                    # save update time from newest annotation/edit\n                    if annotation['updated'] > newest_update:\n                        newest_update = annotation['updated']\n\n                    self.etl_annotation(annotation)\n\n                else:\n\n                    last_page = True\n\n            offset += self.limit\n\n        # commit to index, if yet buffered\n        self.etl.commit()\n\n        if self.verbose:\n            print(\"Downloaded annotations: {}\".format(\n                stat_downloaded_annotations))\n            print(\"Imported new annotations: {}\".format(\n                stat_imported_annotations))\n\n        return newest_update\n\n\n#\n# Read command line arguments and start\n#\n\n# if running (not imported to use its functions), run main function\nif __name__ == \"__main__\":\n\n    from optparse import OptionParser\n\n    # get uri or filename from args\n\n    parser = OptionParser(\"etl-file [options] filename\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=None, help=\"Print debug messages\")\n    parser.add_option(\"-a\", \"--api\", dest=\"api\",\n                      default=\"https://hypothes.is/api/\", help=\"API URL\")\n    parser.add_option(\"-p\", \"--token\", dest=\"token\",\n                      default=None, help=\"API token for authorization\")\n    parser.add_option(\"-d\", \"--documents\", dest=\"documents\", action=\"store_true\",\n                      default=True, help=\"Index content of annotated document(s), too\")\n    parser.add_option(\"-f\", \"--force\", dest=\"force\", action=\"store_true\",\n                      default=None, help=\"Force (re)indexing, even if no changes\")\n    parser.add_option(\"-c\", \"--config\", dest=\"config\",\n                      default=False, help=\"Config file\")\n    parser.add_option(\"-t\", \"--tag\", dest=\"tag\",\n                      
default=None, help=\"Filter for a tag\")\n    parser.add_option(\"-u\", \"--user\", dest=\"user\",\n                      default=None, help=\"Filter for a user\")\n    parser.add_option(\"-g\", \"--group\", dest=\"group\",\n                      default=None, help=\"Filter for a group\")\n\n    (options, args) = parser.parse_args()\n\n    connector = Connector_Hypothesis()\n\n    # add optional config parameters\n    if options.config:\n        connector.read_configfile(options.config)\n\n    if options.verbose == False or options.verbose == True:\n        connector.verbose = options.verbose\n\n    connector.documents = options.documents\n\n    if options.token:\n        connector.token = options.token\n\n    connector.api = options.api\n\n    connector.etl_annotations(\n        last_update=\"\", user=options.user, group=options.group, tag=options.tag)\n"
  },
  {
    "path": "src/opensemanticetl/etl_plugin_core.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport itertools\n\n#\n# Core functions used by multiple plugins, so they can be inherit from this class\n#\n\nclass Plugin(object):\n\n    filter_filename_suffixes = []\n    filter_mimetype_prefixes = []\n\n    # filter for mimetype prefixes or filename suffixes\n    def filter(self, parameters=None, data=None):\n\n        filtered = False\n        \n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        filename = None\n        if 'filename' in parameters:\n            filename = parameters['filename']\n\n        mimetype = None\n        if 'content_type_ss' in data:\n            mimetype = data['content_type_ss']\n        elif 'content_type_ss' in parameters:\n            mimetype = parameters['content_type_ss']\n\n        # if connector returns a list, use only first value\n        # (which is the only entry or the main content type of file)\n        if isinstance(mimetype, list):\n            mimetype = mimetype[0]\n\n        # is there a filename suffix match?\n        match_filename_suffix = False\n        if filename:\n            for suffix in self.filter_filename_suffixes:\n                if filename.lower().endswith(suffix.lower()):\n                    if verbose:\n                        print('Filename suffix matches plugin filter(s) {}'.format(self.filter_filename_suffixes))\n    \n                    match_filename_suffix = True\n            \n        # is there a mimetype prefix match?\n        match_contenttype_prefix = False\n        if mimetype:\n            for prefix in self.filter_mimetype_prefixes:\n                if mimetype.lower().startswith(prefix.lower()):\n                    if verbose:\n                        print('Contenttype matches plugin filter(s) {}'.format(self.filter_mimetype_prefixes))\n    \n                    match_contenttype_prefix = True\n\n        # if filter(s) configured for file suffix or mimetype prefix, set filtered if matches\n        if len(self.filter_mimetype_prefixes)>0 and len(self.filter_filename_suffixes) > 0:\n            if not match_filename_suffix and not match_contenttype_prefix:\n                if verbose:\n                    print('Wether filename suffix nor content type matches plugin filter(s) for mimetypes {} or filename suffixes {}, so no further processing of this plugin'.format(self.filter_mimetype_prefixes, self.filter_filename_suffixes))\n                filtered = True\n        elif len(self.filter_mimetype_prefixes)>0:\n            if not match_contenttype_prefix:\n                if verbose:\n                    print('Contenttype does not match plugin filter(s) {}, so no further processing of this plugin'.format(self.filter_mimetype_prefixes))\n                filtered = True\n        elif len(self.filter_filename_suffixes)>0:\n            if not match_filename_suffix:\n                if verbose:\n                    print('Filename suffix does not match plugin filter(s) {}, so no further processing of this plugin'.format(self.filter_filename_suffixes))\n                filtered = True\n            \n        return filtered\n\n\ndef get_text(data):\n    values_list = []\n\n    #\n    # exclude fields like technical metadata\n    #\n\n    exclude_fields_prefix = []\n\n    listfile = 
open('/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix')\n    for line in listfile:\n        line = line.strip()\n        if line and not line.startswith(\"#\"):\n            exclude_fields_prefix.append(line)\n    listfile.close()\n\n    # suffixes of non-text fields like nubers\n    exclude_fields_suffix = []\n\n    listfile = open('/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix')\n    for line in listfile:\n        line = line.strip()\n        if line and not line.startswith(\"#\"):\n            exclude_fields_suffix.append(line)\n    listfile.close()\n\n    # full fieldnames\n    exclude_fields = []\n    listfile = open('/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname')\n    for line in listfile:\n        line = line.strip()\n        if line and not line.startswith(\"#\"):\n            exclude_fields.append(line)\n    listfile.close()\n\n\n    for field in data:\n\n        is_blacklisted = False\n\n        for blacklisted_prefix in exclude_fields_prefix:\n            if field.startswith(blacklisted_prefix):\n                is_blacklisted = True\n        \n        for blacklisted_suffix in exclude_fields_suffix:\n            if field.endswith(blacklisted_suffix):\n                is_blacklisted = True\n        \n        if field in exclude_fields:\n            is_blacklisted = True\n\n        if not is_blacklisted:\n    \n            values = data[field]\n    \n            if not isinstance(values, list):\n                values = [values]\n\n            values_list.append(values)\n\n\n    # Flatten:\n    values = itertools.chain.from_iterable(values_list)\n\n    # Remove empty values:\n    values = filter(None, values)\n\n    # Make sure everything is a string:\n    values = (\n        value if isinstance(value, str) else \"{}\".format(value)\n        for value in values\n    )\n\n    # Ensure a trailing newline:\n    values = itertools.chain(values, [\"\"])\n\n    # Concatenate:\n    return \"\\n\".join(values)\n\n\n# append values (i.e. 
from an enhancer) to data structure\ndef append(data, facet, values):\n\n    # if facet there yet, append/extend the values, else set values to facet\n    if facet in data:\n\n        # if new value(s) single value instead of list convert to list\n        if not isinstance(values, list):\n            values = [values]\n\n        # if facet in data single value instead of list convert to list\n        if not isinstance(data[facet], list):\n            data[facet] = [data[facet]]\n\n        # add new values to this list\n        data[facet].extend(values)\n\n        # dedupe data in facet\n        data[facet] = list(set(data[facet]))\n\n        # if only one value, it has not to be a list\n        if len(data[facet]) == 1:\n            data[facet] = data[facet][0]\n\n    else:\n        data[facet] = values\n\n\n#\n# Get preferred label(s) from field in format pref label and uri\n#\ndef get_preflabels(values):\n\n    uri2preflabel_map = {}\n\n    if values:\n\n        if not isinstance(values, list):\n            values = [values]\n\n        for value in values:\n            pos_uri = value.rfind(' <')\n            uri = value[pos_uri+2:-1]\n            preflabel = value[0:pos_uri]\n            uri2preflabel_map[uri] = preflabel\n\n    return uri2preflabel_map\n\n\ndef get_all_matchtexts(values):\n    \n    results = {}\n\n    if not isinstance(values, list):\n        values = [values]\n\n    for value in values:\n\n        #get only matchtext (without ID/URI of matching entity)\n        value = value.split(\"\\t\")\n        matchid = value[0]\n        matchtext = value[1]\n        \n        if not matchid in results:\n            results[matchid] = []\n            \n        if not matchtext in results[matchid]:\n            results[matchid].append(matchtext)\n\n    return results\n"
  },
  {
    "path": "src/opensemanticetl/etl_rss.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport feedparser\nimport sys\n\nfrom etl_web import Connector_Web\n\nimport export_solr\n\n\nclass Connector_RSS(Connector_Web):\n\n    def __init__(self, verbose=False, quiet=True):\n\n        Connector_Web.__init__(self, verbose=verbose, quiet=quiet)\n\n        self.quiet = quiet\n        self.read_configfiles()\n\n    def read_configfiles(self):\n        #\n        # include configs\n        #\n\n        # windows style filenames\n        self.read_configfile('conf\\\\opensemanticsearch-connector')\n        self.read_configfile('conf\\\\opensemanticsearch-enhancer-ocr')\n        self.read_configfile('conf\\\\opensemanticsearch-enhancer-rdf')\n        self.read_configfile('conf\\\\opensemanticsearch-connector-web')\n        self.read_configfile('conf\\\\opensemanticsearch-connector-rss')\n\n        # linux style filenames\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/etl-webadmin')\n        self.read_configfile('/etc/opensemanticsearch/etl-custom')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-ocr')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-rdf')\n        self.read_configfile('/etc/opensemanticsearch/connector-web')\n        self.read_configfile('/etc/opensemanticsearch/connector-rss')\n\n    # Import Feed\n\n    #\n    # Import a RSS feed: If article has changed or not indexed, call download_and_index_to_solr()\n    #\n    def index(self, uri):\n\n        result = True\n\n        exporter = export_solr.export_solr()\n\n        feed = feedparser.parse(uri)\n\n        new_items = 0\n\n        for item in feed.entries:\n\n            articleuri = item.link\n\n            #\n            # Is new article or indexed in former runs?\n            #\n\n            doc_mtime = exporter.get_lastmodified(docid=articleuri)\n\n            if doc_mtime:\n\n                if self.verbose:\n                    print(\n                        \"Article indexed before, so skip new indexing: {}\".format(articleuri))\n\n            else:\n                # Download and Index the new or updated uri\n\n                if self.verbose:\n                    print(\"Article not in index: {}\".format(articleuri))\n\n                try:\n                    partresult = Connector_Web.index(self, uri=articleuri)\n                    if partresult == False:\n                        result = False\n                    new_items += 1\n\n                except KeyboardInterrupt:\n                    raise KeyboardInterrupt\n                except BaseException as e:\n                    sys.stderr.write(\n                        \"Exception while getting {} : {}\".format(articleuri, e))\n\n        if new_items:\n            exporter.commit()\n\n        return result\n\n#\n# If runned (not importet for functions) get parameters and start\n#\n\n\nif __name__ == \"__main__\":\n\n    # todo: if no protocoll, use http://\n\n    # get uri or filename from args\n    from optparse import OptionParser\n    parser = OptionParser(\"etl-rss [options] uri\")\n    parser.add_option(\"-q\", \"--quiet\", dest=\"quiet\", action=\"store_true\",\n                      default=None, help=\"Dont print status (filenames) while indexing\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=None, help=\"Print debug messages\")\n    parser.add_option(\"-c\", \"--config\", dest=\"config\",\n                
      default=False, help=\"Config file\")\n    parser.add_option(\"-p\", \"--plugins\", dest=\"plugins\",\n                      default=False, help=\"Plugins (comma separated)\")\n    parser.add_option(\"-w\", \"--outputfile\", dest=\"outputfile\",\n                      default=False, help=\"Output file\")\n\n    (options, args) = parser.parse_args()\n\n    if len(args) != 1:\n        parser.error(\"No uri(s) given\")\n\n    connector = Connector_RSS()\n\n    # add optional config parameters\n    if options.config:\n        connector.read_configfile(options.config)\n    if options.outputfile:\n        connector.config['outputfile'] = options.outputfile\n\n    # set (or if config overwrite) plugin config\n    if options.plugins:\n        connector.config['plugins'] = options.plugins.split(',')\n\n    if options.verbose == False or options.verbose == True:\n        connector.verbose = options.verbose\n\n    if options.quiet == False or options.quiet == True:\n        connector.quiet = options.quiet\n\n    for uri in args:\n        connector.index(uri)\n"
  },
  {
    "path": "src/opensemanticetl/etl_sitemap.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport sys\nimport urllib.request\nimport xml.etree.ElementTree as ElementTree\n\nfrom etl_web import Connector_Web\nimport tasks\n\n\nclass Connector_Sitemap(Connector_Web):\n\n    def __init__(self, verbose=False, quiet=True):\n\n        Connector_Web.__init__(self, verbose=verbose, quiet=quiet)\n\n        self.quiet = quiet\n        self.read_configfiles()\n        self.queue = True\n\n    def read_configfiles(self):\n        #\n        # include configs\n        #\n\n        # windows style filenames\n        self.read_configfile('conf\\\\opensemanticsearch-connector')\n        self.read_configfile('conf\\\\opensemanticsearch-enhancer-ocr')\n        self.read_configfile('conf\\\\opensemanticsearch-enhancer-rdf')\n        self.read_configfile('conf\\\\opensemanticsearch-connector-web')\n\n        # linux style filenames\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/etl-webadmin')\n        self.read_configfile('/etc/opensemanticsearch/etl-custom')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-ocr')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-rdf')\n        self.read_configfile('/etc/opensemanticsearch/connector-web')\n\n    # Import sitemap\n\n    # Index every URL of the sitemap\n\n    def index(self, sitemap):\n\n        if self.verbose or self.quiet == False:\n            print(\"Downloading sitemap {}\".format(sitemap))\n\n        sitemap = urllib.request.urlopen(sitemap)\n\n        et = ElementTree.parse(sitemap)\n\n        root = et.getroot()\n\n        # process subsitemaps if sitemapindex\n        for sitemap in root.findall(\"{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap\"):\n            url = sitemap.findtext(\n                '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')\n\n            if self.verbose or self.quiet == False:\n                print(\"Processing subsitemap {}\".format(url))\n\n            self.index(url)\n\n        #\n        # get urls if urlset\n        #\n\n        urls = []\n\n        # XML schema with namespace sitemaps.org\n        for url in root.findall(\"{http://www.sitemaps.org/schemas/sitemap/0.9}url\"):\n\n            url = url.findtext(\n                '{http://www.sitemaps.org/schemas/sitemap/0.9}loc')\n\n            urls.append(url)\n\n        # XML schema with namespace Google sitemaps\n        for url in root.findall(\"{http://www.google.com/schemas/sitemap/0.84}url\"):\n\n            url = url.findtext(\n                '{http://www.google.com/schemas/sitemap/0.84}loc')\n\n            urls.append(url)\n\n        # Queue or download and index the urls\n\n        for url in urls:\n\n            if self.queue:\n\n                # add webpage to queue as Celery task\n                try:\n\n                    if self.verbose or self.quiet == False:\n                        print(\"Adding URL to queue: {}\".format(url))\n\n                    result = tasks.index_web.apply_async(\n                        kwargs={'uri': url}, queue='tasks', priority=5)\n\n                except KeyboardInterrupt:\n                    raise KeyboardInterrupt\n                except BaseException as e:\n                    sys.stderr.write(\n                        \"Exception while adding to queue {} : {}\\n\".format(url, e))\n\n            else:\n\n                # batchmode, index page after page ourselves\n\n                try:\n                    if self.verbose or self.quiet 
== False:\n                        print(\"Indexing {}\".format(url))\n\n                    result = Connector_Web.index(self, uri=url)\n\n                except KeyboardInterrupt:\n                    raise KeyboardInterrupt\n                except BaseException as e:\n                    sys.stderr.write(\n                        \"Exception while indexing {} : {}\\n\".format(url, e))\n\n#\n# If runned (not imported for functions) get parameters and start\n#\n\n\nif __name__ == \"__main__\":\n\n    # get uri or filename from args\n    from optparse import OptionParser\n    parser = OptionParser(\"etl-sitemap [options] uri\")\n    parser.add_option(\"-q\", \"--quiet\", dest=\"quiet\", action=\"store_true\",\n                      default=False, help=\"Don't print status (filenames) while indexing\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=None, help=\"Print debug messages\")\n    parser.add_option(\"-b\", \"--batch\", dest=\"batchmode\", action=\"store_true\",\n                      default=None, help=\"Batch mode (Page after page instead of adding to queue)\")\n    parser.add_option(\"-c\", \"--config\", dest=\"config\",\n                      default=False, help=\"Config file\")\n    parser.add_option(\"-p\", \"--plugins\", dest=\"plugins\",\n                      default=False, help=\"Plugins (comma separated)\")\n\n    (options, args) = parser.parse_args()\n\n    if len(args) != 1:\n        parser.error(\"No sitemap uri(s) given\")\n\n    connector = Connector_Sitemap()\n\n    # add optional config parameters\n    if options.config:\n        connector.read_configfile(options.config)\n\n    # set (or if config overwrite) plugin config\n    if options.plugins:\n        connector.config['plugins'] = options.plugins.split(',')\n\n    if options.verbose == False or options.verbose == True:\n        connector.verbose = options.verbose\n\n    if options.quiet == False or options.quiet == True:\n        connector.quiet = options.quiet\n\n    if options.batchmode == True:\n        connector.queue = False\n\n    for uri in args:\n        connector.index(uri)\n"
  },
  {
    "path": "src/opensemanticetl/etl_sparql.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport os\nimport tempfile\n\nfrom etl import ETL\nfrom enhance_rdf import enhance_rdf\n\nfrom SPARQLWrapper import SPARQLWrapper, XML, JSON\n\n\n#\n# download (part of) graph by SPARQL query from SPARQL endpoint to RDF file\n#\n\ndef download_rdf_from_sparql_endpoint(endpoint, query):\n\n    # read graph by construct query results from SPARQL endpoint\n    sparql = SPARQLWrapper(endpoint)\n    sparql.setQuery(query)\n    sparql.setReturnFormat(XML)\n    results = sparql.query().convert()\n\n    # crate temporary filename\n    file = tempfile.NamedTemporaryFile()\n    filename = file.name\n    file.close()\n\n    # export graph to RDF file\n    results.serialize(destination=filename, format=\"xml\")\n\n    return filename\n\n\n#\n# Append values from SPARQL SELECT result to plain text list file\n#\n\ndef sparql_select_to_list_file(endpoint, query, filename=None):\n\n    # read graph by construct query results from SPARQL endpoint\n    sparql = SPARQLWrapper(endpoint)\n    sparql.setQuery(query)\n    sparql.setReturnFormat(JSON)\n    results = sparql.query().convert()\n\n    if not filename:\n        # crate temporary filename\n        listfile = tempfile.NamedTemporaryFile(delete=False)\n        filename = listfile.name\n        listfile.close()\n\n    listfile = open(filename, 'a', encoding=\"utf-8\")\n\n    for result in results[\"results\"][\"bindings\"]:\n\n        for variable in results[\"head\"][\"vars\"]:\n            if variable in result:\n                if \"value\" in result[variable]:\n                    value = result[variable][\"value\"]\n                    value = value.strip()\n                    if value:\n                        listfile.write(result[variable][\"value\"] + \"\\n\")\n\n    listfile.close()\n\n    return filename\n\n\nclass Connector_SPARQL(ETL):\n\n    def __init__(self, verbose=False, quiet=True):\n\n        ETL.__init__(self, verbose=verbose)\n\n        self.read_configfiles()\n\n        self.config[\"plugins\"] = []\n\n    def read_configfiles(self):\n        #\n        # include configs\n        #\n\n        # windows style filenames\n        self.read_configfile('conf\\\\opensemanticsearch-connector')\n        self.read_configfile('conf\\\\opensemanticsearch-enhancer-rdf')\n        self.read_configfile('conf\\\\opensemanticsearch-connector-sparql')\n\n        # linux style filenames\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/etl-custom')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-rdf')\n        self.read_configfile('/etc/opensemanticsearch/connector-sparql')\n\n    # Import RDF from SPARQL result\n\n    def index_rdf(self, endpoint, query):\n\n        # download (part of) graph from endpoint to temporary rdf file\n        rdffilename = download_rdf_from_sparql_endpoint(\n            endpoint=endpoint, query=query)\n\n        parameters = self.config.copy()\n\n        # import the triples of rdf graph by RDF plugin\n        enhancer = enhance_rdf()\n        enhancer.etl_graph_file(\n            docid=endpoint, filename=rdffilename, parameters=parameters)\n\n        os.remove(rdffilename)\n\n    # Import fields and values from SPARQL SELECT result\n\n    def index_select(self, endpoint, query):\n\n        # read graph by construct query results from SPARQL endpoint\n        sparql = SPARQLWrapper(endpoint)\n        sparql.setQuery(query)\n        sparql.setReturnFormat(JSON)\n        results = 
sparql.query().convert()\n\n        i = 0\n        for result in results[\"results\"][\"bindings\"]:\n            i += 1\n            data = {}\n            data['id'] = endpoint + \"/\" + query + \"/\" + str(i)\n\n            for variable in results[\"head\"][\"vars\"]:\n                if variable in result:\n                    if \"value\" in result[variable]:\n                        data[variable] = result[variable][\"value\"]\n\n            self.process(data=data)\n\n    # Import SPARQL result\n\n    def index(self, endpoint, query):\n\n        if query.startswith(\"SELECT \"):\n            self.index_select(endpoint, query)\n        else:\n            self.index_rdf(endpoint, query)\n\n\n#\n# If runned (not imported for functions) get parameters and start\n#\n\nif __name__ == \"__main__\":\n\n    # todo: if no protocoll, use http://\n\n    # get uri or filename from args\n    from optparse import OptionParser\n    parser = OptionParser(\"etl-sparql [options] uri query\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=None, help=\"Print debug messages\")\n    parser.add_option(\"-c\", \"--config\", dest=\"config\",\n                      default=False, help=\"Config file\")\n    parser.add_option(\"-p\", \"--plugins\", dest=\"plugins\",\n                      default=False, help=\"Plugins (comma separated)\")\n\n    (options, args) = parser.parse_args()\n\n    if len(args) != 2:\n        parser.error(\"Missing parameters endpoint URI and SPARQL query\")\n\n    connector = Connector_SPARQL()\n\n    # add optional config parameters\n    if options.config:\n        connector.read_configfile(options.config)\n\n    # set (or if config overwrite) plugin config\n    if options.plugins:\n        connector.config['plugins'] = options.plugins.split(',')\n\n    if options.verbose == False or options.verbose == True:\n        connector.verbose = options.verbose\n\n    connector.index(endpoint=args[0], query=args[1])\n"
  },
  {
    "path": "src/opensemanticetl/etl_twitter_scraper.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport twint\nimport sys\nfrom etl import ETL\nfrom tasks import index_web\n\nmodule = sys.modules[\"twint.storage.write\"]\n\netl = ETL()\netl.read_configfile('/etc/opensemanticsearch/etl')\netl.read_configfile('/etc/opensemanticsearch/etl-webadmin')\n\netl.config['plugins'] = ['enhance_path', 'enhance_entity_linking', 'enhance_multilingual']\netl.config['facet_path_strip_prefix'] = [\"http://www.\", \"https://www.\", \"http://\", \"https://\"]\n\n\ndef index_tweet(obj, config):\n    tweet = obj.__dict__\n\n    parameters = {}\n    parameters['id'] = tweet['link']\n\n    data = {}\n    data['content_type_ss'] = 'Tweet'\n    data['content_type_group_ss'] = 'Social media post'\n\n    data['author_ss'] = tweet['name']\n    data['userid_s'] = tweet['user_id_str']\n    data['username_ss'] = tweet['username']\n\n    data['title_txt'] = tweet['tweet']\n    data['content_txt'] = tweet['tweet']\n\n    data['hashtag_ss'] = tweet['hashtags']\n\n    if tweet['place']:\n        data['location_ss'] = tweet['place']\n\n    data['urls_ss'] = tweet['urls']\n\n    data['mentions_ss'] = tweet['mentions']\n\n    data['retweets_count_i'] = tweet['retweets_count']\n    data['likes_count_i'] = tweet['likes_count']\n    data['replies_count_i'] = tweet['replies_count']\n    data['file_modified_dt'] = tweet['datestamp'] + 'T' + tweet['timestamp'] + 'Z'\n\n    if config.Index_Linked_Webpages:\n        if data['urls_ss']:\n            for url in data['urls_ss']:\n                index_web.apply_async(kwargs={'uri': url}, queue='open_semantic_etl_tasks', priority=5)\n\n    try:\n        etl.process(parameters, data)\n    except BaseException as e:\n        sys.stderr.write(\"Exception while indexing tweet {} : {}\".format(parameters['id'], e))\n\n# overwrite twint json export method with custom function index_tweet\nmodule.Json = index_tweet\n\n\ndef index(search=None, username=None, Profile_full=False, limit=None, Index_Linked_Webpages=False):\n\n    c = twint.Config()\n    c.Hide_output = True\n    c.Store_json = True\n    c.Output = \"tweets.json\"\n\n    if username:\n        c.Username = username\n\n    if search:\n        c.Search = search\n\n    if limit:\n        c.Limit = limit\n\n    c.Index_Linked_Webpages = Index_Linked_Webpages\n\n    c.Profile_full = Profile_full\n\n    if Profile_full:\n        twint.run.Profile(c)\n    else:\n        twint.run.Search(c)\n\n    etl.commit()\n\n\n\n#\n# If running from command line (not imported as library) get parameters and start\n#\n\nif __name__ == \"__main__\":\n\n    # get uri or filename from args\n\n    from optparse import OptionParser\n\n    # get uri or filename from args\n\n    parser = OptionParser(\"etl-twitter-scraper [options]\")\n    parser.add_option(\"-u\", \"--user\", dest=\"username\",\n                      default=None, help=\"User\")\n    parser.add_option(\"-s\", \"--search\", dest=\"search\",\n                      default=None, help=\"Search\")\n    parser.add_option(\"-l\", \"--limit\", dest=\"limit\",\n                      default=None, help=\"Limit\")\n\n    (options, args) = parser.parse_args()\n\n    if not options.username and not options.search:\n        parser.error(\"No Username or search given\")\n\n    index(username=options.username, search=options.search, limit=options.limit)\n\n\n"
  },
  {
    "path": "src/opensemanticetl/etl_web.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport time\nimport urllib.request\nimport os\nfrom lxml import etree\nfrom dateutil import parser as dateparser\n\nfrom etl_file import Connector_File\n\n\nclass Connector_Web(Connector_File):\n\n    def __init__(self, verbose=False, quiet=True):\n\n        Connector_File.__init__(self, verbose=verbose)\n\n        self.quiet = quiet\n        self.set_configdefaults()\n        self.read_configfiles()\n\n    def set_configdefaults(self):\n\n        Connector_File.set_configdefaults(self)\n\n        #\n        # Standard config\n        #\n        # Do not edit config here! Overwrite options in /etc/opensemanticsearch/connector-web\n        #\n\n        # no filename to uri mapping\n        self.config['uri_prefix_strip'] = False\n        self.config['uri_prefix'] = False\n\n        # strip in facet path\n        self.config['facet_path_strip_prefix'] = ['http://www.',\n                                                  'http://',\n                                                  'https://www.',\n                                                  'https://',\n                                                  'ftp://'\n                                                  ]\n\n        self.config['plugins'] = [\n            'filter_blacklist',\n            'enhance_extract_text_tika_server',\n            'enhance_detect_language_tika_server',\n            'enhance_contenttype_group',\n            'enhance_pst',\n            'enhance_csv',\n            'enhance_path',\n            'enhance_zip',\n            'enhance_warc',\n            'enhance_extract_hashtags',\n            'clean_title',\n            'enhance_multilingual',\n\n        ]\n\n    def read_configfiles(self):\n        #\n        # include configs\n        #\n\n        # Windows style filenames\n        self.read_configfile('conf\\\\opensemanticsearch-etl')\n        self.read_configfile('conf\\\\opensemanticsearch-enhancer-rdf')\n        self.read_configfile('conf\\\\opensemanticsearch-connector-web')\n\n        # Linux style filenames\n        self.read_configfile('/etc/opensemanticsearch/etl')\n        self.read_configfile('/etc/opensemanticsearch/etl-webadmin')\n        self.read_configfile('/etc/opensemanticsearch/etl-custom')\n        self.read_configfile('/etc/opensemanticsearch/enhancer-rdf')\n        self.read_configfile('/etc/opensemanticsearch/facets')\n        self.read_configfile('/etc/opensemanticsearch/connector-web')\n        self.read_configfile('/etc/opensemanticsearch/connector-web-custom')\n\n    def read_mtime_from_html(self, tempfilename):\n        mtime = False\n\n        try:\n            parser = etree.HTMLParser()\n            tree = etree.parse(tempfilename, parser)\n\n            try:\n                mtimestring = tree.xpath(\n                    \"//meta[@http-equiv='last-modified']\")[0].get(\"content\")\n            except:\n                mtimestring = False\n\n            try:\n                mtimestring = tree.xpath(\n                    \"//meta[@name='last-modified']\")[0].get(\"content\")\n            except:\n                mtimestring = False\n        except:\n            mtimestring = False\n\n        if mtimestring:\n\n            if self.verbose:\n                print(\"Modification time in HTML: \", mtimestring)\n\n            try:\n                mtime = time.strptime(mtimestring)\n            except:\n                mtime = False\n\n            try:\n                # parse datetime\n                mtime = 
dateparser.parse(mtimestring)\n                # convert datetime to time\n                mtime = mtime.timetuple()\n\n            except BaseException as e:\n                print(\"Exception while reading last-modified from content: {}\".format(e))\n\n        if self.verbose:\n            print(\"Extracted modification time: {}\".format(mtime))\n\n        return mtime\n\n    def index(self, uri, last_modified=False, downloaded_file=False, downloaded_headers=None):\n        if downloaded_headers is None:\n            downloaded_headers = {}\n\n        parameters = self.config.copy()\n\n        if self.verbose:\n            parameters['verbose'] = True\n\n        data = {}\n\n        uri = uri.strip()\n        # if no protocol, add http://\n        if not uri.lower().startswith(\"http://\") and not uri.lower().startswith(\"https://\") and not uri.lower().startswith(\"ftp://\") and not uri.lower().startswith(\"ftps://\"):\n            uri = 'http://' + uri\n\n        parameters['id'] = uri\n\n        #\n        # Download to tempfile, if not yet downloaded by crawler\n        #\n\n        if downloaded_file:\n            tempfilename = downloaded_file\n            headers = downloaded_headers\n\n        else:\n\n            if self.verbose:\n                print(\"Downloading {}\".format(uri))\n\n            tempfilename, headers = urllib.request.urlretrieve(uri)\n\n            if self.verbose:\n                print(\"Download done\")\n\n        parameters['filename'] = tempfilename\n\n        #\n        # Modification time\n        #\n        mtime = False\n\n        # get meta \"last-modified\" from content\n        mtime = self.read_mtime_from_html(tempfilename)\n\n        # use HTTP status modification time\n        if not mtime:\n            try:\n\n                last_modified = headers['last-modified']\n\n                if self.verbose:\n                    print(\"HTTP Header Last-modified: {}\".format(last_modified))\n\n                mtime = dateparser.parse(last_modified)\n                # convert datetime to time\n                mtime = mtime.timetuple()\n\n                if self.verbose:\n                    print(\"Parsed date: {}\".format(mtime))\n\n            except:\n                mtime = False\n                print(\"Failed to parse HTTP header last-modified\")\n\n        # else HTTP create date\n        if not mtime:\n            try:\n                date = headers['date']\n\n                if self.verbose:\n                    print(\"HTTP Header date: {}\".format(date))\n\n                mtime = dateparser.parse(date)\n                # convert datetime to time\n                mtime = mtime.timetuple()\n\n                if self.verbose:\n                    print(\"Parsed date: {}\".format(mtime))\n\n            except:\n                mtime = False\n                print(\"Failed to parse HTTP header date\")\n\n        # else now\n        if not mtime:\n            mtime = time.localtime()\n\n        mtime_masked = time.strftime(\"%Y-%m-%dT%H:%M:%SZ\", mtime)\n\n        data['file_modified_dt'] = mtime_masked\n\n        # Enrich data and write to search index\n        parameters, data = self.process(parameters=parameters, data=data)\n\n        os.remove(tempfilename)\n\n\n#\n# If runned (not importet for functions) get parameters and start\n#\n\nif __name__ == \"__main__\":\n\n    # get uri or filename from args\n\n    from optparse import OptionParser\n\n    # get uri or filename from args\n\n    parser = OptionParser(\"etl-web [options] URL\")\n    
parser.add_option(\"-q\", \"--quiet\", dest=\"quiet\", action=\"store_true\",\n                      default=None, help=\"Do not print status (filenames) while indexing\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=None, help=\"Print debug messages\")\n    parser.add_option(\"-f\", \"--force\", dest=\"force\", action=\"store_true\",\n                      default=None, help=\"Force (re)indexing, even if no changes\")\n    parser.add_option(\"-c\", \"--config\", dest=\"config\",\n                      default=False, help=\"Config file\")\n    parser.add_option(\"-p\", \"--plugins\", dest=\"plugins\",\n                      default=False, help=\"Plugins (comma separated)\")\n    parser.add_option(\"-w\", \"--outputfile\", dest=\"outputfile\",\n                      default=False, help=\"Output file\")\n\n    (options, args) = parser.parse_args()\n\n    if len(args) != 1:\n        parser.error(\"No URI(s) given\")\n\n    connector = Connector_Web()\n\n    # add optional config parameters\n    if options.config:\n        connector.read_configfile(options.config)\n    if options.outputfile:\n        connector.config['outputfile'] = options.outputfile\n\n    # set (or if config overwrite) plugin config\n    if options.plugins:\n        connector.config['plugins'] = options.plugins.split(',')\n\n    if options.verbose == False or options.verbose == True:\n        connector.verbose = options.verbose\n\n    if options.quiet == False or options.quiet == True:\n        connector.quiet = options.quiet\n\n    if options.force == False or options.force == True:\n        connector.config['force'] = options.force\n\n    for uri in args:\n        connector.index(uri)\n"
  },
  {
    "path": "src/opensemanticetl/etl_web_crawl.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport tempfile\nimport re\n\nfrom scrapy.crawler import CrawlerProcess\n\nfrom scrapy.spiders import CrawlSpider, Rule\nfrom scrapy.linkextractors import LinkExtractor\n\nfrom tasks import index_web\n\n\nclass OpenSemanticETL_Spider(CrawlSpider):\n\n    name = \"Open Semantic ETL\"\n\n    def parse_item(self, response):\n\n        # write downloaded body to temp file\n        file = tempfile.NamedTemporaryFile(\n            mode='w+b', delete=False, prefix=\"etl_web_crawl_\")\n        file.write(response.body)\n        filename = file.name\n        file.close()\n\n        self.logger.info(\n            'Adding ETL task for downloaded page or file from %s', response.url)\n\n        downloaded_headers = {}\n        if 'date' in response.headers:\n                downloaded_headers['date'] = response.headers['date'].decode(\"utf-8\", errors=\"ignore\")\n        if 'last-modified' in response.headers:\n                downloaded_headers['last-modified'] = response.headers['last-modified'].decode(\"utf-8\", errors=\"ignore\")\n\n        # add task to index the downloaded file/page by ETL web in Celery task worker\n        index_web.apply_async(kwargs={'uri': response.url, 'downloaded_file': filename,\n                                      'downloaded_headers': downloaded_headers}, queue='open_semantic_etl_tasks', priority=5)\n\n\ndef index(uri, crawler_type=\"PATH\"):\n\n    configfile = '/etc/opensemanticsearch/connector-web'\n\n    # read config file\n    config = {}\n    exec(open(configfile).read(), locals())\n\n    name = \"Open Semantic ETL {}\".format(uri)\n\n    start_urls = [uri]\n\n    process = CrawlerProcess({\n        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'\n    })\n\n    if crawler_type == \"PATH\":\n        # crawl only the path\n        filter_regex = re.escape(uri) + '*'\n        rules = (\n            Rule(LinkExtractor(allow=filter_regex, deny_extensions=config['webcrawler_deny_extensions']), callback='parse_item'),\n        )\n        process.crawl(OpenSemanticETL_Spider,\n                      start_urls=start_urls, rules=rules, name=name)\n\n    else:\n        # crawl full domain and subdomains\n\n        allowed_domain = uri\n        # remove protocol prefix\n        if allowed_domain.lower().startswith('http://www.'):\n            allowed_domain = allowed_domain[11:]\n        elif allowed_domain.lower().startswith('https://www.'):\n            allowed_domain = allowed_domain[12:]\n        elif allowed_domain.lower().startswith('http://'):\n            allowed_domain = allowed_domain[7:]\n        elif allowed_domain.lower().startswith('https://'):\n            allowed_domain = allowed_domain[8:]\n\n        # get only domain name without path\n        allowed_domain = allowed_domain.split(\"/\")[0]\n\n        rules = (\n            Rule(LinkExtractor(deny_extensions=config['webcrawler_deny_extensions']), callback='parse_item'),\n        )\n        process.crawl(OpenSemanticETL_Spider, start_urls=start_urls,\n                      allowed_domains=[allowed_domain], rules=rules, name=name)\n\n    # the start URL itselves shall be indexed, too, so add task to index the downloaded file/page by ETL web in Celery task worker\n    index_web.apply_async(kwargs={'uri': uri}, queue='open_semantic_etl_tasks', priority=5)\n\n    process.start()  # the script will block here until the crawling is finished\n\n\nif __name__ == \"__main__\":\n\n    # get uri or filename from args\n\n    from optparse 
import OptionParser\n\n    parser = OptionParser(\"etl-web-crawl URL\")\n\n    (options, args) = parser.parse_args()\n\n    if len(args) != 1:\n        parser.error(\"No URL(s) given\")\n\n    for uri in args:\n        index(uri)\n"
  },
  {
    "path": "src/opensemanticetl/export_elasticsearch.py",
    "content": "from elasticsearch import Elasticsearch\n\n\n# Connect to Elastic Search\n\nclass export_elasticsearch(object):\n\n    def __init__(self, config=None):\n        if config is None:\n            config = {}\n\n        self.config = config\n\n        if not 'index' in self.config:\n            self.config['index'] = 'opensemanticsearch'\n\n        if not 'verbose' in self.config:\n            self.config['verbose'] = False\n\n    #\n    # Write data to Elastic Search\n    #\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        self.config = parameters\n\n        # post data\n        self.update(parameters=parameters, data=data)\n\n        return parameters, data\n\n    # send the updated field data to Elastic Search\n    def update(self, docid=None, data=None, parameters=None):\n        if data is None:\n            data = {}\n        if parameters is None:\n            parameters = {}\n\n        if docid:\n            parameters['id'] = docid\n        else:\n            docid = parameters['id']\n\n        es = Elasticsearch()\n        result = es.index(\n            index=self.config['index'], doc_type='document', id=docid, body=data)\n\n        return result\n\n    # get last modified date for document\n    def get_lastmodified(self, docid, parameters=None):\n        if parameters is None:\n            parameters = {}\n\n        es = Elasticsearch()\n\n        doc_exists = es.exists(\n            index=self.config['index'], doc_type=\"document\", id=docid)\n\n        # if doc with id exists in index, read modification date\n        if doc_exists:\n            doc = es.get(index=self.config['index'], doc_type=\"document\",\n                         id=docid, _source=False, fields=\"file_modified_dt\")\n            last_modified = doc['fields']['file_modified_dt'][0]\n        else:\n            last_modified = None\n\n        return last_modified\n\n    # commits are managed by Elastic Search setup, so no explicit commit here\n    def commit(self):\n        return\n"
  },
  {
    "path": "src/opensemanticetl/export_json.py",
    "content": "import json\n\n\nclass export_json(object):\n\n    def __init__(self, config=None):\n        if config is None:\n            config = {'verbose': False}\n\n        self.config = config\n\n    #\n    # Json data\n    #\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        # if outputfile write json to file\n        if 'outputfile' in parameters:\n\n            import io\n            with io.open(parameters['outputfile'], 'w', encoding='utf-8') as f:\n                f.write(json.dumps(data, ensure_ascii=False))\n        else:  # else print json\n            print(json.dumps(data))\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/export_neo4j.py",
    "content": "import os\n\nfrom py2neo import Graph, Node, Relationship\n\n#\n# Export entities and connections to neo4j\n#\n\n\nclass export_neo4j(object):\n\n    def __init__(self, config=None):\n        if config is None:\n            config = {'verbose': False}\n        self.config = config\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        if 'verbose' in parameters:\n            self.config['verbose'] = parameters['verbose']\n\n        # for this facets, do not add additional entity to connect with, but write to properties of the entity\n        properties = ['content_type_ss',\n                      'content_type_group_ss', 'language_ss', 'language_s']\n\n        host = 'localhost'\n        if 'neo4j_host' in parameters:\n            host = parameters['neo4j_host']\n        if os.getenv('OPEN_SEMANTIC_ETL_NEO4J_HOST'):\n            host = os.getenv('OPEN_SEMANTIC_ETL_NEO4J_HOST')\n\n        user = 'neo4j'\n        if 'neo4j_user' in parameters:\n            user = parameters['neo4j_user']\n\n        password = 'neo4j'\n        if 'neo4j_password' in parameters:\n            password = parameters['neo4j_password']\n\n        neo4j_auth = os.getenv('NEO4J_AUTH', '')\n        if '/' in neo4j_auth:\n            user, _, password = neo4j_auth.partition('/')\n\n        graph = Graph(host=host, user=user, password=password)\n\n        document_node = Node('Document', name=parameters['id'])\n\n        if 'title' in data:\n            document_node['title'] = data['title']\n\n        # add properties from facets\n        for entity_class in parameters['facets']:\n\n            if entity_class in data:\n\n                entity_class_label = parameters['facets'][entity_class]['label']\n\n                if entity_class in properties:\n\n                    document_node[entity_class_label] = data[entity_class]\n\n        graph.merge(document_node, 'Document', 'name')\n\n        # add / connect linked entities from facets\n\n        for entity_class in parameters['facets']:\n\n            if entity_class in data:\n\n                entity_class_label = entity_class\n                if parameters['facets'][entity_class]['label']:\n                    entity_class_label = parameters['facets'][entity_class]['label']\n\n                if not entity_class in properties:\n\n                    relationship_label = entity_class_label\n\n                    if entity_class in ['person_ss', 'organization_ss', 'location_ss']:\n                        relationship_label = \"Named Entity Recognition\"\n\n                    # convert to array, if single entity / not multivalued field\n                    if isinstance(data[entity_class], list):\n                        entities = data[entity_class]\n                    else:\n                        entities = [data[entity_class]]\n\n                    for entity in entities:\n\n                        if self.config['verbose']:\n                            print(\"Export to Neo4j: Merging entity {} of class {}\".format(\n                                entity, entity_class_label))\n\n                        # if not yet there, add the entity to graph\n                        entity_node = Node(entity_class_label, name=entity)\n                        graph.merge(entity_node, entity_class_label, 'name')\n\n                        # if not yet there, add relationship to graph\n                        relationship = Relationship(\n 
                           document_node, relationship_label, entity_node)\n                        graph.merge(relationship)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/export_print.py",
    "content": "import pprint\n\n\nclass export_print(object):\n\n    def __init__(self, config=None):\n        if config is None:\n            config = {'verbose': False}\n\n        self.config = config\n\n    #\n    # Print data\n    #\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        pprint.pprint(data)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/export_queue_files.py",
    "content": "#\n# Write filename to Celery queue for batching and parallel processing\n#\n\nfrom tasks import index_file\n\n\nclass export_queue_files(object):\n\n    def __init__(self, config=None):\n        if config is None:\n            config = {'verbose': False}\n        self.config = config\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        # add file to ETL queue with standard prioritization\n        # but don't if only plugins not ran that should run later (which will be added to queue in step below)\n        if not 'only_additional_plugins_later' in parameters:\n            index_file.apply_async(\n                kwargs={'filename': parameters['filename']}, queue='open_semantic_etl_tasks', priority=5)\n\n        # add file to (lower prioritized) ETL queue with additional plugins or options which should run later after all files tasks of standard prioritized queue done\n        # to run ETL of the file later again with additional plugins like OCR which need much time/resources while meantime all files are searchable by other plugins which need fewer resources\n        if 'additional_plugins_later' in parameters or 'additional_plugins_later_config' in parameters:\n\n            additional_plugins_later = parameters.get('additional_plugins_later', [])\n\n            additional_plugins_later_config = parameters.get('additional_plugins_later_config', {})\n\n            if len(additional_plugins_later) > 0 or len(additional_plugins_later_config) > 0:\n\n                index_file.apply_async(kwargs={\n                                       'filename': parameters['filename'], 'additional_plugins': additional_plugins_later, 'config': additional_plugins_later_config}, queue='open_semantic_etl_tasks', priority=1)\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/export_solr.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport os\nimport json\nimport requests\nimport sys\nimport time\n\nimport urllib.request\nimport urllib.parse\n\n# Export data to Solr\n\n\nclass export_solr(object):\n\n    def __init__(self, config=None):\n        if config is None:\n            config = {}\n\n        self.config = config\n\n        if os.getenv('OPEN_SEMANTIC_ETL_SOLR'):\n            self.config['solr'] = os.getenv('OPEN_SEMANTIC_ETL_SOLR')\n\n        if not 'solr' in self.config:\n            self.config['solr'] = 'http://localhost:8983/solr/'\n\n        if not 'index' in self.config:\n            self.config['index'] = 'opensemanticsearch'\n\n        self.solr = self.config['solr']\n        self.core = self.config['index']\n\n        if not 'verbose' in self.config:\n            self.config['verbose'] = False\n\n        self.verbose = self.config['verbose']\n\n    #\n    # Write data to Solr\n    #\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        # if not there, set config defaults\n        if 'verbose' in parameters:\n            self.verbose = parameters['verbose']\n\n        if self.verbose:            \n            print ('Starting Exporter: Solr')\n\n        if 'solr' in parameters:\n            self.solr = parameters['solr']\n            if not self.solr.endswith('/'):\n                self.solr += '/'\n\n        if 'index' in parameters:\n            self.core = parameters['index']\n\n        add = parameters.get('add', False)\n\n        fields_set = parameters.get('fields_set', [])\n\n        commit = parameters.get('commit', None)\n\n        if not 'id' in data:\n            data['id'] = parameters['id']\n\n        # post data to Solr\n        do_export = True\n\n        # but do not post, if only id (which will contain no add or set commands for fields and will be seen as overwrite for whole document)\n        if len(data) < 2:\n            if self.verbose:\n                print('Not exported to Solr because no data or only the ID.')\n            do_export = False\n\n        # and do not post, if yet posted before (exporter not exporter, but in plugin queue, f.e. 
in multi stage processing before adding the task to the queue)\n        if 'etl_export_solr_b' in data:\n            # already exported in this ETL run (the exporter was run before as a plugin), so do not post again\n            do_export = False\n            if self.verbose:\n                print('Not exported to Solr because it was already exported in this ETL run (exporter was run as a plugin).')\n\n        if do_export:\n            self.update(data=data, add=add, fields_set=fields_set, commit=commit)\n\n\n        return parameters, data\n\n    # update document in index: set fields in data to new values or add new/additional values\n    # if there is no document yet, it will be added\n    def update(self, data, add=False, fields_set=(), commit=None):\n\n        update_fields = {}\n\n        for fieldname in data:\n            if fieldname == 'id':\n                update_fields['id'] = data['id']\n            else:\n                update_fields[fieldname] = {}\n\n                if add and not fieldname in fields_set:\n                    # add value to the existing values of the field\n                    update_fields[fieldname]['add-distinct'] = data[fieldname]\n                else:\n                    # if the document exists with values for these fields, the existing values will be overwritten with the new values\n                    update_fields[fieldname]['set'] = data[fieldname]\n\n        self.post(data=update_fields, commit=commit)\n\n    def post(self, data=None, docid=None, commit=None):\n        if data is None:\n            data = {}\n\n        solr_uri = self.solr + self.core + '/update'\n\n        if docid:\n            data['id'] = docid\n\n        datajson = '[' + json.dumps(data) + ']'\n\n        params = {}\n\n        if commit:\n            params['commit'] = 'true'\n\n        if self.verbose:\n            print(\"Sending update request to {}\".format(solr_uri))\n            print(datajson)\n\n\n        retries = 0\n        retrytime = 1\n        # wait time until the next retry is doubled up to a maximum of 120 seconds (2 minutes)\n        retrytime_max = 120\n        no_connection = True\n\n        while no_connection:\n            try:\n                if retries > 0:\n                    print('Will retry to connect to Solr in {} second(s).'.format(retrytime))\n                    time.sleep(retrytime)\n                    retrytime = retrytime * 2\n                    if retrytime > retrytime_max:\n                        retrytime = retrytime_max\n\n                r = requests.post(solr_uri, data=datajson, params=params, headers={'Content-Type': 'application/json'})\n\n                # if bad status code, raise exception\n                r.raise_for_status()\n\n                if retries > 0:\n                    print('Successfully reconnected to Solr.')\n\n                no_connection = False\n\n            except KeyboardInterrupt:\n                raise KeyboardInterrupt\n\n            except requests.exceptions.ConnectionError as e:\n\n                retries += 1\n\n                sys.stderr.write(\"Connection to Solr failed (will retry in {} seconds). Exception: {}\\n\".format(retrytime, e))\n\n            except requests.exceptions.HTTPError as e:\n                if e.response.status_code == 503:\n\n                    retries += 1\n\n                    sys.stderr.write(\"Solr temporarily unavailable (HTTP status code 503). Will retry in {} seconds. 
Exception: {}\\n\".format(retrytime, e))\n\n                else:\n                    no_connection = False\n\n                    sys.stderr.write('Error while posting data to Solr: {}'.format(e))\n\n                    raise(e)\n\n            except BaseException as e:\n                no_connection = False\n\n                sys.stderr.write('Error while posting data to Solr: {}'.format(e))\n\n                raise(e)\n\n\n    # tag a document by adding new value to field\n    def tag(self, docid=None, field=None, value=None, data=None):\n        if data is None:\n            data = {}\n\n        data_merged = data.copy()\n\n        if docid:\n            data_merged['id'] = docid\n\n        if field:\n            if field in data_merged:\n                # if not list, convert to list\n                if not isinstance(data_merged[field], list):\n                    data_merged[field] = [data_merged[field]]\n                # add value to field\n                data_merged[field].append(value)\n            else:\n                data_merged[field] = value\n\n        result = self.update(data=data_merged, add=True)\n\n        return result\n\n    # search for documents with query and without tag and update them with the tag\n    def update_by_query(self, query, field=None, value=None, data=None, queryparameters=None):\n        if data is None:\n            data = {}\n\n        import pysolr\n\n        count = 0\n\n        solr = pysolr.Solr(self.solr + self.core)\n\n        #\n        # extend query: do not return documents, that are tagged\n        #\n\n        query_marked_before = ''\n\n        if field:\n            query_marked_before = field + ':\"' + solr_mask(value) + '\"'\n\n        # else extract field and value from data to build query of yet tagged docs to exclude\n\n        for fieldname in data:\n\n            if isinstance(data[fieldname], list):\n\n                for value in data[fieldname]:\n\n                    if query_marked_before:\n                        query_marked_before += \" AND \"\n\n                    query_marked_before += fieldname + \\\n                        ':\"' + solr_mask(value) + '\"'\n            else:\n\n                value = data[fieldname]\n                if query_marked_before:\n                    query_marked_before += \" AND \"\n\n                query_marked_before += fieldname + \\\n                    ':\"' + solr_mask(value) + '\"'\n\n        solrparameters = {\n            'fl': 'id',\n            'defType': 'edismax',\n            'rows': 10000000,\n        }\n\n        # add custom Solr parameters (if the same parameter, overwriting the obove defaults)\n        if queryparameters:\n            solrparameters.update(queryparameters)\n\n        if query_marked_before:\n            # don't extend query but use filterquery for more performance (cache) on aliases\n            solrparameters[\"fq\"] = 'NOT (' + query_marked_before + ')'\n\n        if self.verbose:\n            print(\"Solr query:\")\n            print(query)\n            print(\"Solr parameters:\")\n            print(solrparameters)\n\n        results = solr.search(query, **solrparameters)\n\n        for result in results:\n            docid = result['id']\n\n            if self.verbose:\n                print(\"Tagging {}\".format(docid))\n\n            self.tag(docid=docid, field=field, value=value, data=data)\n\n            count += 1\n\n        return count\n\n    def get_data(self, docid, fields):\n\n        uri = self.solr + self.core + '/get?id=' + \\\n     
       urllib.parse.quote(docid) + '&fl=' + ','.join(fields)\n\n        request = urllib.request.urlopen(uri)\n        encoding = request.info().get_content_charset('utf-8')\n        data = request.read()\n        request.close()\n\n        solr_doc = json.loads(data.decode(encoding))\n\n        data = None\n        if 'doc' in solr_doc:\n            data = solr_doc['doc']\n\n        return data\n\n    def commit(self):\n\n        uri = self.solr + self.core + '/update?commit=true'\n        if self.verbose:\n            print(\"Committing to {}\".format(uri))\n        request = urllib.request.urlopen(uri)\n        request.close()\n\n    def get_lastmodified(self, docid):\n        # convert mtime to solr format\n        solr_doc_mtime = None\n\n        solr_doc = self.get_data(docid=docid, fields=[\"file_modified_dt\"])\n\n        if solr_doc:\n            if 'file_modified_dt' in solr_doc:\n                solr_doc_mtime = solr_doc['file_modified_dt']\n\n            # todo: for each plugin\n#\t\t\tsolr_meta_mtime = False\n#\t\t\tif 'meta_modified_dt' in solr_doc['doc']:\n#\t\t\t\tsolr_meta_mtime = solr_doc['doc']['meta_modified_dt']\n\n        return solr_doc_mtime\n\n    def delete(self, parameters, docid=None, query=None,):\n        import pysolr\n\n        if 'solr' in parameters:\n            self.solr = parameters['solr']\n            if not self.solr.endswith('/'):\n                self.solr += '/'\n\n        if 'index' in parameters:\n            self.core = parameters['index']\n\n        solr = pysolr.Solr(self.solr + self.core)\n\n        if docid:\n            result = solr.delete(id=docid)\n\n        if query:\n            result = solr.delete(q=query)\n\n        return result\n\n    #\n    # append synonyms by Solr REST API for managed resources\n    #\n\n    def append_synonyms(self, resourceid, synonyms):\n\n        url = self.solr + self.core + '/schema/analysis/synonyms/' + resourceid\n        headers = {'content-type': 'application/json'}\n\n        r = requests.post(url=url, data=json.dumps(synonyms), headers=headers)\n\n\ndef solr_mask(string_to_mask, solr_specialchars='\\+-&|!(){}[]^\"~*?:/'):\n\n    masked = string_to_mask\n    # mask every special char with leading \\\n    for char in solr_specialchars:\n        masked = masked.replace(char, \"\\\\\" + char)\n\n    return masked\n"
  },
  {
    "path": "src/opensemanticetl/filter_blacklist.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\n\n\ndef is_in_lists(listfiles, value, match=None):\n\n    result = False\n\n    for listfile in listfiles:\n\n        try:\n            if is_in_list(filename=listfile, value=value, match=match):\n                result = True\n                break\n\n        except BaseException as e:\n            print(\"Exception while checking blacklist {}:\".format(listfile))\n            print(e.args[0])\n\n    return result\n\n\n#\n# is a value in a textfile with a list\n#\ndef is_in_list(filename, value, match=None):\n\n    result = False\n    listfile = open(filename)\n\n    # search all the lines\n    for line in listfile:\n        line = line.strip()\n\n        # ignore empty lines and comment lines (starting with #)\n        if line and not line.startswith(\"#\"):\n\n            if match == 'prefix':\n                if value.startswith(line):\n                    result = True\n            elif match == 'suffix':\n                if value.endswith(line):\n                    result = True\n            elif match == 'regex':\n                if re.search(line, value):\n                    result = True\n\n            else:\n                if line == value:\n                    result = True\n\n            if result:\n\n                # we dont have to check rest of list\n                break\n\n    listfile.close()\n\n    return result\n\n\n#\n# add to configured facet, if entry in list is in text\n#\n\nclass filter_blacklist(object):\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        blacklisted = False\n\n        verbose = False\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                verbose = True\n\n        uri = parameters['id']\n\n        # if blacklist type configurated in parameters, check this blacklists for URI\n\n        if 'blacklist_prefix' in parameters:\n\n            if is_in_lists(listfiles=parameters['blacklist_prefix'], value=uri, match=\"prefix\"):\n                blacklisted = True\n\n        if not blacklisted and 'blacklist_suffix' in parameters:\n\n            if is_in_lists(listfiles=parameters['blacklist_suffix'], value=uri, match=\"suffix\"):\n                blacklisted = True\n\n        if not blacklisted and 'blacklist_regex' in parameters:\n\n            if is_in_lists(listfiles=parameters['blacklist_regex'], value=uri, match=\"regex\"):\n                blacklisted = True\n\n        if not blacklisted and 'blacklist' in parameters:\n\n            if is_in_lists(listfiles=parameters['blacklist'], value=uri):\n                blacklisted = True\n\n        # check whitelists for URI, if blacklisted\n\n        if blacklisted and 'whitelist_prefix' in parameters:\n            if is_in_lists(listfiles=parameters['whitelist_prefix'], value=uri, match=\"prefix\"):\n                blacklisted = False\n\n        if blacklisted and 'whitelist_suffix' in parameters:\n            if is_in_lists(listfiles=parameters['whitelist_suffix'], value=uri, match=\"suffix\"):\n                blacklisted = False\n\n        if blacklisted and 'whitelist_regex' in parameters:\n            if is_in_lists(listfiles=parameters['whitelist_regex'], value=uri, match=\"regex\"):\n                blacklisted = False\n\n        if blacklisted and 'whitelist' in parameters:\n            if is_in_lists(listfiles=parameters['whitelist'], value=uri):\n               
 blacklisted = False\n\n        # if blacklisted and not matched by a whitelist, set the parameter 'break' so there is no further processing\n        if blacklisted:\n            parameters['break'] = True\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/filter_file_not_modified.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport os\nimport datetime\nimport sys\nimport importlib\n\n#\n# do not index (set parameters[\"break\"] = True), if yet former ETL with all configured plugins / yet indexed\n#\n\n\nclass filter_file_not_modified(object):\n\n    def __init__(self):\n\n        self.verbose = False\n        self.quiet = False\n\n        # if a critical plugin failed in former ETL / indexed document, reindex file to retry\n        self.force_reindex_if_former_etl_plugin_errors = [\n            'enhance_extract_text_tika_server']\n\n    def process(self, parameters=None, data=None):\n        if parameters is None:\n            parameters = {}\n        if data is None:\n            data = {}\n\n        if 'verbose' in parameters:\n            if parameters['verbose']:\n                self.verbose = True\n\n        if 'quiet' in parameters:\n            self.quiet = parameters['quiet']\n\n        filename = parameters['filename']\n\n        force = False\n\n        if 'force' in parameters:\n            force = parameters['force']\n\n        # check if file size and file date are the same in DB\n        # if exist delete protocol prefix file://\n        if filename.startswith(\"file://\"):\n            filename = filename.replace(\"file://\", '', 1)\n\n        # if relative path change to absolute path\n        filename = os.path.abspath(filename)\n\n        # get modification time from file\n        file_mtime = os.path.getmtime(filename)\n\n        # get id\n        docid = parameters['id']\n\n        export = False\n        indexed_doc_mtime = None\n        plugins_failed = []\n        critical_plugins_failed = []\n        plugins_runned = []\n        plugins_not_runned = []\n        additional_plugins_later_not_runned = []\n        do_not_reindex_because_plugin_yet_not_processed = []\n\n        # use abstracted function from exporter module to get last modification time of file in index\n        if 'export' in parameters:\n            export = parameters['export']\n            module = importlib.import_module(export)\n            objectreference = getattr(module, export)\n            exporter = objectreference(parameters)\n\n            # get modtime and ETL errors from document saved in index\n            metadatafields = ['file_modified_dt', 'etl_error_plugins_ss']\n\n            # get plugin status fields\n            for configured_plugin in parameters['plugins']:\n                if not configured_plugin == 'export_queue_files' and not configured_plugin == parameters['export']:\n                    metadatafields.append('etl_' + configured_plugin + '_b')\n            if 'additional_plugins_later' in parameters:\n                for configured_plugin in parameters['additional_plugins_later']:\n                    metadatafields.append('etl_' + configured_plugin + '_b')\n\n            # get config option status field for OCR\n            if 'ocr' in parameters:\n                if parameters['ocr']:\n                    metadatafields.append(\n                        'etl_enhance_extract_text_tika_server_ocr_enabled_b')\n            if 'additional_plugins_later_config' in parameters:\n                if 'ocr' in parameters['additional_plugins_later_config']:\n                    if parameters['additional_plugins_later_config']['ocr']:\n                        metadatafields.append(\n                            'etl_enhance_extract_text_tika_server_ocr_enabled_b')\n\n            if 'do_not_reindex_because_plugin_yet_not_processed' in 
parameters:\n                do_not_reindex_because_plugin_yet_not_processed=parameters['do_not_reindex_because_plugin_yet_not_processed']\n\n            # read yet indexed metadata, if there\n            indexed_metadata = exporter.get_data(\n                docid=docid, fields=metadatafields)\n\n            if indexed_metadata:\n                if 'file_modified_dt' in indexed_metadata:\n                    indexed_doc_mtime = indexed_metadata['file_modified_dt']\n                if 'etl_error_plugins_ss' in indexed_metadata:\n                    plugins_failed = indexed_metadata['etl_error_plugins_ss']\n\n        # mask file_mtime for comparison in same format than in Lucene index\n        file_mtime_masked = datetime.datetime.fromtimestamp(\n            file_mtime).strftime(\"%Y-%m-%dT%H:%M:%SZ\")\n\n        # Is it a new file (not indexed, so the initial None different to filemtime)\n        # or modified (also doc_mtime <> file_mtime of file)?\n\n        if indexed_doc_mtime == file_mtime_masked:\n\n            # Doc was found in index and field moddate of solr doc same as files mtime\n            # so file was indexed before and is unchanged\n\n            # all now configured plugins processed in former ETL/their analysis is in index?\n            for configured_plugin in parameters['plugins']:\n                if not configured_plugin == 'export_queue_files' and not configured_plugin == parameters['export']:\n                    plugin_runned = indexed_metadata.get('etl_' + configured_plugin + '_b', False)\n                    if plugin_runned:\n                        plugins_runned.append(configured_plugin)\n                    else:\n                        if not configured_plugin in do_not_reindex_because_plugin_yet_not_processed:\n                            plugins_not_runned.append(configured_plugin)\n\n            if 'additional_plugins_later' in parameters:\n                for configured_plugin in parameters['additional_plugins_later']:\n                    plugin_runned = indexed_metadata.get('etl_' + configured_plugin + '_b', False)\n                    if plugin_runned:\n                        plugins_runned.append(configured_plugin)\n                    else:\n                        if not configured_plugin in do_not_reindex_because_plugin_yet_not_processed:\n                            additional_plugins_later_not_runned.append(configured_plugin)\n\n            # Tika OCR was enabled in former ETL/their analysis is in index?\n            if 'ocr' in parameters:\n                if parameters['ocr']:\n                    plugin_runned = indexed_metadata.get('etl_enhance_extract_text_tika_server_ocr_enabled_b', False)\n                    if plugin_runned:\n                        plugins_runned.append('enhance_extract_text_tika_server_ocr_enabled')\n                    else:\n                        if not configured_plugin in do_not_reindex_because_plugin_yet_not_processed:\n                            plugins_not_runned.append('enhance_extract_text_tika_server_ocr_enabled')\n\n            if 'additional_plugins_later_config' in parameters:\n                if 'ocr' in parameters['additional_plugins_later_config']:\n                    if parameters['additional_plugins_later_config']['ocr']:\n                        plugin_runned = indexed_metadata.get('etl_enhance_extract_text_tika_server_ocr_enabled_b', False)\n                        if plugin_runned:\n                            plugins_runned.append('enhance_extract_text_tika_server_ocr_enabled')\n           
             else:\n                            if not configured_plugin in do_not_reindex_because_plugin_yet_not_processed:\n                                additional_plugins_later_not_runned.append('enhance_extract_text_tika_server_ocr_enabled')\n\n            for critical_plugin in self.force_reindex_if_former_etl_plugin_errors:\n                if critical_plugin in plugins_failed:\n                    critical_plugins_failed.append(critical_plugin)\n\n            if len(plugins_not_runned) > 0 or len(additional_plugins_later_not_runned) > 0:\n\n                doindex = True\n\n                # print status\n                if self.verbose or self.quiet == False:\n                    try:\n                        print('Repeating indexing of unchanged file because (additional configured) plugin(s) or options {} not ran yet: {}'.format(\n                            plugins_not_runned + additional_plugins_later_not_runned, filename))\n                    except:\n                        sys.stderr.write(\n                            \"Repeating indexing of unchanged file because former fail of critical plugin, but exception while printing message (problem with encoding of filename or console? Is console set to old ASCII standard instead of UTF-8?)\")\n\n                if len(plugins_not_runned) == 0:\n                    parameters['only_additional_plugins_later'] = True\n\n            # a critical plugin failed in former ETL\n            elif len(critical_plugins_failed) > 0:\n\n                doindex = True\n\n                # print status\n                if self.verbose or self.quiet == False:\n                    try:\n                        print('Repeating indexing of unchanged file because critical plugin(s) {} failed in former run: {}'.format(\n                            critical_plugins_failed, filename))\n                    except:\n                        sys.stderr.write(\n                            \"Repeating indexing of unchanged file because critical plugin(s) failed in former run, but exception while printing message (problem with encoding of filename or console? Is console set to old ASCII standard instead of UTF-8?)\")\n\n            # If force option, do further processing even if unchanged\n            elif force:\n\n                doindex = True\n\n                # print status\n                if self.verbose or self.quiet == False:\n                    try:\n                        print(\n                            'Forced indexing of unchanged file: {}'.format(filename))\n                    except:\n                        sys.stderr.write(\n                            \"Forced indexing of unchanged file but exception while printing message (problem with encoding of filename or console? 
Is console set to old ASCII standard instead of UTF-8?)\")\n\n            else:\n\n                doindex = False\n\n                # print status\n                if self.verbose:\n                    try:\n                        print(\"Not indexing unchanged file: {}\".format(filename))\n                    except:\n                        sys.stderr.write(\n                            \"Not indexing unchanged file but exception while printing message (problem with encoding of filename or console?)\")\n\n        else:  # doc not found in index or other/old modification time in index\n\n            doindex = True\n\n            # print status, if new document\n            if self.verbose or self.quiet == False:\n\n                if indexed_doc_mtime == None:\n                    try:\n                        print(\"Indexing new file: {}\".format(filename))\n                    except:\n                        sys.stderr.write(\n                            \"Indexing new file but exception while printing message (problem with encoding of filename or console?)\")\n                else:\n                    try:\n                        print('Indexing modified file: {}'.format(filename))\n                    except:\n                        sys.stderr.write(\n                            \"Indexing modified file. Exception while printing filename (problem with encoding of filename or console?)\\n\")\n\n        # if not modified and no critical ETL errors, stop ETL process, because all done on last run\n        if not doindex:\n            parameters['break'] = True\n        else:\n            # reset plugin status of plugins of next stage\n            # so reprocessing of updated data works by tasks in later stages,\n            # which else would have plugin status processed\n            # from first/last processing of old version of content\n\n            commit = False\n            if len(plugins_runned) > 0:\n                \n                for runned_plugin in plugins_runned:\n                    if not runned_plugin in [parameters['export'], 'enhance_mapping_id', 'filter_blacklist', 'filter_file_not_modified']:\n                        data['etl_' + runned_plugin + '_b'] = False\n                        commit = True\n\n            # immediately commit (else Solr autocommit after some time) of etl status reset(s) in exporter before adding new ETL tasks which need the status for plugin filter_file_not_modified\n            if commit:\n                parameters['commit'] = True\n\n        return parameters, data\n"
  },
  {
    "path": "src/opensemanticetl/move_indexed_file.py",
    "content": "#!/usr/bin/env python3\n\nimport urllib.request\nimport urllib.parse\nimport json\nfrom itertools import starmap\n\n\ndef move_files(host: str, moves: dict, prefix=\"\"):\n    \"\"\"Moves files within the index (not physically).\n\n    Example of usage:\n    host = \"http://solr:8983/solr/opensemanticsearch/\"\n    move_files(host, {\"/b2\": \"/book2\",\n                      \"/b1\": \"/folder/book1\"}, prefix=\"file://\")\n\n\n    :host: Url to the solr instance\n    :moves: A dict of the form {src: dest, ...}, where src is\n            the source path and dest is the destination path.\n    \"\"\"\n    src = moves.keys()\n    indexed_data = get_files(host, map(append_prefix(prefix), src))\n    # In the following we have to remap the destination path to\n    # the individual metadata entries, since the ordering of\n    # the query result and query may differ:\n    moved_data = starmap(change_path(prefix),\n                         zip(indexed_data,\n                             map(dict_map(moves),\n                                 map(extract_path,\n                                     indexed_data))))\n    request_payload = prepare_payload(\n        moved_data, (d[\"id\"] for d in indexed_data))\n    post(host, request_payload)\n\n\ndef move_dir(host: str, src: str, dest: str, prefix=\"\"):\n    \"\"\"Moves directories within the index (not physically).\n\n    Example of usage:\n    host = \"http://solr:8983/solr/opensemanticsearch/\"\n    move_dir(host, src=/docs/a/, dest=/docs/b, prefix=\"file://\")\n\n\n    :host: Url to the solr instance\n    :src: Source directory\n    :dest: Destination directory\n    \"\"\"\n    indexed_data = get_files_in_dir(host, src)\n    moved_data = map(change_dir(prefix, src=src, dest=dest),\n                     indexed_data)\n    request_payload = prepare_payload(\n        moved_data, (d[\"id\"] for d in indexed_data))\n    post(host, request_payload)\n\n\ndef change_path(prefix: str):\n    \"\"\"Returns a mapping function to be used with starmap\n    \"\"\"\n    def change(data: dict, dest: str) -> dict:\n        \"\"\"Creates a modified version of data\n\n        :data: The indexed metadata of the moved file\n        :dest: The destination path\n        \"\"\"\n        dest_components = dest.strip(\"/\").split(\"/\")\n        return _change_path(data, dest_components, prefix=prefix)\n    return change\n\n\ndef change_dir(prefix: str, src: str, dest: str):\n    \"\"\"Returns a mapping function to be used with map\n    \"\"\"\n    dest_components = dest.strip(\"/\").split(\"/\")\n    src_path_components = src.strip(\"/\").split(\"/\")\n\n    def change(data: dict) -> dict:\n        \"\"\"Creates a modified version of data\n\n        :data: The indexed metadata of the moved file\n        :dest: The destination path\n        \"\"\"\n        indexed_components = extract_path_components(data)\n        # Attention: zip consumes the generator up to the number\n        # of elements in indexed_components. 
If you switch the two\n        # arguments of zip, an additional element will be consumed\n        # from indexed_components, as zip will perform a next on\n        # its first argument to see if the iterable is exhausted.\n        for src_component, idx_component in zip(src_path_components,\n                                                indexed_components):\n            if idx_component != src_component:\n                raise ValueError(\n                    \"Path component of the indexed file and the source path differ: '\"\n                    + idx_component + \"' <-> '\" + src_component + \"'\")\n        return _change_path(data, dest_components + list(indexed_components),\n                            prefix=prefix)\n    return change\n\n\ndef _change_path(data: dict, dest_components: tuple, prefix: str = \"\") -> dict:\n    \"\"\"Creates a modified version of data\n\n    :data: The indexed metadata of the moved file\n    :dest_components: The destination path split into components\n    \"\"\"\n    moved_data = data.copy()\n    del moved_data[\"_version_\"]\n    moved_data[\"id\"] = prefix + \"/\" + \"/\".join(dest_components)\n    *dest_dir_components, base_name = dest_components\n    moved_data.update({\"path{}_s\".format(i): component\n                       for i, component in enumerate(dest_dir_components)})\n    moved_data[\"path_basename_s\"] = base_name\n    n = len(dest_dir_components)\n    while True:\n        if moved_data.pop(\"path{}_s\".format(n), None) is None:\n            break\n        n += 1\n    return moved_data\n\n\ndef prepare_payload(adds, delete_ids):\n    \"\"\"Takes metadata to be added to the index and ids to be deleted\n    from the index. Creates the corresponding Solr JSON request payload\n    \"\"\"\n    payload = {DuplicateKey(\"add\"): {\"doc\": doc} for doc in adds}\n    payload[\"delete\"] = [\n        {\"id\": id_} for id_ in delete_ids]\n    return payload\n\n\nclass DuplicateKey(str):\n    \"\"\"Allows dicts having multiple identical keys\"\"\"\n\n    def __hash__(self):\n        return id(self)\n\n\ndef extract_path(data: dict) -> str:\n    \"\"\"Extracts the path of the metadata\n    \"\"\"\n    return \"/\" + \"/\".join(extract_path_components(data))\n\n\ndef extract_path_components(data: dict):\n    \"\"\"Extracts the path of the metadata in the form of a\n    generator yielding the components of the path\n    \"\"\"\n    i = 0\n    while True:\n        component = data.get(\"path{}_s\".format(i))\n        if component is None:\n            break\n        yield component\n        i += 1\n    yield data[\"path_basename_s\"]\n\n\ndef dict_map(mapping: dict):\n    \"\"\"Converts a dict into a function (for usage with map)\"\"\"\n    def _map(s):\n        return mapping[s]\n    return _map\n\n\ndef append_prefix(prefix: str):\n    \"\"\"A mapping function to be used with :map:\"\"\"\n    def append(s: str):\n        return prefix + s\n    return append\n\n\ndef get_files(host: str, ids: list) -> list:\n    \"\"\"Queries Solr, searches for files whose id is in :ids:\"\"\"\n    return get(host,\n               \"(\" + \", \".join(\n                   map('id:\"{}\"'.format, ids)) + \")\"\n               )\n\n\ndef get_files_in_dir(host: str, path: str) -> list:\n    \"\"\"Queries Solr, searches for files in the folder :path:\"\"\"\n    path_components = path.strip(\"/\").split(\"/\")\n    return get(host,\n               \" AND \".join(\n                   starmap(\n                       'path{}_s:\"{}\"'.format, enumerate(path_components)\n                   
)))\n\n\ndef get(host: str, query: str) -> list:\n    return sum(get_pages(host, query), [])\n\n\ndef get_pages(host: str, query: str, limit=50):\n    \"\"\"An iterator over the pages of a solr request response\"\"\"\n    start = 0\n    n_docs = limit\n    query_url_template = host + \"select?start={}&rows={}&q={}\".format(\n        \"{}\", limit, urllib.parse.quote(query))\n    while start < n_docs:\n        response = urllib.request.urlopen(\n            query_url_template.format(start))\n        data = json.loads(response.read().decode())[\"response\"]\n        n_docs = data[\"numFound\"]\n        start += limit\n        yield data[\"docs\"]\n\n\ndef post(host: str, data: dict):\n    request = urllib.request.Request(\n        host + \"update/json?commit=true\",\n        data=json.dumps(data).encode(),\n        headers={\"Content-Type\": \"application/json\"}\n    )\n    urllib.request.urlopen(request)\n"
  },
  {
    "path": "src/opensemanticetl/requirements.txt",
    "content": "celery\nfeedparser\nlxml\nnumerizer\npy2neo\npycurl\npyinotify\npysolr\npython-dateutil\nrequests\nrdflib\nscrapy\nSPARQLWrapper\ntika\ntwint\nwarcio\n"
  },
  {
    "path": "src/opensemanticetl/tasks.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\n#\n# Queue tasks for batch processing and parallel processing\n#\n\nimport os\nimport time\nfrom celery import Celery\nfrom kombu import Queue, Exchange\n\n# ETL connectors\nfrom etl import ETL\nfrom etl_delete import Delete\nfrom etl_file import Connector_File\nfrom etl_web import Connector_Web\nfrom etl_rss import Connector_RSS\n\n\nverbose = True\nquiet = False\n\nbroker = 'amqp://localhost'\nif os.getenv('OPEN_SEMANTIC_ETL_MQ_BROKER'):\n    broker = os.getenv('OPEN_SEMANTIC_ETL_MQ_BROKER')\n\napp = Celery('etl.tasks', broker=broker)\n\napp.conf.task_queues = [Queue('open_semantic_etl_tasks', Exchange(\n    'open_semantic_etl_tasks'), routing_key='open_semantic_etl_tasks', queue_arguments={'x-max-priority': 100})]\n\napp.conf.worker_max_tasks_per_child = 1\napp.conf.worker_prefetch_multiplier = 1\napp.conf.task_acks_late = True\n\n\n# Max parallel tasks (Default: Use as many parallel ETL tasks as CPUs available).\n# Warning: Some tools called by ETL plugins use multithreading, too,\n# so used CPUs/threads can be more than that setting!\n\nif os.getenv('OPEN_SEMANTIC_ETL_CONCURRENCY'):\n    app.conf.worker_concurrency = int(os.getenv('OPEN_SEMANTIC_ETL_CONCURRENCY'))\n\n\netl_delete = Delete()\netl_web = Connector_Web()\netl_rss = Connector_RSS()\n\n\n#\n# Delete document with URI from index\n#\n\n@app.task(name='etl.delete')\ndef delete(uri):\n    etl_delete.delete(uri=uri)\n\n\n#\n# Index a file\n#\n\n@app.task(name='etl.index_file')\ndef index_file(filename, additional_plugins=(), wait=0, commit=False, config=None):\n\n    if wait:\n        time.sleep(wait)\n\n    etl_file = Connector_File()\n\n    # set alternate config options (will overwrite config options from config file)\n    if config:\n        for option in config:\n            etl_file.config[option] = config[option]\n\n    etl_file.index_file(filename=filename,\n                        additional_plugins=additional_plugins)\n\n    if commit:\n        etl_file.commit()\n\n#\n# Index file directory\n#\n\n\n@app.task(name='etl.index_filedirectory')\ndef index_filedirectory(filename, config=None):\n\n    from etl_filedirectory import Connector_Filedirectory\n\n    etl_filedirectory = Connector_Filedirectory()\n\n    # set alternate config options (will overwrite config options from config file)\n    if config:\n        for option in config:\n            etl_filedirectory.config[option] = config[option]\n\n    result = etl_filedirectory.index(filename)\n    etl_filedirectory.commit()\n\n    return result\n\n\n#\n# Index a webpage\n#\n@app.task(name='etl.index_web')\ndef index_web(uri, wait=0, downloaded_file=False, downloaded_headers=None):\n\n    if wait:\n        time.sleep(wait)\n\n    result = etl_web.index(uri, downloaded_file=downloaded_file,\n                           downloaded_headers=downloaded_headers)\n\n    return result\n\n\n#\n# Index full website\n#\n\n@app.task(name='etl.index_web_crawl')\ndef index_web_crawl(uri, crawler_type=\"PATH\"):\n\n    import etl_web_crawl\n\n    etl_web_crawl.index(uri, crawler_type)\n\n\n#\n# Index webpages from sitemap\n#\n\n@app.task(name='etl.index_sitemap')\ndef index_sitemap(uri):\n\n    from etl_sitemap import Connector_Sitemap\n\n    connector_sitemap = Connector_Sitemap()\n\n    result = connector_sitemap.index(uri)\n\n    return result\n\n\n#\n# Index RSS Feed\n#\n\n@app.task(name='etl.index_rss')\ndef index_rss(uri):\n\n    result = etl_rss.index(uri)\n\n    return result\n\n\n#\n# Enrich with / run 
plugins\n#\n\n@app.task(name='etl.enrich')\ndef enrich(plugins, uri, wait=0):\n\n    if wait:\n        time.sleep(wait)\n\n    etl = ETL()\n    etl.read_configfile('/etc/opensemanticsearch/etl')\n    etl.read_configfile('/etc/opensemanticsearch/enhancer-rdf')\n\n    etl.config['plugins'] = plugins.split(',')\n\n    filename = uri\n\n    # if present, remove the protocol prefix file://\n    if filename.startswith(\"file://\"):\n        filename = filename.replace(\"file://\", '', 1)\n\n    parameters = etl.config.copy()\n\n    parameters['id'] = uri\n    parameters['filename'] = filename\n\n    parameters, data = etl.process(parameters=parameters, data={})\n\n    return data\n\n\n@app.task(name='etl.index_twitter_scraper')\ndef index_twitter_scraper(search=None, username=None, Profile_full=False, limit=None, Index_Linked_Webpages=False):\n\n    import opensemanticetl.etl_twitter_scraper\n\n    opensemanticetl.etl_twitter_scraper.index(username=username, search=search, limit=limit, Profile_full=Profile_full, Index_Linked_Webpages=Index_Linked_Webpages)\n\n\n\n#\n# Read command line arguments and start\n#\n\n# if running (not imported to use its functions), run main function\nif __name__ == \"__main__\":\n\n    from optparse import OptionParser\n\n    parser = OptionParser(\"etl-tasks [options]\")\n    parser.add_option(\"-q\", \"--quiet\", dest=\"quiet\", action=\"store_true\",\n                      default=False, help=\"Don't print status (filenames) while indexing\")\n    parser.add_option(\"-v\", \"--verbose\", dest=\"verbose\",\n                      action=\"store_true\", default=False, help=\"Print debug messages\")\n\n    (options, args) = parser.parse_args()\n\n    # pass the verbosity options to the preloaded connectors\n    verbose = options.verbose\n    etl_delete.verbose = options.verbose\n    etl_web.verbose = options.verbose\n    etl_rss.verbose = options.verbose\n\n    quiet = options.quiet\n\n    app.worker_main(['worker'])\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_detect_language_tika_server.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_detect_language_tika_server\n\nclass Test_enhance_detect_language_tika_server(unittest.TestCase):\n\n    def test(self):\n\n        enhancer = enhance_detect_language_tika_server.enhance_detect_language_tika_server()\n\n        # English\n        parameters, data = enhancer.process(data={'content_txt': 'This sentence is written in english language.'})\n        self.assertEqual(data['language_s'], 'en')\n\n        # German\n        parameters, data = enhancer.process(data={'content_txt': 'Dies ist ein Satz in der Sprache Deutsch.'})\n        self.assertEqual(data['language_s'], 'de')\n\n       \nif __name__ == '__main__':\n    unittest.main()\n\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_extract_email.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_extract_email\n\nclass Test_enhance_extract_email(unittest.TestCase):\n\n    def test(self):\n\n        enhancer = enhance_extract_email.enhance_extract_email()\n\n        data = {}\n        data['content_txt'] = \"one@localnet.localdomain at begin and two@localnet2.localdomain in the middle and end of the line three@localnet3.localdomain\\na_underscore@localnet.localdomain and some.points.here@localnet.localdomain\"\n\n        parameters, data = enhancer.process(data=data)\n\n        self.assertTrue('one@localnet.localdomain' in data['email_ss'])\n        self.assertTrue('two@localnet2.localdomain' in data['email_ss'])\n        self.assertTrue('three@localnet3.localdomain' in data['email_ss'])\n        self.assertTrue('a_underscore@localnet.localdomain' in data['email_ss'])\n        self.assertTrue('some.points.here@localnet.localdomain' in data['email_ss'])\n\n        self.assertTrue('localnet.localdomain' in data['email_domain_ss'])\n        self.assertTrue('localnet2.localdomain' in data['email_domain_ss'])\n        self.assertTrue('localnet3.localdomain' in data['email_domain_ss'])\n       \nif __name__ == '__main__':\n    unittest.main()\n\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_extract_law.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nfrom etl import ETL\n\nclass Test_enhance_extract_law(unittest.TestCase):\n\n    def test(self):\n\n        etl = ETL()\n        etl.config['plugins'] = ['enhance_entity_linking', 'enhance_extract_law']\n        etl.config['raise_pluginexception'] = True\n        data = {}\n        data['content_txt'] = \"\\n\".join([\n            \"abc § 888 xyz\"\n            \"abc § 987 b xyz\"\n            \"§12\",\n            \"§ 123\",\n            \"§345a\",\n            \"§456 b\",\n            \"§ 567 c\",\n            \"BGB § 153 Abs. 1 Satz 2\",\n            \"§ 52 Absatz 1 Nummer 2 Buchstabe c STGB\",\n            \"§ 444 CC\"\n        ])\n\n\n        # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)\n        parameters, data = etl.process(parameters={'id': 'test_enhance_extract_law'}, data=data)\n\n        self.assertTrue('§ 888' in data['law_clause_ss'])\n        self.assertTrue('§ 987 b' in data['law_clause_ss'])\n        self.assertTrue('§ 12' in data['law_clause_ss'])\n        self.assertTrue('§ 123' in data['law_clause_ss'])\n        self.assertTrue('§ 345a' in data['law_clause_ss'])\n        self.assertTrue('§ 456 b' in data['law_clause_ss'])\n        self.assertTrue('§ 567 c' in data['law_clause_ss'])\n\n        self.assertTrue('§ 153 Abs. 1 Satz 2' in data['law_clause_ss'])\n        self.assertTrue('§ 52 Absatz 1 Nummer 2 Buchstabe c' in data['law_clause_ss'])\n        \n        self.assertTrue('Strafgesetzbuch' in data['law_code_ss'])\n        self.assertTrue('Bürgerliches Gesetzbuch' in data['law_code_ss'])\n        \n        self.assertTrue('Swiss Civil Code' in data['law_code_ss'])\n\n\n    def test_blacklist(self):\n\n        etl = ETL()\n        etl.config['plugins'] = ['enhance_entity_linking', 'enhance_extract_law']\n        etl.config['raise_pluginexception'] = True\n        data = {}\n        data['content_txt'] = \"\\n\".join([\n            \"No clause for law code alias CC\"\n        ])\n\n        parameters, data = etl.process(parameters={'id': 'test_enhance_extract_law'}, data=data)\n        \n        self.assertFalse('Swiss Civil Code' in data['law_code_ss'])\n\n        data['content_txt'] = \"\\n\".join([\n            \"No clause for blacklisted law code alias CC but not blacklisted label of this alias: Swiss Civil Code\"\n        ])\n\n        parameters, data = etl.process(parameters={'id': 'test_enhance_extract_law'}, data=data)\n        \n        self.assertTrue('Swiss Civil Code' in data['law_code_ss'])\n\n\nif __name__ == '__main__':\n    unittest.main()\n\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_extract_money.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nfrom etl import ETL\n\nclass Test_enhance_extract_money(unittest.TestCase):\n\n    def test(self):\n\n        etl = ETL()\n        etl.config['plugins'] = ['enhance_entity_linking', 'enhance_extract_money']\n        etl.config['raise_pluginexception'] = True\n        data = {}\n        data['content_txt'] = \"\\n\".join([\n            \"abc $ 123 xyz\",\n            \"abc $ 124,000 xyz\",\n            \"abc 234 $ xyz\",\n            \"abc 235,000 $ xyz\",\n            \"abc 236,99 $ xyz\",\n            \"abc $1234 xyz\",\n            \"abc 2345$ xyz\",\n            \"4444 dollar\",\n            \"44444 USD\",\n            \"444 €\",\n            \"445.000 €\",\n            \"450,99 €\",\n            \"4444 EUR\",\n            \"46.000 EUR\",\n            \"47.000,99 EUR\",\n            \"44,22 EURO\",\n            \"if ambiguous like $ 77 € for more completeness we want to extract both possible variants\",\n        ])\n\n\n        parameters, data = etl.process(parameters={'id': 'test_enhance_extract_money'}, data=data)\n\n        self.assertTrue('$ 123' in data['money_ss'])\n        self.assertTrue('$ 124,000' in data['money_ss'])\n        self.assertTrue('234 $' in data['money_ss'])\n        self.assertTrue('235,000 $' in data['money_ss'])\n        self.assertTrue('236,99 $' in data['money_ss'])\n        self.assertTrue('$1234' in data['money_ss'])\n        self.assertTrue('2345$' in data['money_ss'])\n        self.assertTrue('4444 dollar' in data['money_ss'])\n        self.assertTrue('44444 USD' in data['money_ss'])\n        self.assertTrue('444 €' in data['money_ss'])\n        self.assertTrue('445.000 €' in data['money_ss'])\n        self.assertTrue('450,99 €' in data['money_ss'])\n        self.assertTrue('4444 EUR' in data['money_ss'])\n        self.assertTrue('46.000 EUR' in data['money_ss'])\n        self.assertTrue('47.000,99 EUR' in data['money_ss'])\n        self.assertTrue('44,22 EURO' in data['money_ss'])\n        self.assertTrue('$ 77' in data['money_ss'])\n        self.assertTrue('77 €' in data['money_ss'])\n\n    def test_numerizer(self):\n\n        etl = ETL()\n        etl.config['plugins'] = ['enhance_entity_linking', 'enhance_extract_money']\n        etl.config['raise_pluginexception'] = True\n        data = { 'language_s': 'en' }\n        data['content_txt'] = \"\\n\".join([\n            \"So two million two hundred and fifty thousand and seven $ were given to them\",\n            \"We got twenty one thousand four hundred and seventy three dollars from someone\",\n        ])\n\n        parameters, data = etl.process(parameters={'id': 'test_enhance_extract_money_numerize'}, data=data)\n\n        self.assertTrue('2250007 $' in data['money_ss'])\n        self.assertTrue('21473 dollars' in data['money_ss'])\n\nif __name__ == '__main__':\n    unittest.main()\n\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_extract_text_tika_server.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nimport enhance_extract_text_tika_server\n\nclass TestEnhanceExtractTextTikaServer(unittest.TestCase):\n\n    # delete OCR cache entries for the images used in this test class\n    def delete_ocr_cache_entries(self):\n        filenames = [\n            '/var/cache/tesseract/eng-4c6bf51d4455e1cb58b7d8dd20fb8846f15a3d2c884dc8859802ed689f74ae7a-e96c4b1545a83d86d05f7fbb12ade96d.txt',\n            '/var/cache/tesseract/eng-526959d31f4e6b1947bb00c3a02959ef008ce19b9487d95b3df0656159f55a7a-e96c4b1545a83d86d05f7fbb12ade96d.txt',\n            '/var/cache/tesseract/eng-c93c49c9dfc9764a4307c2757eb378b2d8cd00f3007ac450605b83f23ecda900-e96c4b1545a83d86d05f7fbb12ade96d.txt',\n            '/var/cache/tesseract/eng-ebce8ee4ea7d3d24fe9384212d944adeb58e8f18be15ec06103454f7eade70f5-e96c4b1545a83d86d05f7fbb12ade96d.txt'\n        ]\n        for filename in filenames:\n            if os.path.exists(filename):\n                os.remove(filename)\n\n    def setUp(self):\n        self.delete_ocr_cache_entries()\n    def tearDown(self):\n        self.delete_ocr_cache_entries()\n\n    def test_text_extraction_pdf(self):\n\n        enhancer = enhance_extract_text_tika_server.enhance_extract_text_tika_server()\n\n        parameters = {'filename': os.path.dirname(os.path.realpath(__file__)) + '/testdata/test.pdf'}\n\n        parameters, data = enhancer.process(parameters=parameters)\n\n        # check extracted content type\n        self.assertTrue(data['content_type_ss'] == 'application/pdf'\n                        or sorted(data['content_type_ss']) == ['application/pdf', 'image/jpeg', 'image/png'])\n\n        # check extracted title\n        self.assertEqual(data['title_txt'], 'TestPDFtitle')\n\n        # check extracted content of PDF text\n        self.assertTrue('TestPDFContent1 on TestPDFPage1' in data['content_txt'])\n        self.assertTrue('TestPDFContent2 on TestPDFPage2' in data['content_txt'])\n\n        # check disabled OCR of embedded images in PDF\n        self.assertFalse('TestPDFOCRImage1Content1' in data['content_txt'])\n        self.assertFalse('TestPDFOCRImage1Content2' in data['content_txt'])\n        self.assertFalse('TestPDFOCRImage2Content1' in data['content_txt'])\n        self.assertFalse('TestPDFOCRImage2Content2' in data['content_txt'])\n\n    def test_text_extraction_pdf_ocr(self):\n\n        enhancer = enhance_extract_text_tika_server.enhance_extract_text_tika_server()\n\n        parameters = {'ocr': True, 'plugins': ['enhance_pdf_ocr'],\n                      'filename': os.path.dirname(os.path.realpath(__file__)) + '/testdata/test.pdf'}\n\n        parameters, data = enhancer.process(parameters=parameters)\n\n        # check extracted content type\n        self.assertTrue(sorted(data['content_type_ss']) == ['application/pdf', 'image/jpeg', 'image/png'])\n\n        # check extracted title\n        self.assertEqual(data['title_txt'], 'TestPDFtitle')\n\n        # check extracted content of PDF text\n        self.assertTrue('TestPDFContent1 on TestPDFPage1' in data['content_txt'])\n        self.assertTrue('TestPDFContent2 on TestPDFPage2' in data['content_txt'])\n\n        # check OCR of embedded images in PDF\n        self.assertTrue('TestPDFOCRImage1Content1' in data['content_txt'])\n        self.assertTrue('TestPDFOCRImage1Content2' in data['content_txt'])\n        self.assertTrue('TestPDFOCRImage2Content1' in data['content_txt'])\n        self.assertTrue('TestPDFOCRImage2Content2' in 
data['content_txt'])\n\n    def test_text_extraction_pdf_ocr_cache(self):\n\n        # add text (changed for this test) to ocr cache, so we can prove that the cache was used\n        file = open('/var/cache/tesseract/eng-c93c49c9dfc9764a4307c2757eb378b2d8cd00f3007ac450605b83f23ecda900-e96c4b1545a83d86d05f7fbb12ade96d.txt', \"w\")\n        file.write(\"TestPDFOCRCacheImage1Content1\\n\\nTestPDFOCRCacheImage1Content2\")\n        file.close()\n\n        file = open('/var/cache/tesseract/eng-526959d31f4e6b1947bb00c3a02959ef008ce19b9487d95b3df0656159f55a7a-e96c4b1545a83d86d05f7fbb12ade96d.txt', \"w\")\n        file.write(\"TestPDFOCRCacheImage2Content1\\n\\nTestPDFOCRCacheImage2Content2\")\n        file.close()\n\n        enhancer = enhance_extract_text_tika_server.enhance_extract_text_tika_server()\n\n        parameters = {'ocr': True, 'plugins': ['enhance_pdf_ocr'],\n                      'filename': os.path.dirname(os.path.realpath(__file__)) + '/testdata/test.pdf'}\n\n        parameters, data = enhancer.process(parameters=parameters)\n\n        # check extracted content type\n        self.assertTrue(sorted(data['content_type_ss']) == ['application/pdf', 'image/jpeg', 'image/png'])\n\n        # check extracted title\n        self.assertEqual(data['title_txt'], 'TestPDFtitle')\n\n        # check extracted content of PDF text\n        self.assertTrue('TestPDFContent1 on TestPDFPage1' in data['content_txt'])\n        self.assertTrue('TestPDFContent2 on TestPDFPage2' in data['content_txt'])\n\n        # check OCR of embedded images in PDF\n        self.assertTrue('TestPDFOCRCacheImage1Content1' in data['content_txt'])\n        self.assertTrue('TestPDFOCRCacheImage1Content2' in data['content_txt'])\n        self.assertTrue('TestPDFOCRCacheImage2Content1' in data['content_txt'])\n        self.assertTrue('TestPDFOCRCacheImage2Content2' in data['content_txt'])\n\n    def test_ocr_png(self):\n\n        enhancer = enhance_extract_text_tika_server.enhance_extract_text_tika_server()\n\n        parameters = {'ocr': True,\n                      'filename': os.path.dirname(os.path.realpath(__file__)) + '/testdata/Test_OCR_Image1.png'}\n\n        parameters, data = enhancer.process(parameters=parameters)\n\n        # check extracted content type\n        self.assertEqual(data['content_type_ss'], 'image/png')\n\n        # check OCR\n        self.assertTrue('TestOCRImage1Content1' in data['content_txt'])\n        self.assertTrue('TestOCRImage1Content2' in data['content_txt'])\n\n    def test_ocr_jpg(self):\n\n        enhancer = enhance_extract_text_tika_server.enhance_extract_text_tika_server()\n\n        parameters = {'ocr': True,\n                      'filename': os.path.dirname(os.path.realpath(__file__)) + '/testdata/Test_OCR_Image2.jpg'}\n\n        parameters, data = enhancer.process(parameters=parameters)\n\n        # check extracted content type\n        self.assertEqual(data['content_type_ss'], 'image/jpeg')\n\n        # check OCR\n        self.assertTrue('TestOCRImage2Content1' in data['content_txt'])\n        self.assertTrue('TestOCRImage2Content2' in data['content_txt'])\n\n    def test_disabled_ocr_png(self):\n\n        enhancer = enhance_extract_text_tika_server.enhance_extract_text_tika_server()\n\n        parameters = {'ocr': False,\n                      'filename': os.path.dirname(os.path.realpath(__file__)) + '/testdata/Test_OCR_Image1.png'}\n\n        parameters, data = enhancer.process(parameters=parameters)\n\n        # check extracted content type\n        
self.assertEqual(data['content_type_ss'], 'image/png')\n\n        # check disabled OCR\n        self.assertFalse('TestOCRImage1Content1' in data['content_txt'])\n        self.assertFalse('TestOCRImage1Content2' in data['content_txt'])\n\n        # check if Fake tesseract wrapper returned status\n        self.assertTrue('[Image (no OCR yet)]' in data['content_txt'])\n       \n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_mapping_id.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_mapping_id\n\nclass Test_enhance_mapping_id(unittest.TestCase):\n\n    def test(self):\n\n        enhancer = enhance_mapping_id.enhance_mapping_id()\n\n        mappings = {\n                       \"/\": \"file:///\",\n                       \"/testdir1/\": \"file:///deep1testdir1/\",\n                       \"/testdir1/testdir2/\": \"file:///deep2testdir1/deep2testdir2/\",\n        }\n        \n        docid = '/test'\n        parameters, data = enhancer.process(parameters={'id': docid, 'mappings': mappings})\n        self.assertEqual(parameters['id'], 'file:///test')\n\n        docid = '/testdir1/test'\n        parameters, data = enhancer.process(parameters={'id': docid, 'mappings': mappings})\n        self.assertEqual(parameters['id'], 'file:///deep1testdir1/test')\n\n        docid = '/testdir1/testdir2/test'\n        parameters, data = enhancer.process(parameters={'id': docid, 'mappings': mappings})\n        self.assertEqual(parameters['id'], 'file:///deep2testdir1/deep2testdir2/test')\n\n\n    def test_reverse(self):\n\n        mappings = {\n                       \"/\": \"file:///\",\n                       \"/testdir1/\": \"file:///deep1testdir1/\",\n                       \"/testdir1/testdir2/\": \"file:///deep2testdir1/deep2testdir2/\",\n        }\n        \n        docid = 'file:///test'\n        reversed_value = enhance_mapping_id.mapping_reverse (docid, mappings)\n        self.assertEqual(reversed_value, '/test')\n\n        docid = 'file:///deep1testdir1/test'\n        reversed_value = enhance_mapping_id.mapping_reverse (docid, mappings)\n        self.assertEqual(reversed_value, '/testdir1/test')\n\n        docid = 'file:///deep2testdir1/deep2testdir2/test'\n        reversed_value = enhance_mapping_id.mapping_reverse (docid, mappings)\n        self.assertEqual(reversed_value, '/testdir1/testdir2/test')\n\n       \nif __name__ == '__main__':\n    unittest.main()\n\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_ner_spacy.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_ner_spacy\n\nconfig = {\n    'spacy_ner_classifiers': {\n        'de': 'de_core_news_sm',\n        'en': 'en_core_web_md'\n    }\n}\n\nclass Test_enhance_ner_spacy(unittest.TestCase):\n\n    def test_en(self):\n\n        enhancer = enhance_ner_spacy.enhance_ner_spacy()\n\n        parameters = config.copy()\n        data = {\n            'language_s': 'en',\n            'content_txt': \"Some years ago, Mr. Barack Obama, a member of Democratic Party, was president of the USA.\"\n        }\n\n        parameters, data = enhancer.process(parameters=parameters, data=data)\n\n        self.assertTrue('Barack Obama' in data['person_ss'])\n        self.assertTrue('Democratic Party' in data['organization_ss'])\n        self.assertTrue('USA' in data['location_ss'])\n\n\n    def test_de(self):\n\n        enhancer = enhance_ner_spacy.enhance_ner_spacy()\n\n        parameters = config.copy()\n        data = {\n            'language_s': 'de',\n            'content_txt': \"Der Text ist über Frau Dr. Angela Merkel. Sie ist Mitglied in der CDU. Sie lebt in Deutschland.\"\n        }\n\n        parameters, data = enhancer.process(parameters=parameters, data=data)\n\n        self.assertTrue('Angela Merkel' in data['person_ss'])\n        self.assertTrue('CDU' in data['organization_ss'])\n        self.assertTrue('Deutschland' in data['location_ss'])\n\n       \nif __name__ == '__main__':\n    unittest.main()\n\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_path.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_path\n\nclass Test_enhance_path(unittest.TestCase):\n\n    def test(self):\n\n        enhancer = enhance_path.enhance_path()\n\n\n        docid = '/home/user/test.pdf'\n        parameters, data = enhancer.process(parameters={'id': docid})\n\n        self.assertEqual(data['path0_s'], 'home')\n        self.assertEqual(data['path1_s'], 'user')\n        self.assertEqual(data['path_basename_s'], 'test.pdf')\n        self.assertEqual(data['filename_extension_s'], 'pdf')\n\n\n        docid = '/home/user/test_without_filename_extension'\n        parameters, data = enhancer.process(parameters={'id': docid})\n\n        self.assertFalse('filename_extension_s' in data)\n\n\n        docid = '/home/user/test.PDF'\n        parameters, data = enhancer.process(parameters={'id': docid})\n\n        self.assertEqual(data['filename_extension_s'], 'pdf')\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_pdf_ocr.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nimport enhance_pdf_ocr\n\nclass Test_enhance_pdf_ocr(unittest.TestCase):\n\n    # check OCR of embedded images in PDF\n    def test_pdf_ocr(self):\n        \n        enhancer = enhance_pdf_ocr.enhance_pdf_ocr()\n\n        parameters = {'ocr_pdf_tika': False, 'filename': os.path.dirname(os.path.realpath(__file__)) + '/testdata/test.pdf', 'ocr_cache': '/var/cache/tesseract', 'content_type_ss': 'application/pdf', 'plugins':[]}\n\n        parameters, data = enhancer.process(parameters=parameters)\n\n        self.assertTrue('TestPDFOCRImage1Content1' in data['ocr_t'])\n        self.assertTrue('TestPDFOCRImage1Content2' in data['ocr_t'])\n        self.assertTrue('TestPDFOCRImage2Content1' in data['ocr_t'])\n        self.assertTrue('TestPDFOCRImage2Content2' in data['ocr_t'])\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_regex.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_regex\n\nclass Test_enhance_regex(unittest.TestCase):\n\n    def test(self):\n\n        enhancer = enhance_regex.enhance_regex()\n\n        parameters = {}\n        parameters['verbose'] = True\n        parameters['regex_lists'] = ['/etc/opensemanticsearch/regex/iban.tsv']\n\n        data = {}\n        data['content_txt'] = \"An IBAN DE75512108001245126199 from Germany and GB33BUKB20201555555555 from GB and not 75512108001245126199\"\n\n        parameters, data = enhancer.process(data=data, parameters=parameters)\n\n        self.assertTrue('DE75512108001245126199' in data['iban_ss'])\n        self.assertTrue('GB33BUKB20201555555555' in data['iban_ss'])\n\n        self.assertFalse('75512108001245126199' in data['iban_ss'])\n       \n       \nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "src/opensemanticetl/test_enhance_warc.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nfrom etl_file import Connector_File\nfrom etl_delete import Delete\nfrom export_solr import export_solr\n\nclass Test_enhance_warc(unittest.TestCase):\n\n    @unittest.expectedFailure # Test fails on deleting in Solr index until next release of pysolr (https://github.com/opensemanticsearch/open-semantic-etl/issues/154)\n    def test_warc(self):\n\n        etl_file = Connector_File()\n        exporter = export_solr()\n\n        filename = os.path.dirname(os.path.realpath(__file__)) + '/testdata/example.warc'\n\n        # run ETL of example.warc with configured plugins and warc extractor\n        parameters, data = etl_file.index_file(filename = filename)\n\n        contained_doc_id = 'http://example.com/<urn:uuid:a9c51e3e-0221-11e7-bf66-0242ac120005>'\n        fields = ['id', 'title_txt', 'content_type_ss', 'content_txt']\n\n        data = exporter.get_data(contained_doc_id, fields)\n\n        # delete from search index\n        etl_delete = Delete()\n        etl_delete.delete(filename)\n        etl_delete.delete(contained_doc_id)\n\n        self.assertEqual(data['title_txt'], ['Example Domain'])\n\n        self.assertEqual(data['content_type_ss'], ['text/html; charset=UTF-8'])\n\n        self.assertTrue('This domain is established to be used for illustrative examples in documents.' in data['content_txt'][0])\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "src/opensemanticetl/test_etl_file.py",
    "content": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nfrom etl_file import Connector_File\nfrom etl_delete import Delete\n\nclass Test_ETL_file(unittest.TestCase):\n\n    def test_pdf_and_ocr_by_tika(self):\n\n        etl_file = Connector_File()\n        filename = os.path.dirname(os.path.realpath(__file__)) + '/testdata/test.pdf'\n\n        # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)\n        parameters, data = etl_file.index_file(filename = filename, additional_plugins=['enhance_pdf_ocr'])\n\n        # delete from search index\n        etl_delete = Delete()\n        etl_delete.delete(filename)\n\n        # check extracted content type\n        self.assertTrue(data['content_type_ss'] == 'application/pdf' or sorted(data['content_type_ss']) == ['application/pdf', 'image/jpeg', 'image/png'])\n\n        # check content type group which is mapped to this content type (result of plugin enhance_contenttype_group.py)\n        self.assertTrue(data['content_type_group_ss'] == ['Text document'] or sorted(data['content_type_group_ss']) == ['Image', 'Text document'])\n\n        # check extracted title (result of plugin enhance_extract_text_tika_server.py)\n        self.assertEqual(data['title_txt'], 'TestPDFtitle')\n\n        # check extracted content of PDF text (result of plugin enhance_extract_text_tika_server.py)\n        self.assertTrue('TestPDFContent1 on TestPDFPage1' in data['content_txt'])\n        self.assertTrue('TestPDFContent2 on TestPDFPage2' in data['content_txt'])\n\n        # check OCR of embedded images in PDF (result of plugin enhance_pdf_ocr.py)\n        self.assertTrue('TestPDFOCRImage1Content1' in data['content_txt'])\n        self.assertTrue('TestPDFOCRImage1Content2' in data['content_txt'])\n        self.assertTrue('TestPDFOCRImage2Content1' in data['content_txt'])\n        self.assertTrue('TestPDFOCRImage2Content2' in data['content_txt'])\n\n        # OCR done by Tika so in field content_txt, not in OCR plugin field ocr_t\n        self.assertFalse('ocr_t' in data)\n\n        # OCR text copied to default search field by plugin enhance_multilingual?\n        default_search_field_data = ' '.join(data['_text_'])\n        self.assertTrue('TestPDFOCRImage1Content1' in default_search_field_data)\n        self.assertTrue('TestPDFOCRImage1Content2' in default_search_field_data)\n        self.assertTrue('TestPDFOCRImage2Content1' in default_search_field_data)\n        self.assertTrue('TestPDFOCRImage2Content2' in default_search_field_data)\n\n        # check if a Open Semantic ETL plugin threw an exception\n        self.assertEqual(data['etl_error_plugins_ss'], [])\n\n\n    def test_ocr_by_plugin_enhance_pdf_ocr(self):\n\n        etl_file = Connector_File()\n        filename = os.path.dirname(os.path.realpath(__file__)) + '/testdata/test.pdf'\n\n        etl_file.config['ocr_pdf_tika'] = False\n\n        # run ETL of test.pdf with configured plugins and PDF OCR (result of etl_file.py)\n        parameters, data = etl_file.index_file(filename = filename, additional_plugins=['enhance_pdf_ocr'])\n\n        # delete from search index\n        etl_delete = Delete()\n        etl_delete.delete(filename)\n\n        # check OCR of embedded images in PDF (result of plugin enhance_pdf_ocr.py)\n        self.assertTrue('TestPDFOCRImage1Content1' in data['ocr_t'])\n        self.assertTrue('TestPDFOCRImage1Content2' in data['ocr_t'])\n        self.assertTrue('TestPDFOCRImage2Content1' in data['ocr_t'])\n        
self.assertTrue('TestPDFOCRImage2Content2' in data['ocr_t'])\n\n        # OCR text copied to default search field?\n        default_search_field_data = ' '.join(data['_text_'])\n        self.assertTrue('TestPDFOCRImage1Content1' in default_search_field_data)\n        self.assertTrue('TestPDFOCRImage1Content2' in default_search_field_data)\n        self.assertTrue('TestPDFOCRImage2Content1' in default_search_field_data)\n        self.assertTrue('TestPDFOCRImage2Content2' in default_search_field_data)\n\n        # check if an Open Semantic ETL plugin threw an exception\n        self.assertEqual(data['etl_error_plugins_ss'], [])\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "src/opensemanticetl/test_move_indexed_files.py",
    "content": "import unittest\nfrom unittest import mock\n\nimport json\nimport itertools\n\nimport move_indexed_file\n\n\nclass TestMove(unittest.TestCase):\n    def test_move_files(self):\n        mock_get_files = mock.Mock(return_value=[\n            {'id': 'file:///book1', 'title_t': 'Snow Crash',\n             'copies_i': 5, 'cat_ss': ['Science Fiction'],\n             'path_basename_s': 'book1', '_version_': 1641756143516647424},\n            {'id': 'file:///folder/book2',\n             'title_t': 'Other book',\n             'copies_i': 3, 'cat_ss': ['Round House Kicks'],\n             'path0_s': 'folder',\n             'path_basename_s': 'book2', '_version_': 1641756143518744576}])\n        mock_post = mock.Mock()\n\n        def mock_prepare(adds, delete_ids):\n            self.assertEqual(\n                list(adds),\n                [\n                    {'id': 'file:///snow_crash', 'title_t': 'Snow Crash',\n                     'copies_i': 5, 'cat_ss': ['Science Fiction'],\n                     'path_basename_s': 'snow_crash'},\n                    {'id': 'file:///other_book',\n                     'title_t': 'Other book',\n                     'copies_i': 3, 'cat_ss': ['Round House Kicks'],\n                     'path_basename_s': 'other_book'}])\n            self.assertEqual(tuple(delete_ids),\n                             (\"file:///book1\", \"file:///folder/book2\"))\n        with mock.patch(\"move_indexed_file.get_files\", mock_get_files), \\\n                mock.patch(\"move_indexed_file.post\", mock_post), \\\n                mock.patch(\"move_indexed_file.prepare_payload\",\n                           mock_prepare):\n            move_indexed_file.move_files(\n                None, {\"/book1\": \"/snow_crash\",\n                       \"/folder/book2\": \"/other_book\"},\n                prefix=\"file://\")\n\n    def test_move_dir(self):\n        mock_get_files = mock.Mock(return_value=[\n            {'id': 'file:///folder/book1', 'title_t': 'Snow Crash',\n             'copies_i': 5, 'cat_ss': ['Science Fiction'],\n             'path0_s': 'folder',\n             'path_basename_s': 'book1', '_version_': 1641756143516647424},\n            {'id': 'file:///folder/book2',\n             'title_t': 'Other book',\n             'copies_i': 3, 'cat_ss': ['Round House Kicks'],\n             'path0_s': 'folder',\n             'path_basename_s': 'book2', '_version_': 1641756143518744576}])\n        mock_post = mock.Mock()\n\n        def mock_prepare(adds, delete_ids):\n            self.assertEqual(\n                list(adds),\n                [\n                    {'id': 'file:///dest/book1', 'title_t': 'Snow Crash',\n                     'copies_i': 5, 'cat_ss': ['Science Fiction'],\n                     'path0_s': 'dest',\n                     'path_basename_s': 'book1'},\n                    {'id': 'file:///dest/book2',\n                     'title_t': 'Other book',\n                     'copies_i': 3, 'cat_ss': ['Round House Kicks'],\n                     'path0_s': 'dest',\n                     'path_basename_s': 'book2'}])\n            self.assertEqual(tuple(delete_ids),\n                             (\"file:///folder/book1\", \"file:///folder/book2\"))\n        with mock.patch(\"move_indexed_file.get_files_in_dir\",\n                        mock_get_files), \\\n                mock.patch(\"move_indexed_file.post\", mock_post), \\\n                mock.patch(\"move_indexed_file.prepare_payload\",\n                           mock_prepare):\n            
move_indexed_file.move_dir(\n                None, src=\"/folder\", dest=\"/dest/\",\n                prefix=\"file://\")\n\n    def test_get_pages(self):\n        step = 3\n\n        original_docs = [{'id': i} for i in range(10)]\n\n        def mock_urlopen():\n            docs_iter = iter(original_docs)\n            responses = (\n                mock_response(\n                    {\n                        \"response\": {\n                            \"numFound\": len(original_docs),\n                            \"docs\": list(itertools.islice(docs_iter, step))\n                        }\n                    }\n                )\n                for __ in range(0, len(original_docs), step))\n\n            def _mock(*_, **__):\n                return next(responses)\n            return _mock\n        with mock.patch(\"urllib.request.urlopen\",\n                        mock_urlopen()):\n            docs = sum(move_indexed_file.get_pages(\"\", \"\", limit=step), [])\n        self.assertEqual(docs, original_docs)\n\n\ndef mock_response(data):\n    return type(\"MockResponse\", (object,), {\n        \"read\": json.dumps(data).encode})\n"
  },
  {
    "path": "src/opensemanticetl/testdata/README.md",
    "content": "Automated tests by unittest\n===========================\n\nAutomated tests are implemented using the Python library unittest:\n\nhttps://docs.python.org/3/library/unittest.html\n\nThe code for the unit tests is located in the directory \"src/opensemanticetl\" in files with the prefix \"test_\".\n\nSome files with testdata like test documents (see section \"Testdata\") are located in the subdirectory \"test\".\n\n\nRun all tests\n=============\n\nWithin the directory \"src/opensemanticetl\" call\n\npython3 -m unittest\n\nto run all available tests for all modules and plugins.\n\n\nRun tests for a single plugin\n=============================\n\nYou can run only the tests for a single plugin you currently work on:\n\nFor example to test only the Tika plugin for text extraction (\"enhance_extract_text_tika_server.py\"), call\n\npython3 -m unittest test_enhance_extract_text_tika_server\n\n\nCI/CD\n=====\n\nThe script run_tests.sh is called for automated tests within a Docker container configured by docker-compose.test.yml in the root directory of this Git repository.\n\n\nTestdata\n========\n\nTest documents located in subdirectory \"test\":\n\n\ntest.pdf\n--------\n\nA test PDF with two pages with text content to test content extraction and embedded images to test OCR.\n\n\nTest_OCR_Image1.png\n-------------------\n\nPNG image with content \"TestOCRImage1Content1\" and \"TestOCRImage1Content2\" to test OCR.\n\n\nTest_OCR_Image2.jpg\n-------------------\n\nJPEG image with content \"TestOCRImage2Content1\" and \"TestOCRImage2Content2\" to test OCR.\n\n"
  },
  {
    "path": "src/opensemanticetl/testdata/run_integrationtests.sh",
    "content": "#!/bin/sh\n\npython3 -m unittest discover -s /usr/lib/python3/dist-packages/entity_linking/\n\npython3 -m unittest discover -s /usr/lib/python3/dist-packages/opensemanticetl/\n"
  },
  {
    "path": "src/opensemanticetl/testdata/run_tests.sh",
    "content": "#!/bin/sh\n\ncd /usr/lib/python3/dist-packages/opensemanticetl\n\npython3 -m unittest \\\n test_enhance_extract_email \\\n test_enhance_mapping_id \\\n test_enhance_path\n"
  }
]