Showing preview only (396K chars total). Download the full file or copy to clipboard to get everything.
Repository: opensemanticsearch/open-semantic-etl
Branch: master
Commit: f51efea6c18f
Files: 123
Total size: 363.3 KB
Directory structure:
gitextract_2awl829e/
├── .github/
│ └── FUNDING.yml
├── .gitignore
├── .gitmodules
├── DEBIAN/
│ ├── conffiles
│ ├── control
│ ├── postinst
│ └── prerm
├── Dockerfile
├── LICENSE
├── build-deb
├── docker-compose.test.yml
├── docker-compose.ubuntu.test.yml
├── docker-entrypoint.sh
├── etc/
│ ├── opensemanticsearch/
│ │ ├── blacklist/
│ │ │ ├── blacklist-url
│ │ │ ├── blacklist-url-prefix
│ │ │ ├── blacklist-url-regex
│ │ │ ├── blacklist-url-suffix
│ │ │ ├── enhance_extract_law/
│ │ │ │ └── blacklist-lawcode-if-no-clause
│ │ │ ├── enhance_zip/
│ │ │ │ ├── blacklist-contenttype
│ │ │ │ ├── blacklist-contenttype-prefix
│ │ │ │ ├── blacklist-contenttype-regex
│ │ │ │ ├── blacklist-contenttype-suffix
│ │ │ │ ├── whitelist-contenttype
│ │ │ │ ├── whitelist-contenttype-prefix
│ │ │ │ ├── whitelist-contenttype-regex
│ │ │ │ └── whitelist-contenttype-suffix
│ │ │ ├── textanalysis/
│ │ │ │ ├── blacklist-fieldname
│ │ │ │ ├── blacklist-fieldname-prefix
│ │ │ │ └── blacklist-fieldname-suffix
│ │ │ ├── whitelist-url
│ │ │ ├── whitelist-url-prefix
│ │ │ ├── whitelist-url-regex
│ │ │ └── whitelist-url-suffix
│ │ ├── connector-files
│ │ ├── connector-web
│ │ ├── enhancer-rdf
│ │ ├── etl
│ │ ├── facets
│ │ ├── filemonitoring/
│ │ │ └── files
│ │ ├── ocr/
│ │ │ └── dictionary.txt
│ │ ├── regex/
│ │ │ ├── email.tsv
│ │ │ ├── iban.tsv
│ │ │ └── phone.tsv
│ │ └── task_priorities
│ └── systemd/
│ └── system/
│ ├── opensemanticetl-filemonitoring.service
│ └── opensemanticetl.service
└── src/
└── opensemanticetl/
├── __init__.py
├── clean_title.py
├── enhance_annotations.py
├── enhance_contenttype_group.py
├── enhance_csv.py
├── enhance_detect_language_tika_server.py
├── enhance_entity_linking.py
├── enhance_extract_email.py
├── enhance_extract_hashtags.py
├── enhance_extract_law.py
├── enhance_extract_money.py
├── enhance_extract_phone.py
├── enhance_extract_text_tika_server.py
├── enhance_file_mtime.py
├── enhance_file_size.py
├── enhance_html.py
├── enhance_mapping_id.py
├── enhance_mimetype.py
├── enhance_multilingual.py
├── enhance_ner_spacy.py
├── enhance_ner_stanford.py
├── enhance_ocr.py
├── enhance_path.py
├── enhance_pdf_ocr.py
├── enhance_pdf_page.py
├── enhance_pdf_page_preview.py
├── enhance_pst.py
├── enhance_rdf.py
├── enhance_rdf_annotations_by_http_request.py
├── enhance_regex.py
├── enhance_sentence_segmentation.py
├── enhance_warc.py
├── enhance_xml.py
├── enhance_xmp.py
├── enhance_zip.py
├── etl.py
├── etl_delete.py
├── etl_enrich.py
├── etl_file.py
├── etl_filedirectory.py
├── etl_filemonitoring.py
├── etl_hypothesis.py
├── etl_plugin_core.py
├── etl_rss.py
├── etl_sitemap.py
├── etl_sparql.py
├── etl_twitter_scraper.py
├── etl_web.py
├── etl_web_crawl.py
├── export_elasticsearch.py
├── export_json.py
├── export_neo4j.py
├── export_print.py
├── export_queue_files.py
├── export_solr.py
├── filter_blacklist.py
├── filter_file_not_modified.py
├── move_indexed_file.py
├── requirements.txt
├── tasks.py
├── test_enhance_detect_language_tika_server.py
├── test_enhance_extract_email.py
├── test_enhance_extract_law.py
├── test_enhance_extract_money.py
├── test_enhance_extract_text_tika_server.py
├── test_enhance_mapping_id.py
├── test_enhance_ner_spacy.py
├── test_enhance_path.py
├── test_enhance_pdf_ocr.py
├── test_enhance_regex.py
├── test_enhance_warc.py
├── test_etl_file.py
├── test_move_indexed_files.py
└── testdata/
├── README.md
├── example.warc
├── run_integrationtests.sh
└── run_tests.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/FUNDING.yml
================================================
custom: ['https://www.paypal.me/MMandalka']
================================================
FILE: .gitignore
================================================
__pycache__
.project
.pydevproject
.settings
================================================
FILE: .gitmodules
================================================
[submodule "src/open-semantic-entity-search-api"]
path = src/open-semantic-entity-search-api
url = https://github.com/opensemanticsearch/open-semantic-entity-search-api.git
branch = master
[submodule "src/tesseract-ocr-cache"]
path = src/tesseract-ocr-cache
url = https://github.com/opensemanticsearch/tesseract-ocr-cache.git
================================================
FILE: DEBIAN/conffiles
================================================
/etc/opensemanticsearch/etl
/etc/opensemanticsearch/filemonitoring/files
/etc/opensemanticsearch/connector-files
/etc/opensemanticsearch/connector-web
/etc/opensemanticsearch/enhancer-rdf
/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname
/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix
/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix
/etc/opensemanticsearch/blacklist/blacklist-url
/etc/opensemanticsearch/blacklist/blacklist-url-prefix
/etc/opensemanticsearch/blacklist/blacklist-url-suffix
/etc/opensemanticsearch/blacklist/blacklist-url-regex
/etc/opensemanticsearch/blacklist/whitelist-url
/etc/opensemanticsearch/blacklist/whitelist-url-prefix
/etc/opensemanticsearch/blacklist/whitelist-url-suffix
/etc/opensemanticsearch/blacklist/whitelist-url-regex
/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype
/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-prefix
/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-suffix
/etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-regex
/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype
/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-prefix
/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-suffix
/etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-regex
================================================
FILE: DEBIAN/control
================================================
Package: open-semantic-etl
Version: 21.10.18
Section: misc
Priority: optional
Architecture: all
Depends: tika-server(>=0), python3-tika(>=0), curl(>=0), python3-pycurl(>=0), python3-rdflib(>=0), python3-sparqlwrapper(>=0), file(>=0), python3-requests(>=0), python3-pysolr(>=0), python3-dateutil(>=0), python3-lxml(>=0), python3-feedparser(>=0), poppler-utils(>=0), pst-utils(>=0),rabbitmq-server(>=0),python3-pyinotify(>=0),python3-pip(>=0), python3-dev(>=0), build-essential(>=0), libssl-dev(>=0), libffi-dev(>=0), tesseract-ocr(>=0), tesseract-ocr-deu(>=0)
Installed-Size: 100
Maintainer: Markus Mandalka <debian@mandalka.name>
Homepage: https://opensemanticsearch.org/
Description: Crawler to index files and directories to Solr
Index your files to Solr.
If tesseract-ocr is installed, optical character recognition will be performed on images.
Hint: install OCR language packages such as tesseract-ocr-deu for German texts.
================================================
FILE: DEBIAN/postinst
================================================
#!/bin/sh
# Debian post-installation script for open-semantic-etl:
# creates the service user/group, fixes cache and thumbnail permissions,
# installs Python dependencies and (re)starts the systemd services.
# Maintainer scripts run on every upgrade, so user/group creation is
# guarded to stay idempotent instead of silently failing the second time.

# system user the ETL celery workers run as (no-op if it already exists)
getent passwd opensemanticetl >/dev/null || adduser --system --disabled-password opensemanticetl

# shared group for the Tesseract OCR cache (no-op if it already exists)
getent group tesseract_cache >/dev/null || groupadd -r tesseract_cache
usermod -a -G tesseract_cache opensemanticetl

# rights for OCR cache
chown opensemanticetl:tesseract_cache /var/cache/tesseract
chmod 770 /var/cache/tesseract

# rights for thumbnail dir
chown opensemanticetl /var/opensemanticsearch/media/thumbnails
chmod o+w /var/opensemanticsearch/media/thumbnails

# install Python dependencies not available as Debian packages
pip3 install -r /usr/lib/python3/dist-packages/opensemanticetl/requirements.txt

# load our additional systemd service config
systemctl daemon-reload

# start while booting
systemctl enable opensemanticetl
systemctl enable opensemanticetl-filemonitoring

# (re)start after installation (or upgrade) — both services, so an upgrade
# replaces the running filemonitoring worker too, not only the main worker
systemctl restart opensemanticetl
systemctl restart opensemanticetl-filemonitoring
================================================
FILE: DEBIAN/prerm
================================================
#!/bin/sh
# Debian pre-removal script: take both ETL services out of the boot
# sequence and shut them down before the package files are removed.
for service in opensemanticetl-filemonitoring opensemanticetl; do
    systemctl disable "$service"
    systemctl stop "$service"
done
# Never block package removal on service shutdown problems.
exit 0
================================================
FILE: Dockerfile
================================================
ARG FROM=debian:bullseye
FROM ${FROM}

ENV DEBIAN_FRONTEND=noninteractive
# Avoid needing a Rust toolchain when pip builds the "cryptography" wheel.
ENV CRYPTOGRAPHY_DONT_BUILD_RUST=1

# System packages: build toolchain for pip wheels, Debian-packaged Python
# dependencies, and the external tools the ETL pipeline shells out to
# (poppler-utils, pst-utils, tesseract-ocr, file, curl).
RUN apt-get update && apt-get install --no-install-recommends --yes \
	build-essential \
	curl \
	file \
	libffi-dev \
	librabbitmq4 \
	libssl-dev \
	poppler-utils \
	pst-utils \
	python3-dateutil \
	python3-dev \
	python3-feedparser \
	python3-lxml \
	python3-pip \
	python3-pycurl \
	python3-pyinotify \
	python3-pysolr \
	python3-rdflib \
	python3-requests \
	python3-scrapy \
	python3-setuptools \
	python3-sparqlwrapper \
	python3-wheel \
	tesseract-ocr \
	# tesseract-ocr-all \
	&& apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

# Copy requirements.txt first so the pip layer is cached independently of
# source-code changes; --no-cache-dir keeps the pip download cache out of
# the image layer.
COPY ./src/opensemanticetl/requirements.txt /usr/lib/python3/dist-packages/opensemanticetl/requirements.txt
RUN pip3 install --no-cache-dir -r /usr/lib/python3/dist-packages/opensemanticetl/requirements.txt

COPY ./src/opensemanticetl /usr/lib/python3/dist-packages/opensemanticetl
COPY ./src/tesseract-ocr-cache/tesseract_cache /usr/lib/python3/dist-packages/tesseract_cache
COPY ./src/tesseract-ocr-cache/tesseract_fake /usr/lib/python3/dist-packages/tesseract_fake
COPY ./src/open-semantic-entity-search-api/src/entity_linking /usr/lib/python3/dist-packages/entity_linking
COPY ./src/open-semantic-entity-search-api/src/entity_manager /usr/lib/python3/dist-packages/entity_manager

COPY docker-entrypoint.sh /
RUN chmod 755 /docker-entrypoint.sh

# Unprivileged user the workers run as, plus a writable OCR cache dir
# (single RUN to avoid three separate image layers).
RUN adduser --system --disabled-password opensemanticetl \
	&& mkdir /var/cache/tesseract \
	&& chown opensemanticetl /var/cache/tesseract
USER opensemanticetl

# start Open Semantic ETL celery workers (reading and executing ETL tasks from message queue)
CMD ["/docker-entrypoint.sh"]
================================================
FILE: LICENSE
================================================
GNU GENERAL PUBLIC LICENSE
Version 3, 29 June 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU General Public License is a free, copyleft license for
software and other kinds of works.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
the GNU General Public License is intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users. We, the Free Software Foundation, use the
GNU General Public License for most of our software; it applies also to
any other work released this way by its authors. You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
To protect your rights, we need to prevent others from denying you
these rights or asking you to surrender the rights. Therefore, you have
certain responsibilities if you distribute copies of the software, or if
you modify it: responsibilities to respect the freedom of others.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must pass on to the recipients the same
freedoms that you received. You must make sure that they, too, receive
or can get the source code. And you must show them these terms so they
know their rights.
Developers that use the GNU GPL protect your rights with two steps:
(1) assert copyright on the software, and (2) offer you this License
giving you legal permission to copy, distribute and/or modify it.
For the developers' and authors' protection, the GPL clearly explains
that there is no warranty for this free software. For both users' and
authors' sake, the GPL requires that modified versions be marked as
changed, so that their problems will not be attributed erroneously to
authors of previous versions.
Some devices are designed to deny users access to install or run
modified versions of the software inside them, although the manufacturer
can do so. This is fundamentally incompatible with the aim of
protecting users' freedom to change the software. The systematic
pattern of such abuse occurs in the area of products for individuals to
use, which is precisely where it is most unacceptable. Therefore, we
have designed this version of the GPL to prohibit the practice for those
products. If such problems arise substantially in other domains, we
stand ready to extend this provision to those domains in future versions
of the GPL, as needed to protect the freedom of users.
Finally, every program is threatened constantly by software patents.
States should not allow patents to restrict development and use of
software on general-purpose computers, but in those that do, we wish to
avoid the special danger that patents applied to a free program could
make it effectively proprietary. To prevent this, the GPL assures that
patents cannot be used to render the program non-free.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Use with the GNU Affero General Public License.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU Affero General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the special requirements of the GNU Affero General Public License,
section 13, concerning interaction through a network will apply to the
combination as such.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
{one line to give the program's name and a brief idea of what it does.}
Copyright (C) {year} {name of author}
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If the program does terminal interaction, make it output a short
notice like this when it starts in an interactive mode:
{project} Copyright (C) {year} {fullname}
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, your program's commands
might be different; for a GUI interface, you would use an "about box".
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU GPL, see
<http://www.gnu.org/licenses/>.
The GNU General Public License does not permit incorporating your program
into proprietary programs. If your program is a subroutine library, you
may consider it more useful to permit linking proprietary applications with
the library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License. But first, please read
<http://www.gnu.org/philosophy/why-not-lgpl.html>.
================================================
FILE: build-deb
================================================
#!/bin/sh
# Build the Open Semantic ETL Debian packages.
#
# Produces two .deb files in the current working directory:
#   open-semantic-etl_<version>.deb               (preconfigured for Solr)
#   open-semantic-etl-elasticsearch_<version>.deb (preconfigured for Elasticsearch)
#
# Must be run from the repository root (expects ./DEBIAN, ./etc, ./usr, ./src).

# Abort on the first failing command instead of silently packaging a broken tree.
set -e

VERSION=$(date +%y.%m.%d)
PACKAGE=open-semantic-etl_${VERSION}.deb
BUILDDIR=/tmp/open-semantic-etl-$$.deb

# Remove the temporary build tree on exit (the previous version leaked it).
trap 'rm -rf "${BUILDDIR}"' EXIT

#
# Build standard package (preconfigured for Solr)
#
echo "Building ${PACKAGE} in temp directory ${BUILDDIR}"

mkdir "${BUILDDIR}"
cp -a DEBIAN "${BUILDDIR}"/
cp -a etc "${BUILDDIR}"/
cp -a usr "${BUILDDIR}"/
mkdir -p "${BUILDDIR}"/usr/lib/python3/dist-packages
cp -a src/* "${BUILDDIR}"/usr/lib/python3/dist-packages/
mkdir -p "${BUILDDIR}"/var/cache/tesseract
mkdir -p "${BUILDDIR}"/var/opensemanticsearch/media/thumbnails

# Build standard package (preconfigured for Solr)
dpkg -b "${BUILDDIR}" "${PACKAGE}"

#
# Build alternate package (preconfigured for Elasticsearch)
#

# change config file and set export plugin to Elasticsearch
PACKAGE=open-semantic-etl-elasticsearch_${VERSION}.deb

echo "Building ${PACKAGE} in temp directory ${BUILDDIR}"

# Toggle the comment markers in etc/opensemanticsearch/etl so that
# config['export'] / config['index'] point to Elasticsearch instead of Solr:
# comment out the Solr lines, uncomment the Elasticsearch lines.
sed -r \
    -e "s/(config\['export'\] = 'export_solr')/#\1/g" \
    -e "s/(config\['index'\] = 'core1')/#\1/g" \
    -e "s/(#)(config\['export'\] = 'export_elasticsearch')/\2/" \
    -e "s/(#)(config\['index'\] = 'opensemanticsearch')/\2/" \
    -i "${BUILDDIR}/etc/opensemanticsearch/etl"

# todo: delete dependency on pysolr

# Build the alternate package
dpkg -b "${BUILDDIR}" "${PACKAGE}"
================================================
FILE: docker-compose.test.yml
================================================
# Docker automated-test compose file (legacy v1 format: no top-level "version"
# key). "sut" (system under test) builds the image from the local Dockerfile
# and runs the ETL test suite inside it.
sut:
build: .
command: /usr/lib/python3/dist-packages/opensemanticetl/test/run_tests.sh
================================================
FILE: docker-compose.ubuntu.test.yml
================================================
# Same test harness as docker-compose.test.yml, but in compose v3 format and
# overriding the base image via the Dockerfile build arg FROM (ubuntu:focal).
version: '3'
services:
sut:
build:
context: .
args:
FROM: ubuntu:focal
command: /usr/lib/python3/dist-packages/opensemanticetl/test/run_tests.sh
================================================
FILE: docker-entrypoint.sh
================================================
#! /bin/sh
# docker-entrypoint for opensemanticsearch/open-semantic-etl

# Block until the "apps" container answers HTTP, polling once per second
# (curl: 1 s timeout, silent, fail on HTTP errors, output discarded).
until curl -m 1 -sf http://apps >/dev/null 2>&1; do
    sleep 1
done

# Replace this shell with the ETL task worker process.
exec /usr/bin/python3 /usr/lib/python3/dist-packages/opensemanticetl/tasks.py
================================================
FILE: etc/opensemanticsearch/blacklist/blacklist-url
================================================
# Blacklist of URLs
================================================
FILE: etc/opensemanticsearch/blacklist/blacklist-url-prefix
================================================
# Blacklist of URL Prefixes like domains or paths
================================================
FILE: etc/opensemanticsearch/blacklist/blacklist-url-regex
================================================
# Blacklist URLs with text patterns by regular expressions (regex)
================================================
FILE: etc/opensemanticsearch/blacklist/blacklist-url-suffix
================================================
# Blacklist of URL Suffixes like file endings
.css
.CSS
.Css
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_extract_law/blacklist-lawcode-if-no-clause
================================================
# Preferred labels of Law codes will be only added to facet "Law code",
# if the following configured (alternate) labels are directly before or after a
# law clause (f.e. in text "abc § 123 CC xyz"), but not if such blacklisted
# (alternate) label stands alone
# (f.e. in text "abc CC xyz" or in "CC: mail@domain") because it is too ambiguous
# too ambiguous alternate label from Wikidata entity Q206834 "Swiss Civil Code"
CC
# too ambiguous alternate label from Wikidata entity Q56045 "Basic Law for the Federal Republic of Germany"
GG
# too ambiguous alternate label from Wikidata entity Q187719 "Corpus Juris Civilis"
Institutes
# too ambiguous alternate label from Wikidata entity Q7101313 "Oregon Revised Statutes"
ORS
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype
================================================
# Blacklist of contenttypes
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-prefix
================================================
# Blacklist of contenttype prefixes
# Open Office / Libreoffice / MS Office
# Open document format and MS office open xml format is a zip archive with the document as XML, the embedded images and meta data as XML
# Tika will extract the main content, which - unless you are doing forensics - is enough in most cases.
# So we don't want to additionally handle each single (metadata) file in this archive, so we deactivate the ZIP plugin for that content type
# Since this is a prefix blacklist, it will stop unzipping of application/vnd.oasis.opendocument.text, application/vnd.oasis.opendocument.spreadsheet and so on ...
application/vnd.oasis.opendocument.
application/vnd.openxmlformats-officedocument.
application/msword
application/vnd.ms-word.
application/msexcel
application/vnd.ms-excel.
application/mspowerpoint
application/vnd.ms-powerpoint.
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-regex
================================================
# Blacklist contenttypes with text patterns by regular expressions (regex)
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-suffix
================================================
# Blacklist of contenttype suffixes
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype
================================================
# Whitelist of contenttypes
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-prefix
================================================
# Whitelist of contenttype prefixes
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-regex
================================================
# Whitelist contenttypes with text patterns by regular expressions (regex)
================================================
FILE: etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-suffix
================================================
# Whitelist of contenttype suffixes
================================================
FILE: etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname
================================================
language_s
content_type_ss
content_type_group_ss
AEB Bracket Value_ss
AE Setting_ss
AF Area Height_ss
AF Area Width_ss
AF Area X Positions_ss
AF Area Y Positions_ss
AF Image Height_ss
AF Image Width_ss
AF Point Count_ss
AF Point Selected_ss
AF Points in Focus_ss
Aperture Value_ss
Auto Exposure Bracketing_ss
Auto ISO_ss
Auto Rotate_ss
Base ISO_ss
Bulb Duration_ss
Camera Info Array_ss
Camera Serial Number_ss
Camera Temperature_ss
Camera Type_ss
Canon Model ID_ss
Contrast_ss
Components Configuration_ss
Compressed Bits Per Pixel_ss
Compression_ss
Color Balance Array_ss
Color Space_ss
Color Temperature_ss
Color Tone_ss
Content-Encoding_s
Continuous Drive Mode_ss
Control Mode_ss
Custom Functions_ss
Custom Rendered_ss
created_ss
Creation-Date_ss
Data BitsPerSample_ss
Data PlanarConfiguration_ss
Data Precision_ss
Data SampleFormat_ss
Data SignificantBitsPerSample_ss
date_ss
dc:format_ss
dcterms:created_ss
dcterms:modified_ss
Dimension ImageOrientation_ss
Dimension PixelAspectRatio_ss
Digital Zoom_ss
Display Aperture_ss
Easy Shooting Mode_ss
embeddedResourceType_ss
Exif Version_ss
exif:DateTimeOriginal_ss
exif:ExposureTime_ss
exif:Flash_ss
exif:FocalLength_ss
exif:FNumber_ss
Exif Image Height_ss
Exif Image Width_ss
exif:IsoSpeedRatings_ss
Exposure Bias Value_ss
Exposure Compensation_ss
Exposure Mode_ss
Exposure Time_ss
F-Number_ss
F Number_ss
File Name_ss
File Length_ss
File Modified Date_ss
File Info Array_ss
File Size_ss
Firmware Version_ss
Flash_ss
FlashPix Version_ss
Flash Activity_ss
Flash Details_ss
Flash Exposure Compensation_ss
Flash Guide Number_ss
Focal Length_ss
Flash Mode_ss
Focal Plane Resolution Unit_ss
Focal Plane X Resolution_ss
Focal Plane Y Resolution_ss
Focal Units per mm_ss
Focus Continuous_ss
Focus Distance Lower_ss
Focus Distance Upper_ss
Focus Mode_ss
Focus Type_ss
height_ss
ISO Speed Ratings_ss
IHDR_ss
Image Height_ss
Image Number_ss
Image Size_ss
Image Width_ss
Image Type_ss
Interoperability Index_ss
Interoperability Version_ss
Iso_ss
Last-Modified_ss
Last-Save-Date_ss
Lens Type_ss
Long Focal Length_ss
Macro Mode_ss
Manual Flash Output_ss
Max Aperture_ss
Max Aperture Value_ss
Measured Color Array_ss
Measured EV_ss
meta:creation-date_ss
meta:save-date_ss
Metering Mode_ss
Min Aperture_ss
modified_ss
ND Filter_ss
Number of Components_ss
Number of Tables_ss
Orientation_ss
Optical Zoom Code_ss
pdf:PDFVersion_ss
pdf:docinfo:created_ss
pdf:docinfo:creator_tool_ss
pdf:docinfo:modified_ss
pdf:docinfo:producer_ss
pdf:encrypted_ss
pdf:charsPerPage_ss
pdf:unmappedUnicodeCharsPerPage_ss
Photo Effect_ss
producer_ss
Record Mode_ss
Related Image Height_ss
Related Image Width_ss
Resolution Unit_ss
Resolution Units_ss
Saturation_ss
sBIT sBIT_RGBAlpha_ss
Scene Capture Type_ss
Sensing Method_ss
Sequence Number_ss
Serial Number Format_ss
Slow Shutter_ss
Sharpness_ss
Short Focal Length_ss
Shutter Speed Value_ss
Spot Metering Mode_ss
SRAW Quality_ss
Target Aperture_ss
Target Exposure Time_ss
tiff:BitsPerSample_ss
tiff:ImageLength_ss
tiff:ImageWidth_ss
tiff:Make_ss
tiff:Model_ss
tiff:Orientation_ss
tiff:ResolutionUnit_ss
tiff:XResolution_ss
tiff:YResolution_ss
Thumbnail Height Pixels_ss
Thumbnail Width Pixels_ss
Thumbnail Image Valid Area_ss
Thumbnail Length_ss
Thumbnail Offset_ss
Transparency Alpha_ss
Valid AF Point Count_ss
width_ss
X-Parsed-By_ss
X-TIKA:parse_time_millis_ss
X Resolution_ss
xmpTPg:NPages_ss
xmp:CreatorTool_ss
YCbCr Positioning_ss
Y Resolution_ss
Zoom Source Width_ss
Zoom Target Width_ss
================================================
FILE: etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix
================================================
etl_
X-TIKA
AF Point
Chroma
Compression
Component
Date/Time
Measured EV
Primary AF Point
Self Timer
Unknown Camera Setting
Unknown tag
White Balance
access_permission:
================================================
FILE: etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix
================================================
_i
_is
_l
_ls
_b
_bs
_f
_fs
_d
_ds
_dt
_dts
_uri_ss
_matchtext_ss
================================================
FILE: etc/opensemanticsearch/blacklist/whitelist-url
================================================
# Whitelist of URLs
================================================
FILE: etc/opensemanticsearch/blacklist/whitelist-url-prefix
================================================
# Whitelist of URL Prefixes like domains or paths
================================================
FILE: etc/opensemanticsearch/blacklist/whitelist-url-regex
================================================
# Whitelist URLs with text patterns by regular expressions (regex)
================================================
FILE: etc/opensemanticsearch/blacklist/whitelist-url-suffix
================================================
# Whitelist of URL Suffixes like file endings
================================================
FILE: etc/opensemanticsearch/connector-files
================================================
# -*- coding: utf-8 -*-
# Config for opensemanticsearch-index-file
# print Debug output
#config['verbose'] = True
# Index files again even if indexed before and modification time of file unchanged
#config['force'] = True
#
# Mapping filename to URI
#
# if the users have another path (a mountpoint with a different path than the server's full path)
# or protocol (http:// instead of file://)
# you can map the servers path to the users path
# default: user can access the file system, so /fullpath/filename will be mapped to file:///fullpath/filename
config['mappings'] = { "/": "file:///" }
# If documents access not via filesystem but via website (http)
# your files in /var/www/documents/ should be mapped to http://www.opensemanticsearch.org/documents/
#config['mappings'] = { "/var/www/documents/": "http://www.opensemanticsearch.org/documents/" }
#
# UI Path navigator: Strip parts of path facet
#
# The path facet is the sidebar component to navigate (sub)paths.
# If all your different directories are in one path like /documents
# or even worse the main content dirs are subdirs like /mnt/fileserver/onesubdir and /mnt/fileserver/othersubdirectory
# you might want that the user can select or navigate the subdirectories directly (which from the content perspective are main dirs)
# instead of forcing the user first navigate to ./mnt, then to ./fileserver and so on...
# this option won't change the URI (which is the base of this option and can be mapped and stripped above),
# it will only change/strip/shorten the path facet in the interactive navigation of the user interface
#config['facet_path_strip_prefix'] = [ "file:///home/", "file://" ]
================================================
FILE: etc/opensemanticsearch/connector-web
================================================
# -*- coding: utf-8 -*-
#
# Config for opensemanticsearch-index-web-crawl
#
#
# common file extensions that are not followed if they occur in links
#
config['webcrawler_deny_extensions'] = [
# archives
'7z', '7zip', 'bz2', 'rar', 'tar', 'tar.gz', 'xz', 'zip',
# images
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', 'cdr', 'ico',
# audio
'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
# video
'3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
'm4a', 'm4v', 'flv', 'webm',
# office suites (commented, since we want to index office documents)
#'xls', 'xlsx', 'ppt', 'pptx', 'pps', 'doc', 'docx', 'odt', 'ods', 'odg',
#'odp', 'pdf',
# other
'css', 'exe', 'bin', 'rss', 'dmg', 'iso', 'apk'
]
# Uncomment, if you do not want to exclude file extensions
# Warning: You might not want to download Gigabytes or Terabytes of archives, videos, CD-ROM/DVD ISOs and so on...
#config['webcrawler_deny_extensions'] = []
================================================
FILE: etc/opensemanticsearch/enhancer-rdf
================================================
# -*- coding: utf-8 -*-
# Config for RDF metadata server
# URL of the meta data server (RDF)
# if set to False don't use additional metadata from server (like tags or annotations)
#
# Templates:
# [uri] for URL of annotated page
# [uri_md5] for MD5 Sum of the URL
config['metaserver'] = False
# Use Drupal as meta server
#config['metaserver'] = [ 'http://localhost/drupal/rdf?uri=[uri]' ]
# Use Semantic Mediawiki as meta server
#config['metaserver'] = [ 'http://localhost/mediawiki/index.php/Special:ExportRDF?xmlmime=rdf&page=[uri_md5]' ]
# Use tagger app as meta server
config['metaserver'] = [ 'http://localhost/search-apps/annotate/rdf?uri=[uri]' ]
# mapping of RDF properties or RDF classes to facets / columns
config['property2facet'] = {
'http://www.wikidata.org/entity/Q5': 'person_ss',
'http://www.wikidata.org/entity/Q43229': 'organization_ss',
'http://www.wikidata.org/entity/Q178706': 'organization_ss',
'http://www.wikidata.org/entity/Q18810687': 'organization_ss',
'http://www.wikidata.org/entity/Q2221906': 'location_ss',
'http://schema.org/Person': 'person_ss',
'http://schema.org/Organization': 'organization_ss',
'http://schema.org/Place': 'location_ss',
'http://schema.org/location': 'location_ss',
'http://schema.org/address': 'location_ss',
'http://schema.org/keywords': 'tag_ss',
'http://schema.org/Comment': 'comment_txt',
'http://semantic-mediawiki.org/swivt/1.0#specialProperty_dat': 'meta_date_dts'
}
================================================
FILE: etc/opensemanticsearch/etl
================================================
# -*- coding: utf-8 -*-
#
# ETL config for connector(s)
#
# print debug messages
#config['verbose'] = True
#
# Languages for language specific index
#
# Each document is analyzed without grammar rules in the index fields like content, additionally it can be added/copied to language specific index fields/analyzers
# Document language is autodetected by default plugin enhance_detect_language_tika_server
# If index support enhanced analytics for specific languages, we can add/copy data to language specific fields/analyzers
# Set which languages are configured and shall be used in index for language specific analysis/stemming/synonyms
# Default / if not set all languages that are supported will be analyzed additionally language specific
#config['languages'] = ['en','de','fr','hu','it','pt','nl','cz','ro','ru','ar','fa']
# force to language specific analysis additional in this language(s) grammar & synonyms, even if language autodetection detects other language
#config['languages_force'] = ['en','de']
# only use language for language specific analysis which are added / uncommented later
#config['languages'] = []
# add English
#config['languages'].append('en')
# add German / Deutsch
#config['languages'].append('de')
# add French / Francais
#config['languages'].append('fr')
# add Hungarian
#config['languages'].append('hu')
# add Spanish
#config['languages'].append('es')
# add Portuguese
#config['languages'].append('pt')
# add Italian
#config['languages'].append('it')
# add Czech
#config['languages'].append('cz')
# add Dutch
#config['languages'].append('nl')
# add Romanian
#config['languages'].append('ro')
# add Russian
#config['languages'].append('ru')
#
# Index/storage
#
#
# Solr URL and port
#
config['export'] = 'export_solr'
# Solr server
config['solr'] = 'http://localhost:8983/solr/'
# Solr core
config['index'] = 'opensemanticsearch'
#
# Elastic Search
#
#config['export'] = 'export_elasticsearch'
# Index
#config['index'] = 'opensemanticsearch'
#
# Tika for text and metadata extraction
#
# Tika server (with tesseract-ocr-cache)
# default: http://localhost:9998
#config['tika_server'] = 'http://localhost:9998'
# Tika server with fake OCR cache of tesseract-ocr-cache used if OCR in later ETL tasks
# default: http://localhost:9999
#config['tika_server_fake_ocr'] = 'http://localhost:9999'
#
# Annotations
#
# add plugin for annotation/tagging/enrichment of documents
config['plugins'].append('enhance_annotations')
# set alternate URL of annotation server
#config['metadata_server'] = 'http://localhost/search-apps/annotate/json'
#
# RDF Knowledge Graph
#
# add RDF Metadata Plugin for granular import of RDF file statements to entities of knowledge graphs
config['plugins'].append('enhance_rdf')
#
# Config for OCR (automatic text recognition of text in images)
#
# Disable OCR for image files (i.e for more performance and/or because you don't need the text within images or have only photos without photographed text)
#config['ocr'] = False
# Option to disable OCR of embedded images in PDF by Tika
# so (if alternate plugin is enabled) OCR will be done only by alternate
# plugin enhance_pdf_ocr (which else works only as fallback, if Tika exceptions)
#config['ocr_pdf_tika'] = False
# Use OCR cache
config['ocr_cache'] = '/var/cache/tesseract'
# Option to disable OCR cache
#config['ocr_cache'] = None
# Do OCR for images embedded in PDF documents (i.e. designed images or scanned or photographed documents)
config['plugins'].append('enhance_pdf_ocr')
#OCR language
#If other than english you have to install package tesseract-XXX (tesseract language support) for your language
#and set ocr_lang to this value (be careful, the tesseract package for english is "eng" (not "en") german is named "deu", not "de"!)
# set OCR language to English/default
#config['ocr_lang'] = 'eng'
# set OCR language to German/Deutsch
#config['ocr_lang'] = 'deu'
# set multiple OCR languages
config['ocr_lang'] = 'eng+deu'
#
# Regex pattern for extraction
#
# Enable Regex plugin
config['plugins'].append('enhance_regex')
# Regex config for IBAN extraction
config['regex_lists'].append('/etc/opensemanticsearch/regex/iban.tsv')
#
# Email address and email domain extraction
#
config['plugins'].append('enhance_extract_email')
#
# Phone number extraction
#
config['plugins'].append('enhance_extract_phone')
#
# Config for Named Entities Recognition (NER) and Named Entity Linking (NEL)
#
# Enable Entity Linking / Normalization and dictionary based Named Entities Extraction from thesaurus and ontologies
config['plugins'].append('enhance_entity_linking')
# Enable SpaCy NER plugin
config['plugins'].append('enhance_ner_spacy')
# Spacy NER Machine learning classifier (for which language and with which/how many classes)
# Default classifier if no classifier for specific language
# disable NER for languages where no classifier defined in config['spacy_ner_classifiers']
config['spacy_ner_classifier_default'] = None
# Set default classifier to English (only if you are sure, that all documents you index are english)
# config['spacy_ner_classifier_default'] = 'en_core_web_sm'
# Set default classifier to German (only if you are sure, that all documents you index are german)
# config['spacy_ner_classifier_default'] = 'de_core_news_sm'
# Language specific classifiers (mapping to autodetected document language to Spacy classifier / language)
#
# You have to download additional language classifiers for example english (en) or german (de) by
# python3 -m spacy download en
# python3 -m spacy download de
# ...
config['spacy_ner_classifiers'] = {
'da': 'da_core_news_sm',
'de': 'de_core_news_sm',
'en': 'en_core_web_sm',
'es': 'es_core_news_sm',
'fr': 'fr_core_news_sm',
'it': 'it_core_news_sm',
'lt': 'lt_core_news_sm',
'nb': 'nb_core_news_sm',
'nl': 'nl_core_news_sm',
'pl': 'pl_core_news_sm',
'pt': 'pt_core_news_sm',
'ro': 'ro_core_news_sm',
}
# Enable Stanford NER plugin
#config['plugins'].append('enhance_ner_stanford')
# Stanford NER Machine learning classifier (for which language and with how many classes, which need more computing time)
# Default classifier if no classifier for specific language
# disable NER for languages where no classifier defined in config['stanford_ner_classifiers']
config['stanford_ner_classifier_default'] = None
# Set default classifier to English (only if you are sure, that all documents you index are english)
#config['stanford_ner_classifier_default'] = '/usr/share/java/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz'
# Set default classifier to German (only if you are sure, that all documents you index are german)
#config['stanford_ner_classifier_default'] = '/usr/share/java/stanford-ner/classifiers/german.conll.germeval2014.hgc_175m_600.crf.ser.gz'
# Language specific classifiers (mapping to autodetected document language)
# Before you have to download additional language classifiers to the configured path
config['stanford_ner_classifiers'] = {
'en': '/usr/share/java/stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
'es': '/usr/share/java/stanford-ner/classifiers/spanish.ancora.distsim.s512.crf.ser.gz',
'de': '/usr/share/java/stanford-ner/classifiers/german.conll.germeval2014.hgc_175m_600.crf.ser.gz',
}
# If Stanford NER JAR not in standard path
config['stanford_ner_path_to_jar'] = "/usr/share/java/stanford-ner/stanford-ner.jar"
# Stanford NER Java options like RAM settings
config['stanford_ner_java_options'] = '-mx1000m'
#
# Law clauses extraction
#
config['plugins'].append('enhance_extract_law')
#
# Money extraction
#
config['plugins'].append('enhance_extract_money')
#
# Neo4j graph database
#
# exports named entities and relations to Neo4j graph database
# Enable plugin to export entities and connections to Neo4j graph database
#config['plugins'].append('export_neo4j')
# Neo4j server
#config['neo4j_host'] = 'localhost'
# Username & password
#config['neo4j_user'] = 'neo4j'
#config['neo4j_password'] = 'neo4j'
================================================
FILE: etc/opensemanticsearch/facets
================================================
# Warning: Do not edit here!
# This config file will be overwritten
# by web admin user interface after config changes
# and on initialization by /var/lib/opensemanticsearch/manage.py entities
#
# Default facet config if no facets are configured
#
config['facets'] = {
'author_ss': {'label': 'Author(s)', 'uri': 'http://schema.org/Author', 'facet_limit': '10', 'snippets_limit': '10'},
'tag_ss': {'label': 'Tags', 'uri': 'http://schema.org/keywords', 'facet_limit': '10', 'snippets_limit': '10'},
'annotation_tag_ss': {'label': 'Tags (Hypothesis)', 'uri': 'http://schema.org/keywords', 'facet_limit': '10', 'snippets_limit': '10'},
'person_ss': {'label': 'Persons', 'uri': 'http://schema.org/Person', 'facet_limit': '10', 'snippets_limit': '10'},
'organization_ss': {'label': 'Organizations', 'uri': 'http://schema.org/Organization', 'facet_limit': '10', 'snippets_limit': '10'},
'location_ss': {'label': 'Locations', 'uri': 'http://schema.org/Place', 'facet_limit': '10', 'snippets_limit': '10'},
'language_s': {'label': 'Language', 'uri': 'http://schema.org/inLanguage', 'facet_limit': '10', 'snippets_limit': '10'},
'email_ss': {'label': 'Email', 'uri': 'http://schema.org/email', 'facet_limit': '10', 'snippets_limit': '10'},
'Message-From_ss': {'label': 'Message from', 'uri': 'http://schema.org/sender', 'facet_limit': '10', 'snippets_limit': '10'},
'Message-To_ss': {'label': 'Message to', 'uri': 'http://schema.org/toRecipient', 'facet_limit': '10', 'snippets_limit': '10'},
'Message-CC_ss': {'label': 'Message CC', 'uri': 'http://schema.org/ccRecipient', 'facet_limit': '10', 'snippets_limit': '10'},
'Message-BCC_ss': {'label': 'Message BCC', 'uri': 'http://schema.org/bccRecipient', 'facet_limit': '10', 'snippets_limit': '10'},
'hashtag_ss': {'label': 'Hashtags', 'uri': 'http://schema.org/keywords', 'facet_limit': '10', 'snippets_limit': '10'},
'email_domain_ss': {'label': 'Email domain', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'phone_normalized_ss': {'label': 'Phone numbers', 'uri': 'https://schema.org/telephone', 'facet_limit': '10', 'snippets_limit': '10'},
'phone_ss': {'label': 'Phone numbers', 'uri': 'https://schema.org/telephone', 'facet_limit': '10', 'snippets_limit': '10'},
'money_ss': {'label': 'Money', 'uri': 'http://schema.org/MonetaryAmount', 'facet_limit': '10', 'snippets_limit': '10'},
'iban_ss': {'label': 'IBAN', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'law_clause_ss': {'label': 'Law clause', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'law_code_ss': {'label': 'Law code', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'law_code_clause_ss': {'label': 'Law code clause', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'filename_extension_s': {'label': 'Filename extension', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'content_type_group_ss': {'label': 'Content type group', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'content_type_ss': {'label': 'Content type', 'uri': '', 'facet_limit': '10', 'snippets_limit': '10'},
'law_codes_rdf_ss': {'label': '', 'uri': '', 'facet_limit': '0', 'snippets_limit': '0'},
}
================================================
FILE: etc/opensemanticsearch/filemonitoring/files
================================================
================================================
FILE: etc/opensemanticsearch/ocr/dictionary.txt
================================================
================================================
FILE: etc/opensemanticsearch/regex/email.tsv
================================================
[\w\.-]+@[\w\.-]+ email_ss
================================================
FILE: etc/opensemanticsearch/regex/iban.tsv
================================================
\b[a-zA-Z]{2}(?: ?)[0-9]{2}(?: ?)[a-zA-Z0-9]{4}(?: ?)[0-9]{7}(?: ?)([a-zA-Z0-9]?){0,16}\b iban_ss
================================================
FILE: etc/opensemanticsearch/regex/phone.tsv
================================================
[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9] phone_ss
================================================
FILE: etc/opensemanticsearch/task_priorities
================================================
# Priorities of document processing in task queue
# The higher the additional priority is, the earlier the document will be processed by task queue.
#
# Priorities in task queue by filename extension
#
# the higher the additional priority, the earlier file with this file name extension will be processed
# the lower the additional priority, the later files with this file name extension will be processed
config['priorities_filename_extension'] = {
'.pdf': 5,
'.doc': 5,
'.docx': 5,
'.xls': 5,
'.xlsx': 5,
'.odp': 5,
'.ppt': 5,
'.pptx': 5,
'.eml': 5,
'.pst': 4,
'.csv': 4,
'.tsv': 4,
'.txt': 4,
'.htm': 3,
'.html': 3,
'.md': 3,
'.jpg': 1,
'.jpeg': 1,
'.gif': 1,
'.png': 1,
'.tif': 1,
'.mp3': 1,
'.mp4': 1,
'.wav': 1,
'.ini': -3,
'.bat': -4,
'.apk': -5,
'.bin': -5,
'.com': -5,
'.deb': -5,
'.exe': -5,
'.msi': -5,
'.php': -5,
'.cache': -5,
'.h': -5,
'.pl': -5,
'.py': -5,
'.pyc': -5,
'.js': -5,
'.css': -5,
'.ova': -5,
'.iso': -5,
}
#
# Priorities on parts of filenames
#
# If a configured string is part of the filename, additional priority is set
config['priorities_filename'] = {
'corrupt': 5,
'illegal': 5,
'important': 5,
'relevant': 5,
'problem': 5,
'urgent': 5,
'passwor': 5,
'account': 4,
'agreement': 4,
'bank': 4,
'complian': 4,
'cost': 4,
'contract': 4,
'legal': 4,
'treaty': 4,
}
================================================
FILE: etc/systemd/system/opensemanticetl-filemonitoring.service
================================================
[Unit]
Description=Open Semantic ETL filemonitoring
After=network.target
[Service]
Type=simple
User=opensemanticetl
ExecStart=/usr/bin/opensemanticsearch-filemonitoring --fromfile /etc/opensemanticsearch/filemonitoring/files
Restart=always
[Install]
WantedBy=multi-user.target
================================================
FILE: etc/systemd/system/opensemanticetl.service
================================================
[Unit]
Description=Open Semantic ETL
After=network.target
[Service]
Type=simple
User=opensemanticetl
Environment=OMP_THREAD_LIMIT=1
ExecStart=/usr/bin/etl_tasks
Restart=always
[Install]
WantedBy=multi-user.target
================================================
FILE: src/opensemanticetl/__init__.py
================================================
================================================
FILE: src/opensemanticetl/clean_title.py
================================================
import sys
# Replace empty title with useful info from other fields for better usability
class clean_title(object):
    # Fill an empty or missing 'title_txt' field from other metadata so that
    # search results always show a usable title.

    def process(self, parameters=None, data=None):
        """Set data['title_txt'] from 'subject_ss' (i.e. emails) or, failing
        that, from the filename part of parameters['id'].

        Returns the (parameters, data) tuple; data may be modified in place.
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        #
        # if no title but subject (i.e. emails), use subject as document / result title
        #
        try:
            if 'title_txt' not in data:
                # no title field at all: copy subject if available
                if 'subject_ss' in data:
                    data['title_txt'] = data['subject_ss']
            else:
                # title field exists but is empty: use a non-empty subject
                if not data['title_txt']:
                    if 'subject_ss' in data and data['subject_ss']:
                        data['title_txt'] = data['subject_ss']
        # was a bare "except:", which also swallowed KeyboardInterrupt/SystemExit
        except Exception:
            sys.stderr.write(
                "Error while trying to clean empty title with subject\n")

        # if no title yet, use the filename part of URI
        try:
            if 'title_txt' not in data:
                # get filename from URI (the part after the last slash)
                filename = parameters['id'].split('/')[-1]
                data['title_txt'] = filename
        except Exception:
            sys.stderr.write(
                "Error while trying to clean empty title with filename\n")

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_annotations.py
================================================
import os
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import etl_plugin_core
# Get tags and annotations from annotation server
class enhance_annotations(etl_plugin_core.Plugin):
    # Fetch tags and annotations for a document from the annotation server
    # and merge them into the document data.

    def process(self, parameters=None, data=None):
        """Query the annotation server for the document URI in
        parameters['id'] and append every returned facet/value pair to data.

        Raises requests.HTTPError when the server answers with an error status.
        """
        parameters = {} if parameters is None else parameters
        data = {} if data is None else data

        doc_uri = parameters['id']

        # server resolution order: environment variable, ETL parameter, default
        annotation_server = os.getenv('OPEN_SEMANTIC_ETL_METADATA_SERVER')
        if not annotation_server:
            if 'metadata_server' in parameters:
                annotation_server = parameters['metadata_server']
            else:
                annotation_server = 'http://localhost/search-apps/annotate/json'

        # retry transient connection failures with exponential backoff
        session = requests.Session()
        retrying_adapter = HTTPAdapter(
            max_retries=Retry(total=10, backoff_factor=1))
        session.mount("https://", retrying_adapter)
        session.mount("http://", retrying_adapter)

        reply = session.get(annotation_server, params={'uri': doc_uri})
        reply.raise_for_status()

        # merge every returned facet and its values into the document data
        for facet, values in reply.json().items():
            etl_plugin_core.append(data, facet, values)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_contenttype_group.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Map/aggregate content type to content type group
#
class enhance_contenttype_group(object):
    """Map/aggregate detected content type(s) to a coarse content type group
    facet (e.g. 'Spreadsheet', 'Text document', 'Image')."""

    # index field the aggregated group label(s) are written to
    fieldname = 'content_type_group_ss'

    # maps a content type (or content type prefix) to its group label;
    # matching is done with startswith(), so 'text', 'image', 'audio' and
    # 'video' act as prefixes covering all their subtypes
    contenttype_groups = {
        'application/vnd.ms-excel': 'Spreadsheet',
        'application/vnd.oasis.opendocument.spreadsheet': 'Spreadsheet',
        # fixed label typo (was 'Spreadseheet template')
        'application/vnd.oasis.opendocument.spreadsheet-template': 'Spreadsheet template',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'Spreadsheet',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'Spreadsheet template',
        'text': 'Text document',
        'application/gzip text': 'Text document',
        'application/pdf': 'Text document',
        'application/msword': 'Text document',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'Text document',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'Text document template',
        'application/vnd.oasis.opendocument.text': 'Text document',
        'application/vnd.oasis.opendocument.text-template': 'Text document template',
        'application/rtf': 'Text document',
        'application/vnd.ms-powerpoint': 'Presentation',
        'application/vnd.oasis.opendocument.presentation': 'Presentation',
        'application/vnd.oasis.opendocument.presentation-template': 'Presentation template',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'Presentation',
        'application/vnd.openxmlformats-officedocument.presentationml.template': 'Presentation template',
        'image': 'Image',
        'audio': 'Audio',
        'video': 'Video',
        'application/mp4': 'Video',
        'application/x-matroska': 'Video',
        'application/vnd.etsi.asic-e+zip': 'Electronic Signature Container',
        'Knowledge graph': 'Knowledge graph',
    }

    # maps a filename suffix to a group, for formats whose detected content
    # type is too generic (e.g. CSV is detected as plain text)
    suffix_groups = {
        '.csv': "Spreadsheet",
    }

    def process(self, parameters=None, data=None):
        """Derive group label(s) from data['content_type_ss'] and from the
        filename suffix of parameters['id']; write them (deduplicated) to
        data['content_type_group_ss'] if any matched."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        content_types = []
        if 'content_type_ss' in data:
            content_types = data['content_type_ss']
        # normalize single value to multivalued list structure
        if not isinstance(content_types, list):
            content_types = [content_types]

        groups = []

        # Contenttype to group
        for content_type in content_types:
            for mapped_content_type, group in self.contenttype_groups.items():
                if content_type.startswith(mapped_content_type):
                    if group not in groups:
                        groups.append(group)

        # Suffix to group (case-insensitive comparison)
        for suffix, group in self.suffix_groups.items():
            if parameters['id'].upper().endswith(suffix.upper()):
                if group not in groups:
                    groups.append(group)

        if len(groups) > 0:
            data[self.fieldname] = groups

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_csv.py
================================================
import sys
import os
import csv
import urllib.request
from etl import ETL
# import each row of CSV file to index
# write CSV cols to database columns or facets
class enhance_csv(object):
    """Import each row of a CSV/TSV file as its own document to the index,
    writing CSV columns to database columns or facets."""

    def __init__(self, verbose=False):
        self.verbose = verbose
        # ETL parameters snapshot used when exporting rows
        self.config = {}
        # column titles read from the title row, or False if none yet
        self.titles = False
        # keep downloaded temp files after import?
        self.cache = False
        self.encoding = 'utf-8'
        self.delimiter = None
        # first data row (1-based)
        self.start_row = 1
        # row number containing column titles (0 = no title row)
        self.title_row = 0
        # column / row filters; interpretation depends on *_include flags
        self.cols = []
        self.rows = []
        # if True, self.cols / self.rows are whitelists instead of blacklists
        self.cols_include = False
        self.rows_include = False
        # autodetect the CSV dialect with csv.Sniffer
        self.sniff_dialect = True
        self.quotechar = None
        self.doublequote = None
        self.escapechar = None

    def read_parameters(self, parameters, data):
        """Copy CSV options from ETL parameters (and the encoding from data,
        if a previous plugin detected it)."""
        if 'verbose' in parameters:
            if parameters['verbose']:
                self.verbose = True
        if 'encoding' in parameters:
            self.encoding = parameters['encoding']
        elif 'encoding_s' in data:
            self.encoding = data['encoding_s']
        if 'delimiter' in parameters:
            self.delimiter = parameters['delimiter']
        if 'cache' in parameters:
            self.cache = parameters['cache']
        if 'title_row' in parameters:
            if parameters['title_row']:
                self.title_row = parameters['title_row']
        if 'start_row' in parameters:
            if parameters['start_row']:
                self.start_row = parameters['start_row']
        if 'sniff_dialect' in parameters:
            self.sniff_dialect = parameters['sniff_dialect']
        if 'quotechar' in parameters:
            self.quotechar = parameters['quotechar']
        if 'doublequote' in parameters:
            self.doublequote = parameters['doublequote']
        if 'escapechar' in parameters:
            self.escapechar = parameters['escapechar']
        if 'rows' in parameters:
            self.rows = parameters['rows']
        if 'cols' in parameters:
            self.cols = parameters['cols']
        if 'rows_include' in parameters:
            self.rows_include = parameters['rows_include']
        if 'cols_include' in parameters:
            self.cols_include = parameters['cols_include']

    # Todo:
    #
    # If existing CSV parameter settings in CSV manager, use them
    # even if not importing within CSV manager
    #
    def add_csv_parameters_from_meta_settings(self, metaserver):
        """Not yet implemented: read per-file CSV settings from the CSV manager."""
        pass
        # get csv settings for this file from csvmanager
        # json = get csvserver
        # if delimiter in json:
        #     parameters['delimiter'] = json['delimiters']

    #
    # Build CSV dialect
    #
    # Autodetect and/or construct from parameters
    def get_csv_dialect(self):
        """Return kwargs for csv.reader: a sniffed (or default 'excel')
        dialect, overridden by any explicitly configured options."""
        kwargs = {}

        # automatically detect dialect
        sniffed_dialect = False
        if self.sniff_dialect:
            try:
                if self.verbose:
                    print("Opening {} for guessing CSV dialect".format(self.filename))
                # with-statement guarantees the file is closed; the former
                # try/finally raised NameError if open() itself failed
                with open(self.filename, newline='',
                          encoding=self.encoding) as csvfile:
                    if self.verbose:
                        print("Starting dialect guessing")
                    # sniff dialect in first 32 MB
                    sniffsize = 33554432
                    sniffed_dialect = csv.Sniffer().sniff(csvfile.read(sniffsize))
                if self.verbose:
                    print("Sniffed dialect: {}".format(sniffed_dialect))
            except KeyboardInterrupt:
                raise KeyboardInterrupt
            except BaseException as e:
                sys.stderr.write(
                    "Exception while CSV format autodetection for {}: {}".format(self.filename, e))

        if sniffed_dialect:
            kwargs['dialect'] = sniffed_dialect
        else:
            kwargs['dialect'] = 'excel'

        # Overwrite options, if set
        if self.delimiter:
            kwargs['delimiter'] = str(self.delimiter)
        if self.quotechar:
            kwargs['quotechar'] = str(self.quotechar)
        if self.escapechar:
            kwargs['escapechar'] = str(self.escapechar)
        if self.doublequote:
            kwargs['doublequote'] = self.doublequote

        return kwargs

    def set_titles(self, row):
        """Remember the column titles from the title row."""
        self.titles = []
        for col in row:
            self.titles.append(col)
        return self.titles

    def export_row_data_to_index(self, data, rownumber):
        """Run a (sub) ETL chain to index the data of one CSV row."""
        parameters = self.config.copy()

        # todo: all content plugins configurated, not only this one
        parameters['plugins'] = [
            'enhance_path',
            'enhance_entity_linking',
            'enhance_multilingual',
        ]

        etl = ETL()
        try:
            etl.process(parameters=parameters, data=data)
        # if exception because user interrupted by keyboard, respect this and abbort
        except KeyboardInterrupt:
            raise KeyboardInterrupt
        except BaseException as e:
            sys.stderr.write(
                "Exception adding CSV row {} : {}".format(rownumber, e))
            if 'raise_pluginexception' in self.config:
                if self.config['raise_pluginexception']:
                    raise e

    def import_row(self, row, rownumber, docid):
        """Index one CSV row as an own document; returns the column count."""
        colnumber = 0

        data = {}
        data['content_type_ss'] = "CSV row"
        data['container_s'] = docid
        data['page_i'] = str(rownumber)
        # unique id per row: parent document URI plus row number fragment
        data['id'] = docid + '#' + str(rownumber)

        for col in row:
            colnumber += 1

            # column filter: self.cols is a whitelist if cols_include, else a blacklist
            exclude_column = False
            if self.cols_include:
                if colnumber not in self.cols:
                    exclude_column = True
            else:
                if colnumber in self.cols:
                    exclude_column = True

            if not exclude_column:
                if self.titles and len(self.titles) >= colnumber:
                    fieldname = self.titles[colnumber - 1] + "_t"
                else:
                    fieldname = 'column_' + str(colnumber).zfill(2) + "_t"
                data[fieldname] = col

                # if number, save as float value, too
                try:
                    if self.titles and len(self.titles) >= colnumber:
                        fieldname = self.titles[colnumber - 1] + "_f"
                    else:
                        fieldname = 'column_' + str(colnumber).zfill(2) + "_f"
                    data[fieldname] = float(col)
                except ValueError:
                    pass

        self.export_row_data_to_index(data=data, rownumber=rownumber)

        return colnumber

    #
    # read parameters, analyze csv dialect and import row by row
    #
    def enhance_csv(self, parameters, data):
        """Import the CSV file referenced by parameters row by row.
        Returns the number of rows read."""
        self.config = parameters.copy()
        docid = parameters['id']

        #
        # Read parameters
        #
        self.read_parameters(parameters, data)

        if 'csvmanager' in parameters:
            # was a call to self.read_csv_parameters_from_meta_settings(...),
            # a method that does not exist (AttributeError); the implemented
            # hook is add_csv_parameters_from_meta_settings(metaserver)
            self.add_csv_parameters_from_meta_settings(
                metaserver=parameters['csvmanager'])

        # Download, if not a file(name) yet but URI reference
        # todo: move to csv manager or downloader plugin that in that case should use etl_web
        if 'filename' in parameters:
            is_tempfile = False
            self.filename = parameters['filename']
            # if exist delete protocol prefix file://
            if self.filename.startswith("file://"):
                self.filename = self.filename.replace("file://", '', 1)
        else:
            # Download URI to a tempfile
            # (was urlretrieve(self.filename), but self.filename is never set
            # in this branch — download the document URI instead)
            is_tempfile = True
            self.filename, headers = urllib.request.urlretrieve(docid)

        #
        # Get CSV dialect parameters
        #
        dialect_kwargs = self.get_csv_dialect()

        if self.verbose:
            print("Opening CSV file with Encoding {} and dialect {}".format(
                self.encoding, dialect_kwargs))

        # increase limits to maximum, since there are often text fields with longer texts
        csv.field_size_limit(sys.maxsize)

        rownumber = 0
        # was undefined in the stats print when no data row got imported
        count_columns = 0

        #
        # Open and read CSV row by row (with-statement closes the file,
        # which the previous implementation never did)
        #
        with open(self.filename, newline='', encoding=self.encoding) as csvfile:
            reader = csv.reader(csvfile, **dialect_kwargs)

            for row in reader:
                rownumber += 1

                #
                # If title row, read column titles
                #
                if rownumber == self.title_row:
                    if self.verbose:
                        print("Importing Titles from row {}".format(self.title_row))
                    self.set_titles(row)

                #
                # Import data row
                #
                if rownumber >= self.start_row:
                    # row filter: self.rows is a whitelist if rows_include, else a blacklist
                    exclude_row = False
                    if self.rows_include:
                        if rownumber not in self.rows:
                            exclude_row = True
                    else:
                        if rownumber in self.rows:
                            exclude_row = True

                    if exclude_row:
                        if self.verbose:
                            print("Excluding row {}".format(rownumber))
                    else:
                        if self.verbose:
                            print("Importing row {}".format(rownumber))
                        count_columns = self.import_row(
                            row, rownumber=rownumber, docid=docid)

        #
        # delete if downloaded tempfile
        #
        if not self.cache:
            if is_tempfile:
                os.remove(self.filename)

        #
        # Print stats
        #
        if self.verbose:
            print("Rows: " + str(rownumber))
            print("Cols: " + str(count_columns))

        return rownumber

    def process(self, parameters=None, data=None):
        """ETL plugin entry point: import the document row by row if its id
        has a CSV-like suffix (.csv, .tsv, .tab); otherwise pass through."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        docid = parameters['id']

        # if CSV (file suffix is .csv), enhance it (import row by row)
        if docid.lower().endswith('.csv') or docid.lower().endswith('.tsv') or docid.lower().endswith('.tab'):
            self.enhance_csv(parameters, data)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_detect_language_tika_server.py
================================================
import os
import sys
import time
import requests
# Extract text from filename
class enhance_detect_language_tika_server(object):
    # Detect the document language via the Tika server's /language/string
    # endpoint and store it in data['language_s'].

    def process(self, parameters=None, data=None):
        """Concatenate the document's text fields, send them to the Tika
        server for language detection and write the detected language code
        to data['language_s']. Retries forever on connection errors."""
        parameters = parameters if parameters is not None else {}
        data = data if data is not None else {}

        verbose = bool(parameters.get('verbose'))

        # Tika server resolution order: environment variable, ETL parameter, default
        tika_server = os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER')
        if not tika_server:
            if 'tika_server' in parameters:
                tika_server = parameters['tika_server']
            else:
                tika_server = 'http://localhost:9998'

        endpoint = tika_server + '/language/string'

        # collect text from all fields that may contain document text
        fields_to_analyse = ['title_txt', 'content_txt',
                             'description_txt', 'ocr_t', 'ocr_descew_t']
        text = ''
        for fieldname in fields_to_analyse:
            if fieldname in data:
                text = "{}{}\n".format(text, data[fieldname])

        if verbose:
            print("Calling Tika server for language detection from {}".format(endpoint))

        # retry until connected; the wait time between attempts doubles
        # until reaching a maximum of 120 seconds (2 minutes)
        attempts = 0
        wait_seconds = 1
        max_wait_seconds = 120
        while True:
            try:
                if attempts > 0:
                    print(
                        'Retrying to connect to Tika server in {} second(s).'.format(wait_seconds))
                    time.sleep(wait_seconds)
                    wait_seconds = min(wait_seconds * 2, max_wait_seconds)
                response = requests.put(endpoint, data=text.encode('utf-8'))
                break
            except requests.exceptions.ConnectionError as e:
                attempts += 1
                sys.stderr.write(
                    "Connection to Tika server (will retry in {} seconds) failed. Exception: {}\n".format(wait_seconds, e))

        language = response.content.decode('utf-8')
        if verbose:
            print("Detected language: {}".format(language))
        data['language_s'] = language

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_entity_linking.py
================================================
#
# Named Entity Extraction by Open Semantic Entity Search API dictionary
#
import requests
import sys
import time
from entity_linking.entity_linker import Entity_Linker
import etl
import etl_plugin_core
#
# split a taxonomy entry to separated index fields
#
def taxonomy2fields(taxonomy, field, separator="\t", subfields_suffix="_ss"):
    """Split taxonomy entries into one index field per hierarchy level.

    For every entry each level gets a field named
    "<field>_taxonomy<level><subfields_suffix>" whose value is the path from
    the root down to that level (levels joined by `separator`).
    """
    fields = {}

    # if not multivalued field, handle it like a one-element list/array structure
    entries = taxonomy if isinstance(taxonomy, list) else [taxonomy]

    for entry in entries:
        prefix = ''
        for level, part in enumerate(entry.split(separator)):
            fieldname = '{}_taxonomy{}{}'.format(field, level, subfields_suffix)
            # grow the path from the root to the current level
            prefix = part if not prefix else prefix + separator + part
            fields.setdefault(fieldname, []).append(prefix)

    return fields
class enhance_entity_linking(etl_plugin_core.Plugin):
    # Tag named entities in the document text by the Open Semantic Entity
    # Search API and write the matches to entity-type specific facet fields.

    def process(self, parameters=None, data=None):
        """Tag named entities in the document text.

        Uses either the local Entity_Linker Python library or, if configured
        by parameters['openrefine_server'], a remote REST API.
        Retries on connection errors / HTTP 503 with exponential backoff
        (doubled wait time, capped at 120 seconds between attempts).
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        # default tagger: matching of all (unstemmed) entity labels
        entity_linking_taggers = ['all_labels_ss_tag']
        if 'entity_linking_taggers' in parameters:
            entity_linking_taggers = parameters['entity_linking_taggers']

        # add taggers for stemming
        entity_linking_taggers_document_language_dependent = {}
        if 'entity_linking_taggers_document_language_dependent' in parameters:
            entity_linking_taggers_document_language_dependent = parameters[
                'entity_linking_taggers_document_language_dependent']

        if 'language_s' in data:
            # is a language specific tagger there for the detected language?
            if data['language_s'] in entity_linking_taggers_document_language_dependent:
                for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:
                    if not entity_linking_tagger in entity_linking_taggers:
                        entity_linking_taggers.append(entity_linking_tagger)

        openrefine_server = False
        if 'openrefine_server' in parameters:
            openrefine_server = parameters['openrefine_server']

        # additional result fields to be split to separate taxonomy index fields
        taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

        # collect/copy to be analyzed text from all fields
        text = etl_plugin_core.get_text(data=data)

        # tag all entities (by different taggers for different analyzers/stemmers)
        for entity_linking_tagger in entity_linking_taggers:

            results = {}

            retries = 0
            retrytime = 1
            # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry
            retrytime_max = 120

            no_connection = True
            while no_connection:
                try:
                    if retries > 0:
                        print(
                            'Retrying to connect to Solr tagger in {} second(s).'.format(retrytime))
                        time.sleep(retrytime)
                        retrytime = retrytime * 2
                        if retrytime > retrytime_max:
                            retrytime = retrytime_max

                    # call REST API
                    if openrefine_server:
                        # use REST-API on (remote) HTTP server
                        params = {'text': text}
                        r = requests.post(openrefine_server, params=params)
                        # if bad status code, raise exception
                        r.raise_for_status()
                        results = r.json()
                    else:
                        # use local Python library
                        linker = Entity_Linker()
                        linker.verbose = verbose
                        results = linker.entities(text=text, taggers=[
                            entity_linking_tagger], additional_result_fields=taxonomy_fields)

                    no_connection = False

                except KeyboardInterrupt:
                    raise KeyboardInterrupt
                except requests.exceptions.ConnectionError as e:
                    # connection failure: keep retrying with backoff
                    retries += 1
                    if openrefine_server:
                        sys.stderr.write(
                            "Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                    else:
                        sys.stderr.write(
                            "Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                except requests.exceptions.HTTPError as e:
                    if e.response.status_code == 503:
                        # service temporary unavailable: keep retrying with backoff
                        retries += 1
                        if openrefine_server:
                            sys.stderr.write(
                                "Openrefine server temporary unavailable (HTTP status code 503). Will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                        else:
                            sys.stderr.write(
                                "Solr temporary unavailable (HTTP status code 503). Will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                    elif e.response.status_code == 400:
                        no_connection = False
                        # if error because of empty entity index for that tagger because no entities imported yet, no error message / index as fail
                        empty_entity_index = False
                        try:
                            errorstatus = e.response.json()
                            if errorstatus['error']['msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
                                empty_entity_index = True
                        except:
                            pass
                        if not empty_entity_index:
                            etl.error_message(
                                docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)
                    else:
                        # other HTTP errors: give up and log to the document
                        no_connection = False
                        etl.error_message(
                            docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)
                except BaseException as e:
                    # any other failure: give up and log to the document
                    no_connection = False
                    etl.error_message(
                        docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

            if verbose:
                print("Named Entity Linking by Tagger {}: {}".format(
                    entity_linking_tagger, results))

            # write entities from result to document facets
            for match in results:
                for candidate in results[match]['result']:
                    if candidate['match']:
                        for facet in candidate['type']:
                            # use different facet for fuzzy/stemmed matches
                            if not entity_linking_tagger == 'all_labels_ss_tag':
                                # do not use another different facet if same stemmer but forced / not document language dependent
                                entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(
                                    '_stemming_force_', '_stemming_')
                                facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                            etl_plugin_core.append(data, facet, candidate['name'])
                            etl_plugin_core.append(data, facet + '_uri_ss',
                                                   candidate['id'])
                            etl_plugin_core.append(data, facet + '_preflabel_and_uri_ss',
                                                   candidate['name'] + ' <' + candidate['id'] + '>')

                            # remember which exact text matched which entity id
                            if 'matchtext' in candidate:
                                for matchtext in candidate['matchtext']:
                                    etl_plugin_core.append(
                                        data, facet + '_matchtext_ss', candidate['id'] + "\t" + matchtext)

                            # split taxonomy entries of the result to one field per hierarchy level
                            for taxonomy_field in taxonomy_fields:
                                if taxonomy_field in candidate:
                                    separated_taxonomy_fields = taxonomy2fields(
                                        taxonomy=candidate[taxonomy_field], field=facet)
                                    for separated_taxonomy_field in separated_taxonomy_fields:
                                        etl_plugin_core.append(
                                            data, separated_taxonomy_field, separated_taxonomy_fields[separated_taxonomy_field])

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_extract_email.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import etl_plugin_core
#
# extract email addresses
#
class enhance_extract_email(object):
    # Extract email addresses and derived facets
    # (sender, recipient, domain part).

    def process(self, parameters=None, data=None):
        """Extract email addresses from the document text to the facet
        email_ss and derive Message-From_ss, Message-To_ss and
        email_domain_ss facets from them."""
        parameters = {} if parameters is None else parameters
        data = {} if data is None else data

        # collect/copy to be analyzed text from all fields
        text = etl_plugin_core.get_text(data=data)

        for match in re.finditer('[\w\.-]+@[\w\.-]+', text, re.IGNORECASE):
            etl_plugin_core.append(data, 'email_ss', match.group(0))

        # if extracted email addresses from data, do further analysis for separated specialized facets
        if 'email_ss' in data:

            # extract email addresses of sender ("From:" lines)
            for match in re.finditer('From: (.* )?([\w\.-]+@[\w\.-]+)', text, re.IGNORECASE):
                etl_plugin_core.append(data, 'Message-From_ss', match.group(2))

            # extract email addresses of recipient ("To:" lines)
            for match in re.finditer('To: (.* )?([\w\.-]+@[\w\.-]+)', text, re.IGNORECASE):
                etl_plugin_core.append(data, 'Message-To_ss', match.group(2))

            # extract the domain part from all email addresses to facet email domains
            data['email_domain_ss'] = []
            emails = data['email_ss']
            if not isinstance(emails, list):
                emails = [emails]
            for email in emails:
                etl_plugin_core.append(data, 'email_domain_ss', email.split('@')[1])

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_extract_hashtags.py
================================================
import etl_plugin_core
# Extract hashtags (#words) from the extracted text
class enhance_extract_hashtags(object):
    """Extract hashtags (words starting with "#") to the facet hashtag_ss."""

    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        # a hashtag must be longer than this (length includes the "#" itself)
        minimal_length = 3

        # collect/copy to be analyzed text from all fields
        text = etl_plugin_core.get_text(data=data)

        hashtags = []
        for token in text.split():
            if token.startswith("#") and len(token) > minimal_length:
                hashtags.append(token)
        data['hashtag_ss'] = hashtags

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_extract_law.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import etl_plugin_core
#
# get taxonomy for aggregated facets / filters
#
# example: '§ 153 Abs. 1 Satz 2' -> ['§ 153', '§ 153 Absatz 1', '§ 153 Absatz 1 Satz 2']
# todo:
def get_taxonomy(law_clause, law_code=None):
    """Return the taxonomy path(s) of a law clause for aggregated facets/filters.

    example (intended): '§ 153 Abs. 1 Satz 2' ->
        ['§ 153', '§ 153 Absatz 1', '§ 153 Absatz 1 Satz 2']
    todo: split the clause into its hierarchy levels; for now the clause is
    returned unchanged as the only taxonomy entry.
    """
    return [law_clause]
#1.a
#1(2)
#1 (2)
#
# extract law codes
#
class enhance_extract_law(etl_plugin_core.Plugin):
    # Extract law clause references like "§ 153 Abs. 1" from the text and
    # combine them with law code entities matched before by the entity
    # linker (fields law_code_ss*).

    def process(self, parameters=None, data=None):
        """Extract law clauses to law_clause_ss, build normalized
        "clause + law code" combinations in law_code_clause_ss, and remove
        blacklisted law code matches that never appear next to a clause.
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        # words that may start a clause reference
        clause_prefixes = [
            '§',
            'Article',
            'Artikel',
            'Art',
            'Section',
            'Sec',
        ]

        # words that may continue a clause reference after the clause number
        clause_subsections = [
            'Abschnitt',
            'Absatz',
            'Abs',
            'Sentence',
            'Satz',
            'S',
            'Halbsatz',
            'Number',
            'Nummer',
            'Nr',
            'Buchstabe',
        ]

        text = etl_plugin_core.get_text(data)

        clauses = []

        # prefix word, clause number, then optionally repeated subsection
        # words with their numbers/letters
        rule = '(' + '|'.join(clause_prefixes) + ')\W*((\d+\W\w(\W|\b))|(\d+\w?))(\W?(' + '|'.join(clause_subsections) + ')\W*(\d+\w?|\w(\W|\b)))*'

        for match in re.finditer(rule, text, re.IGNORECASE):
            clause = match.group(0)
            clause = clause.strip()
            # NOTE(review): the clause is stored in `clauses` in its
            # unnormalized form (before the "§ " normalization below)
            clauses.append(clause)
            # if "§123" normalize to "§ 123"
            if clause[0] == '§' and not clause[1] == ' ':
                clause = '§ ' + clause[1:]
            etl_plugin_core.append(data, 'law_clause_ss', clause)

        # law code labels matched by the entity linker (entity id -> matchtexts)
        code_matchtexts = etl_plugin_core.get_all_matchtexts(data.get('law_code_ss_matchtext_ss', []))

        code_matchtexts_with_clause = []

        preflabels = {}
        if 'law_code_ss_preflabel_and_uri_ss' in data:
            preflabels = etl_plugin_core.get_preflabels(data['law_code_ss_preflabel_and_uri_ss'])

        if len(clauses)>0 and len(code_matchtexts)>0:
            text = text.replace("\n", " ")
            for code_match_id in code_matchtexts:
                #get only matchtext (without ID/URI of matching entity)
                for code_matchtext in code_matchtexts[code_match_id]:
                    for clause in clauses:
                        # is the clause directly before or after the law code label?
                        if clause + " " + code_matchtext in text or code_matchtext + " " + clause in text:
                            code_matchtexts_with_clause.append(code_matchtext)
                            # if "§123" normalize to "§ 123"
                            if clause[0] == '§' and not clause[1] == ' ':
                                clause = '§ ' + clause[1:]
                            # NOTE(review): law_code_preflabel is assigned but never used
                            law_code_preflabel = code_match_id
                            if code_match_id in preflabels:
                                law_code_clause_normalized = clause + " " + preflabels[code_match_id]
                            else:
                                law_code_clause_normalized = clause + " " + code_match_id
                            etl_plugin_core.append(data, 'law_code_clause_ss', law_code_clause_normalized)

        if len(code_matchtexts)>0:
            # law code labels which shall be dropped if they appear without any clause
            blacklist = []
            listfile = open('/etc/opensemanticsearch/blacklist/enhance_extract_law/blacklist-lawcode-if-no-clause')
            for line in listfile:
                line = line.strip()
                if line and not line.startswith("#"):
                    blacklist.append(line)
            listfile.close()

            if not isinstance(data['law_code_ss_matchtext_ss'], list):
                data['law_code_ss_matchtext_ss'] = [data['law_code_ss_matchtext_ss']]

            # remove blacklisted matchtexts that never appeared next to a clause
            blacklisted_code_ids = []
            for code_match_id in code_matchtexts:
                for code_matchtext in code_matchtexts[code_match_id]:
                    if code_matchtext in blacklist:
                        if code_matchtext not in code_matchtexts_with_clause:
                            blacklisted_code_ids.append(code_match_id)
                            data['law_code_ss_matchtext_ss'].remove(code_match_id + "\t" + code_matchtext)

            # re-read the remaining matchtexts after the removals above
            code_matchtexts = etl_plugin_core.get_all_matchtexts(data.get('law_code_ss_matchtext_ss', []))

            if not isinstance(data['law_code_ss'], list):
                data['law_code_ss'] = [data['law_code_ss']]
            if not isinstance(data['law_code_ss_preflabel_and_uri_ss'], list):
                data['law_code_ss_preflabel_and_uri_ss'] = [data['law_code_ss_preflabel_and_uri_ss']]

            # drop the facet entries of codes whose matchtexts were all removed
            for blacklisted_code_id in blacklisted_code_ids:
                if blacklisted_code_id not in code_matchtexts:
                    data['law_code_ss'].remove(preflabels[blacklisted_code_id])
                    data['law_code_ss_preflabel_and_uri_ss'].remove(preflabels[blacklisted_code_id] + ' <' + blacklisted_code_id + '>')

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_extract_money.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import etl_plugin_core
from numerizer import numerize
#
# extract money
#
class enhance_extract_money(etl_plugin_core.Plugin):
    """Extract money amounts (number next to a currency sign or a matched
    currency label) to the facet money_ss."""

    # todo: all other currency signs from Wikidata
    currency_signs = ['$', '€']

    def process(self, parameters=None, data=None):
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        moneys = set(data.get('money_ss', []))

        text = etl_plugin_core.get_text(data).replace("\n", " ")

        # convert written numbers like "one" and "two million" to integer like "1" and "2000000"
        if data.get('language_s') == "en":
            text = numerize(text)

        # escaped currency signs ...
        currencies_escaped = [re.escape(sign) for sign in self.currency_signs]

        # ... plus every currency label the entity linker matched in this document
        matched_currency_labels = etl_plugin_core.get_all_matchtexts(
            data.get('currency_ss_matchtext_ss', []))
        for currency_id in matched_currency_labels:
            #get only matchtext (without ID/URI of matching entity)
            for matchtext in matched_currency_labels[currency_id]:
                currencies_escaped.append(re.escape(matchtext))

        regex_part_number = '\d+((\.|\,)\d+)*'
        regex_part_currencies = '(' + '|'.join(currencies_escaped) + ')'

        # amount before currency ("100 $") and currency before amount ("$ 100")
        for rule in (regex_part_number + '\s?' + regex_part_currencies,
                     regex_part_currencies + '\s?' + regex_part_number):
            for match in re.finditer(rule, text, re.IGNORECASE):
                moneys.add(match.group(0))

        data['money_ss'] = list(moneys)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_extract_phone.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import etl_plugin_core
#
# normalize phone number (remove all non-numeric chars except leading +)
# so same number is used for aggregations/facet filters, even if written in different formats (with or without space(s) and hyphen(s))
#
def normalize_phonenumber(phone):
    """Strip every character except ASCII digits and one leading "+".

    The same number written with spaces, hyphens or parentheses is thereby
    reduced to one canonical form, so it can be used for aggregations/facet
    filters regardless of the original formatting.
    """
    digits = '0123456789'
    normalized = ''
    for char in phone:
        if char in digits:
            normalized += char
        elif char == '+' and not normalized:
            # a "+" is kept only as the very first character of the result
            normalized = '+'
    return normalized
#
# extract phone number(s)
#
class enhance_extract_phone(object):
    """Extract phone numbers to phone_ss and a normalized variant to
    phone_normalized_ss (usable for aggregation/filters)."""

    def process(self, parameters=None, data=None):
        parameters = {} if parameters is None else parameters
        data = {} if data is None else data

        # collect/copy to be analyzed text from all fields
        text = etl_plugin_core.get_text(data=data)

        for match in re.finditer('[\+\(]?[1-9][0-9 .\-\(\)]{8,}[0-9]', text, re.IGNORECASE):
            etl_plugin_core.append(data, 'phone_ss', match.group(0))

        # if extracted phone number(s), add normalized forms
        if 'phone_ss' in data:
            phones = data['phone_ss']
            if not isinstance(phones, list):
                phones = [phones]
            for phone in phones:
                etl_plugin_core.append(
                    data, 'phone_normalized_ss', normalize_phonenumber(phone))

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_extract_text_tika_server.py
================================================
import os
import tempfile
import sys
import time
import requests
def in_parsers(parser, parsers):
    """Return True if `parser` occurs in `parsers`, where an entry of
    `parsers` may itself be a flat list of parser names (one nesting level,
    like Tika's X-TIKA:Parsed-By metadata)."""
    for entry in parsers:
        if isinstance(entry, list):
            if parser in entry:
                return True
        elif entry == parser:
            return True
    return False
# Extract text from file(name)
class enhance_extract_text_tika_server(object):
    # Extract text and metadata from a file by an Apache Tika server.
    #
    # Depending on the OCR configuration either a Tika server with real
    # Tesseract OCR is called, or a second Tika instance whose tesseract is
    # a fake/caching wrapper, so documents containing images not OCRed yet
    # can be detected by the marker "[Image (no OCR yet)]" in the content.

    # map Tika metadata field names to index field names
    mapping = {
        'Content-Type': 'content_type_ss',
        'dc:creator': 'author_ss',
        'Content-Encoding': 'Content-Encoding_ss',
        'dc:title': 'title_txt',
        'dc:subject': 'subject_ss',
    }

    def process(self, parameters=None, data=None):
        """Parse parameters['filename'] by Tika server (via tika-python),
        copy content and metadata to data fields and set the OCR status
        fields used by later pipeline stages."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        filename = parameters['filename']

        # temp dir for the log file of the tika-python client
        tika_log_path = tempfile.mkdtemp(prefix="tika-python-")
        os.environ['TIKA_LOG_PATH'] = tika_log_path
        os.environ['TIKA_CLIENT_ONLY'] = 'True'

        # import after setting the environment variables above, since the
        # tika module reads them at import time
        import tika
        from tika import parser
        tika.TikaClientOnly = True

        headers = {}

        do_ocr = parameters.get('ocr', False)
        do_ocr_pdf_tika = parameters.get('ocr_pdf_tika', True)

        do_ocr_pdf = False
        if 'plugins' in parameters:
            if 'enhance_pdf_ocr' in parameters['plugins'] and do_ocr_pdf_tika:
                do_ocr_pdf = True

        # if only OCR for PDF enabled (enhance_pdf_ocr as fallback and OCR by tika enabled) but not OCR for image files,
        # run OCR only if file ending .pdf so disabled OCR for other file types
        if do_ocr_pdf and not do_ocr:
            contenttype = data.get('content_type_ss', None)
            if isinstance(contenttype, list):
                contenttype = contenttype[0]
            if contenttype == 'application/pdf' or filename.lower().endswith('.pdf'):
                do_ocr_pdf = True
            else:
                do_ocr_pdf = False

        if 'ocr_lang' in parameters:
            headers['X-Tika-OCRLanguage'] = parameters['ocr_lang']

        if do_ocr or do_ocr_pdf:
            # OCR enabled: use the Tika server with real Tesseract OCR
            if os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER'):
                tika_server = os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER')
            elif 'tika_server' in parameters:
                tika_server = parameters['tika_server']
            else:
                tika_server = 'http://localhost:9998'

            # OCR embedded images in PDF, if not disabled or has to be done by other plugin
            if do_ocr_pdf:
                headers['X-Tika-PDFextractInlineImages'] = 'true'
            else:
                headers['X-Tika-PDFextractInlineImages'] = 'false'

            # set OCR status in indexed document
            data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = True
            # OCR is enabled, so was done by this Tika call, no images left to OCR
            data['etl_count_images_yet_no_ocr_i'] = 0
        else:
            # OCR (yet) disabled, so use the Tika instance using the fake tesseract so we only get OCR results if in cache
            # else we get OCR status [Image (No OCR yet)] in content, so we know that there are images to OCR for later steps
            if os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER_FAKECACHE'):
                tika_server = os.getenv('OPEN_SEMANTIC_ETL_TIKA_SERVER_FAKECACHE')
            elif 'tika_server_fake_ocr' in parameters:
                tika_server = parameters['tika_server_fake_ocr']
            else:
                tika_server = 'http://localhost:9999'

            headers['X-Tika-PDFextractInlineImages'] = 'true'

            # set OCR status in indexed document, so next stage knows that yet no OCR
            data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = False

        #
        # Parse on Apache Tika Server by python-tika
        #
        if verbose:
            print("Parsing by Tika Server on {} with additional headers {}".format(tika_server, headers))

        retries = 0
        retrytime = 1
        # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry
        retrytime_max = 120

        no_connection = True
        while no_connection:
            try:
                if retries > 0:
                    print(
                        'Retrying to connect to Tika server in {} second(s).'.format(retrytime))
                    time.sleep(retrytime)
                    retrytime = retrytime * 2
                    if retrytime > retrytime_max:
                        retrytime = retrytime_max
                parsed = parser.from_file(
                    filename=filename,
                    serverEndpoint=tika_server,
                    headers=headers,
                    requestOptions={'timeout': 60000})
                no_connection = False
            except requests.exceptions.ConnectionError as e:
                retries += 1
                sys.stderr.write(
                    "Connection to Tika server (will retry in {} seconds) failed. Exception: {}\n".format(retrytime, e))

        if parsed['content']:
            data['content_txt'] = parsed['content']

        tika_exception = False
        for tika_field in parsed["metadata"]:
            # there is a field name with exceptions, so copy fieldname to failed plugins
            if 'exception' in tika_field.lower():
                tika_exception = True
                parameters['etl_tika_exception'] = True
                if 'etl_error_plugins_ss' not in data:
                    data['etl_error_plugins_ss'] = []
                data['etl_error_plugins_ss'].append(tika_field)
            # copy Tika fields to (mapped) data fields
            if tika_field in self.mapping:
                data[self.mapping[tika_field]] = parsed['metadata'][tika_field]
            else:
                data[tika_field + '_ss'] = parsed['metadata'][tika_field]

        #
        # anaylze and (re)set OCR status to prevent (re)process unnecessary tasks of later stage(s)
        #
        contenttype = data.get('content_type_ss', None)
        if isinstance(contenttype, list):
            contenttype = contenttype[0]

        ocr_status_known = False
        # file was PDF and OCR for PDF enabled, so we know status
        if do_ocr_pdf:
            ocr_status_known = True
        # all OCR cases enabled, so we know status
        if do_ocr and do_ocr_pdf:
            ocr_status_known = True
        # if no kind of OCR done now, we know status because fake tesseract wrapper
        if not do_ocr and not do_ocr_pdf:
            ocr_status_known = True
        # if OCR for images done but content type is PDF and OCR of PDF by Tika is disabled
        # (because using other plugin for that) we do not know status for PDF,
        # since Tika runned without inline OCR for PDF
        if do_ocr and not do_ocr_pdf:
            if not contenttype == 'application/pdf':
                ocr_status_known = True

        if ocr_status_known:
            # Tika made an tesseract OCR call (if OCR (yet) off, by fake Tesseract CLI wrapper)
            # so there is really something to OCR?
            if not in_parsers('org.apache.tika.parser.ocr.TesseractOCRParser', data['X-TIKA:Parsed-By_ss']):
                # since Tika did not call (fake or cached) tesseract (wrapper), nothing to OCR in this file,
                if verbose:
                    print('Tika OCR parser not used, so nothing to OCR in later stages, too')
                # so set all OCR plugin status and OCR configs to done,
                # so filter_file_not_modifield in later stage task will prevent reprocessing
                # because of only this yet not runned plugins or OCR configs
                data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = True
                data['etl_count_images_yet_no_ocr_i'] = 0
                if not tika_exception:
                    parameters['etl_nothing_for_ocr'] = True
                    data['etl_enhance_ocr_descew_b'] = True
                    data['etl_enhance_pdf_ocr_b'] = True
            else:
                # OCR parser used by Tika, so there was something to OCR
                # If in this case the fake tesseract wrapper could get all results from cache,
                # no additional Tika-Server run with OCR enabled needed
                # So set Tika-Server OCR status of tika-server to done
                if not do_ocr and 'content_txt' in data:
                    if verbose:
                        print("Tika OCR parser was used, so there is something to OCR")
                    # how many images yet not OCRd because no result from cache
                    # so we got fake OCR result "[Image (no OCR yet)]"
                    count_images_yet_no_ocr = data['content_txt'].count('[Image (no OCR yet)]')
                    data['etl_count_images_yet_no_ocr_i'] = count_images_yet_no_ocr
                    # got all Tika-Server Tesseract OCR results from cache,
                    # so no additional OCR tasks for later stage
                    if count_images_yet_no_ocr == 0:
                        if verbose:
                            print('But could get all OCR results in this stage from OCR cache')
                        # therefore set status like OCR related config
                        # yet runned, so on next stage filter_file_not_modified
                        # wont process document again only because of OCR
                        # (but not reset status of other plugins,
                        # since maybe additional image in changed file)
                        data['etl_enhance_extract_text_tika_server_ocr_enabled_b'] = True
                        data['etl_count_images_yet_no_ocr_i'] = 0
                        # if not a (maybe changed) PDF, set enhance_pdf_ocr to done, too,
                        # so no reprocessing because this additional plugin on later stage
                        if not contenttype == 'application/pdf':
                            data['etl_enhance_pdf_ocr_b'] = True

        # clean up the temporary tika-python log dir
        tika_log_file = tika_log_path + os.path.sep + 'tika.log'
        if os.path.isfile(tika_log_file):
            os.remove(tika_log_file)
        os.rmdir(tika_log_path)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_file_mtime.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os.path
import datetime
#
# Add file modification time
#
class enhance_file_mtime(object):
    #
    # Add file modification time (UTC, Lucene/Solr date format)
    #

    def process(self, parameters=None, data=None):
        """Read the modification time of parameters['filename'] and store it
        in data['file_modified_dt'] in Lucene/Solr date format
        (YYYY-MM-DDThh:mm:ssZ).

        Bugfix: the timestamp is now converted in UTC. The former
        datetime.fromtimestamp() without timezone returned local time, but
        the trailing "Z" of the Lucene date format designates UTC, so the
        value was mislabeled on servers not running in UTC.
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        filename = parameters['filename']

        # get modification time from file
        file_mtime = os.path.getmtime(filename)

        # convert mtime to Lucene format ("Z" suffix = UTC)
        file_mtime_masked = datetime.datetime.fromtimestamp(
            file_mtime, tz=datetime.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

        if verbose:
            print("File modification time: {}".format(file_mtime_masked))

        data['file_modified_dt'] = file_mtime_masked

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_file_size.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os.path
#
# add file size
#
class enhance_file_size(object):
    #
    # add file size
    #

    def process(self, parameters=None, data=None):
        """Write the size in bytes of parameters['filename'] to
        data['file_size_i'] (stored as string by str())."""
        parameters = {} if parameters is None else parameters
        data = {} if data is None else data

        verbose = bool(parameters.get('verbose'))

        filename = parameters['filename']

        # ask the filesystem for the size in bytes
        file_size = os.path.getsize(filename)

        if verbose:
            print("File size: {}".format(file_size))

        data['file_size_i'] = str(file_size)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_html.py
================================================
#
# Extracts text within configured HTML tags / XML tags
#
from lxml import etree
class enhance_html(object):
    #
    # Extracts text within configured HTML tags / XML tags
    #

    def elements2data(self, element, data, path=None, recursive=True):
        """Copy the text of `element` (and, if recursive, of all children)
        into data fields named after the tag path with suffix "_ss"."""
        if self.verbose:
            print("Extracting element {}".format(element.tag))

        # field name is the tag path from the extraction root, e.g. "div/span"
        path = path + "/" + element.tag if path else element.tag
        fieldname = path + '_ss'

        text = element.text
        if text:
            text = text.strip()
            if text:
                data.setdefault(fieldname, []).append(text)

        if recursive:
            for child in element:
                data = self.elements2data(
                    element=child, path=path, data=data, recursive=True)

        return data

    def process(self, parameters=None, data=None):
        """Extract the configured tags if the document is XHTML."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        self.verbose = bool(parameters.get('verbose'))

        filename = parameters['filename']

        # prefer the content type detected on the data, fall back to parameters
        if 'content_type_ss' in data:
            mimetype = data['content_type_ss']
        else:
            mimetype = parameters['content_type_ss']

        # if connector returns a list, use only first value (which is the only entry of the list)
        if isinstance(mimetype, list):
            mimetype = mimetype[0]

        if mimetype.startswith('application/xhtml+xml'):

            html_extract_tags = parameters.get('html_extract_tags', [])
            html_extract_tags_and_children = parameters.get(
                'html_extract_tags_and_children', [])

            parser = etree.HTMLParser()
            et = etree.parse(filename, parser)

            # configured tags without their children
            for xpath in html_extract_tags:
                for el in et.xpath(xpath):
                    self.elements2data(element=el, data=data, recursive=False)

            # configured tags including all their children
            for xpath in html_extract_tags_and_children:
                for el in et.xpath(xpath):
                    self.elements2data(element=el, data=data)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_mapping_id.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
# Map paths or domains
#
class enhance_mapping_id(object):
    #
    # Map paths or domains of the document id
    #

    def process(self, parameters=None, data=None):
        """Rewrite parameters['id'] by the configured parameters['mappings']
        (best/deepest matching prefix wins, see mapping())."""
        parameters = {} if parameters is None else parameters
        data = {} if data is None else data

        if 'mappings' in parameters:
            parameters['id'] = mapping(value=parameters['id'],
                                       mappings=parameters['mappings'])

        return parameters, data
# Change value with best/deepest mapping
def mapping(value, mappings=None):
    """Change value with the best/deepest matching mapping.

    mappings is a dict {map_from: map_to}; the entry whose map_from is the
    longest prefix of value wins and its first occurrence in value is
    replaced by its map_to.
    """
    # bugfix: was `if mapping is None` (the function object, always False),
    # so a call without mappings crashed on mappings.items()
    if mappings is None:
        mappings = {}

    max_match_len = -1
    best_match_map_from = None
    best_match_map_to = None

    # check all mappings for matching and use the best
    for map_from, map_to in mappings.items():
        # map from matching value?
        if value.startswith(map_from):
            # if from string longer (deeper path), this is the better matching
            match_len = len(map_from)
            if match_len > max_match_len:
                max_match_len = match_len
                best_match_map_from = map_from
                best_match_map_to = map_to

    # if there is a match, replace first occurance of value with mapping
    if max_match_len >= 0:
        value = value.replace(best_match_map_from, best_match_map_to, 1)

    return value
# Change mapped value to origin value
def mapping_reverse(value, mappings=None):
    """Change mapped value back to its origin value.

    mappings is a dict {map_from: map_to}; the entry whose map_to is the
    longest prefix of value wins and its first occurrence in value is
    replaced by its map_from.
    """
    # bugfix: was `if mapping is None` (the sibling function object, always
    # False), so a call without mappings crashed on mappings.items()
    if mappings is None:
        mappings = {}

    max_match_len = -1
    best_match_map_from = None
    best_match_map_to = None

    # check all mappings for matching and use the best
    for map_from, map_to in mappings.items():
        # map to matching value?
        if value.startswith(map_to):
            # if from string longer (deeper path), this is the better matching
            match_len = len(map_to)
            if match_len > max_match_len:
                max_match_len = match_len
                best_match_map_from = map_from
                best_match_map_to = map_to

    # if there is a match, replace first occurance of value with reverse mapping
    if max_match_len >= 0:
        value = value.replace(best_match_map_to, best_match_map_from, 1)

    return value
================================================
FILE: src/opensemanticetl/enhance_mimetype.py
================================================
#!/usr/bin/python
# -*- coding: utf-8 -*-
import magic
#
# Get MimeType (Which kind of file is this?)
#
class enhance_mimetype(object):
    #
    # Get MimeType (Which kind of file is this?)
    #

    def process(self, parameters=None, data=None):
        """Detect the MIME type of parameters['filename'] by libmagic and
        write it to data['content_type_magic_s']."""
        parameters = {} if parameters is None else parameters
        data = {} if data is None else data

        verbose = bool(parameters.get('verbose'))

        filename = parameters['filename']

        # detect by libmagic (file type guesser of the "file" command)
        detector = magic.open(magic.MAGIC_MIME)
        detector.load()
        mimetype = detector.file(filename)
        detector.close()

        if verbose:
            print("Detected MimeType: {}".format(mimetype))

        data['content_type_magic_s'] = mimetype

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_multilingual.py
================================================
#
# Multilinguality
#
# Copy content language specific dynamic fields for language specific analysis like stemming, grammar or synonyms
#
# Language has been detected before by plugin enhance_detect_language using Apache Tika / OpenNLP
#
class enhance_multilingual(object):
    #
    # Multilinguality
    #
    # Copy content to language specific dynamic fields for language specific
    # analysis like stemming, grammar or synonyms
    #
    # Language has been detected before by plugin enhance_detect_language
    # using Apache Tika / OpenNLP
    #

    verbose = False

    # languages that are defined in index schema for language specific analysis and used if autodetected as documents language
    languages = ['en', 'fr', 'de', 'es', 'hu', 'pt',
                 'nl', 'ro', 'ru', 'it', 'cz', 'ar', 'fa']
    languages_hunspell = ['hu']

    # languages for language specific analysis even if not the autodetected document language
    languages_force = []
    languages_force_hunspell = []

    # additional exclusions configurable by parameters
    # languages_exclude_fields / languages_exclude_fields_map
    exclude_fields = []
    exclude_fields_map = {}

    def _read_blacklist(self, listfilename):
        # read non-empty, non-comment lines of a blacklist config file
        entries = []
        with open(listfilename) as listfile:
            for line in listfile:
                line = line.strip()
                if line and not line.startswith("#"):
                    entries.append(line)
        return entries

    def process(self, parameters=None, data=None):
        """Copy each (not excluded) data field to the catch-all field
        _text_ and to language specific fields text_txt_$lang /
        text_txt_hunspell_$lang for language dependent analysis.

        Bugfix: the options languages_exclude_fields and
        languages_exclude_fields_map were stored to self but then shadowed
        by empty local variables and never applied; they are merged into
        the effective exclusion config now.
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        if 'verbose' in parameters:
            self.verbose = parameters['verbose']

        if 'languages' in parameters:
            self.languages = parameters['languages']
        if 'languages_hunspell' in parameters:
            self.languages_hunspell = parameters['languages_hunspell']
        if 'languages_force' in parameters:
            self.languages_force = parameters['languages_force']
        if 'languages_force_hunspell' in parameters:
            self.languages_force_hunspell = parameters['languages_force_hunspell']
        if 'languages_exclude_fields' in parameters:
            self.exclude_fields = parameters['languages_exclude_fields']
        if 'languages_exclude_fields_map' in parameters:
            self.exclude_fields_map = parameters['languages_exclude_fields_map']

        language = data.get('language_s', None)

        #
        # exclude fields like technical metadata
        #

        # prefixes of fieldnames which are no natural language text
        exclude_prefix = self._read_blacklist(
            '/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix')

        # suffixes of non-text fields like numbers
        exclude_suffix = self._read_blacklist(
            '/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix')

        # full fieldnames
        exclude_fields = self._read_blacklist(
            '/etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname')

        # merge exclusions configured by parameters (formerly shadowed and ignored)
        exclude_fields.extend(self.exclude_fields)
        exclude_fields_map = self.exclude_fields_map

        language_fields = ['_text_']
        language_specific_data = {}

        # language specific analysis for recognized language of document
        # if language support of detected language in index schema
        if language in self.languages:
            language_fields.append("text_txt_" + language)
        if language in self.languages_hunspell:
            language_fields.append("text_txt_hunspell_" + language)

        # fields for language specific analysis by forced languages even if other language or false recognized language
        for language_force in self.languages_force:
            language_field = "text_txt_" + language_force
            if not language_field in language_fields:
                language_fields.append(language_field)

        for language_force in self.languages_force_hunspell:
            language_field = "text_txt_hunspell_" + language_force
            if not language_field in language_fields:
                language_fields.append(language_field)

        # copy each data field to language specific field with suffix _txt_$language
        for fieldname in data:

            exclude = False

            # do not copy excluded fields
            if fieldname in exclude_fields:
                exclude = True
            for prefix in exclude_prefix:
                if fieldname.startswith(prefix):
                    exclude = True
            for suffix in exclude_suffix:
                if fieldname.endswith(suffix):
                    exclude = True

            if not exclude and data[fieldname]:
                # copy field to default field with added suffixes for language dependent stemming/analysis
                for language_field in language_fields:

                    excluded_by_mapping = False
                    if language_field in exclude_fields_map:
                        if fieldname in exclude_fields_map[language_field]:
                            excluded_by_mapping = True
                            if self.verbose:
                                print("Multilinguality: Excluding field {} to be copied to {} by config of exclude_field_map".format(
                                    fieldname, language_field))

                    if not excluded_by_mapping:
                        if self.verbose:
                            print("Multilinguality: Add {} to {}".format(
                                fieldname, language_field))
                        if not language_field in language_specific_data:
                            language_specific_data[language_field] = []
                        if isinstance(data[fieldname], list):
                            language_specific_data[language_field].extend(
                                data[fieldname])
                        else:
                            language_specific_data[language_field].append(
                                data[fieldname])

        # append language specific fields to data
        for key in language_specific_data:
            data[key] = language_specific_data[key]

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_ner_spacy.py
================================================
import etl
import requests
import json
import os
import sys
import time
#
# SpaCy Named Entity Recognizer (NER)
#
# Appends classified (Persons, Locations, Organizations) entities (names/words) to mapped facets/fields
class enhance_ner_spacy(object):
    """SpaCy Named Entity Recognizer (NER).

    Sends the document text to a spaCy REST service and appends the
    classified entities (persons, locations, organizations, ...) to the
    facets/fields configured in the class-to-field mapping.
    """

    def process(self, parameters=None, data=None):
        """Run spaCy NER over the document text fields and map entity classes to facets."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = bool(parameters.get('verbose'))

        # mapping from NER entity classes to index fields/facets
        default_mapping = {
            'ORG': 'organization_ss',
            'NORP': 'organization_ss',
            'orgName': 'organization_ss',
            'ORGANIZATION': 'organization_ss',
            'PER': 'person_ss',
            'PERSON': 'person_ss',
            'persName': 'person_ss',
            'GPE': 'location_ss',
            'LOC': 'location_ss',
            'placeName': 'location_ss',
            'FACILITY': 'location_ss',
            'PRODUCT': 'product_ss',
            'EVENT': 'event_ss',
            'LAW': 'law_ss',
            'DATE': 'date_ss',
            'TIME': 'time_ss',
            'MONEY': 'money_ss',
            'WORK_OF_ART': 'work_of_art_ss',
        }
        mapping = parameters.get('spacy_ner_mapping', default_mapping)

        # default classifier, overridable by config
        classifier = parameters.get('spacy_ner_classifier_default', 'en_core_web_sm')

        # language specific classifier, if configured and document language detected
        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
            language = data['language_s']
            if language in parameters['spacy_ner_classifiers']:
                classifier = parameters['spacy_ner_classifiers'][language]

        # standard classifier configured to None and no classifier for the
        # detected language: nothing to do
        if not classifier:
            return parameters, data

        if verbose:
            print("Using SpaCY NER language / classifier: {}".format(classifier))

        # concatenate all analyzable text fields
        text_parts = []
        for field in ('title_txt', 'content_txt', 'description_txt', 'ocr_t'):
            if field in data:
                text_parts.append("{}\n".format(data[field]))
        text = "".join(text_parts)

        # spaCy REST service endpoint, overridable by environment variable
        spacy_server = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER')
        url = (spacy_server + '/ent') if spacy_server else "http://localhost:8080/ent"

        headers = {'content-type': 'application/json'}
        payload = {'text': text, 'model': classifier}

        # retry until the service is reachable; wait time doubles each retry
        # up to a maximum of 120 seconds
        wait_seconds = 1
        max_wait_seconds = 120
        attempt = 0
        while True:
            try:
                if attempt > 0:
                    print(
                        'Retrying to connect to Spacy services in {} second(s).'.format(wait_seconds))
                    time.sleep(wait_seconds)
                    wait_seconds = min(wait_seconds * 2, max_wait_seconds)
                response = requests.post(url, data=json.dumps(payload), headers=headers)
                # bad HTTP status raises (and is not retried, like before)
                response.raise_for_status()
                break
            except requests.exceptions.ConnectionError as e:
                attempt += 1
                sys.stderr.write(
                    "Connection to Spacy services (will retry in {} seconds) failed. Exception: {}\n".format(wait_seconds, e))

        for ent in response.json():
            entity_class = ent['label']
            # entity surface string from the returned character offsets,
            # stripped of surrounding whitespace
            entity = text[int(ent['start']):int(ent['end'])].strip()
            if not entity:
                continue
            if entity_class in mapping:
                if verbose:
                    print("NER classified word(s)/name {} to {}. Appending to mapped facet {}".format(
                        entity, entity_class, mapping[entity_class]))
                etl.append(data, mapping[entity_class], entity)
            elif verbose:
                print("Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}".format(entity_class, entity))

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_ner_stanford.py
================================================
import etl
from nltk.tag.stanford import StanfordNERTagger
#
# Stanford Named Entitiy Recognizer (NER)
#
# Appends classified (Persons, Locations, Organizations) entities (names/words) to mapped facets/fields
class enhance_ner_stanford(object):
    """Stanford Named Entity Recognizer (NER).

    Tags the document text with the Stanford NER tagger and appends the
    classified entities (persons, locations, organizations, ...) to the
    facets/fields configured in the class-to-field mapping.
    """

    def multi_word_entities(self, entities):
        """Merge runs of consecutive tokens sharing the same NER class into
        multi-word entities (split on class changes instead of single tokens).

        entities: list of (word, class) tuples as returned by the tagger.
        Returns a list of (multi-word entity, class) tuples.
        """
        merged = []
        current_words = []
        current_class = ""
        total = len(entities)
        for position, (word, word_class) in enumerate(entities, start=1):
            # class changed compared to the run collected so far:
            # flush the finished multi-word entity
            if current_class and word_class != current_class:
                merged.append((" ".join(current_words), current_class))
                current_words = []
            current_words.append(word)
            # last token: no later class change can flush it, so emit now
            if position == total:
                merged.append((" ".join(current_words), word_class))
            current_class = word_class
        return merged

    def process(self, parameters=None, data=None):
        """Run Stanford NER over the document text fields and map entity classes to facets."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = bool(parameters.get('verbose'))

        # mapping from NER entity classes to index fields/facets
        # todo: extend mapping for models with more classes like dates
        default_mapping = {
            'PERSON': 'person_ss',
            'LOCATION': 'location_ss',
            'ORGANIZATION': 'organization_ss',
            'I-ORG': 'organization_ss',
            'I-PER': 'person_ss',
            'I-LOC': 'location_ss',
            'ORG': 'organization_ss',
            'PER': 'person_ss',
            'LOC': 'location_ss',
            'PERS': 'person_ss',
            'LUG': 'location_ss',
            'MONEY': 'money_ss',
        }
        mapping = parameters.get('stanford_ner_mapping', default_mapping)

        # default classifier, overridable by config
        classifier = parameters.get('stanford_ner_classifier_default',
                                    'english.all.3class.distsim.crf.ser.gz')

        # language specific classifier, if configured and document language detected
        if 'stanford_ner_classifiers' in parameters and 'language_s' in data:
            language = data['language_s']
            if language in parameters['stanford_ner_classifiers']:
                classifier = parameters['stanford_ner_classifiers'][language]

        # standard classifier configured to None and no classifier for the
        # detected language: nothing to do
        if not classifier:
            return parameters, data

        # optional JVM / jar location options for the tagger
        tagger_options = {}
        if 'stanford_ner_java_options' in parameters:
            tagger_options['java_options'] = parameters['stanford_ner_java_options']
        if 'stanford_ner_path_to_jar' in parameters:
            tagger_options['path_to_jar'] = parameters['stanford_ner_path_to_jar']

        # concatenate all analyzable text fields
        text_parts = []
        for field in ('title_txt', 'content_txt', 'description_txt', 'ocr_t', 'ocr_descew_t'):
            if field in data:
                text_parts.append("{}\n".format(data[field]))
        text = "".join(text_parts)

        # classify/tag each word of the content
        tagger = StanfordNERTagger(classifier, encoding='utf8',
                                   verbose=verbose, **tagger_options)
        # compound tokens of the same class to multi word entities
        entities = self.multi_word_entities(tagger.tag(text.split()))

        # append each entity whose class is mapped to a facet/field
        for entity, entity_class in entities:
            if entity_class in mapping:
                if verbose:
                    print("NER classified word(s)/name {} to {}. Appending to mapped facet {}".format(
                        entity, entity_class, mapping[entity_class]))
                etl.append(data, mapping[entity_class], entity)
            elif verbose:
                print("Since Named Entity Recognition (NER) class {} not mapped to a field/facet, ignore entity/word(s): {}".format(entity_class, entity))

        # mark the document, that it was analyzed by this plugin yet
        data['enhance_ner_stanford_b'] = "true"

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_ocr.py
================================================
from tesseract_cache import tesseract_cache
#
# If image add ocr text
#
class enhance_ocr(object):
    """If the document is an image, add OCR text to the field 'ocr_t'."""

    # fields this plugin needs to read so documents that were not enriched
    # on indexing can be enriched later
    fields = ['id', 'content_type']

    # query to find documents, that were not enriched by this plugin yet
    # (since we marked documents which were OCRd with ocr_b = true
    query = "content_type: image/* AND NOT enhance_ocr_b:true"

    def process(self, parameters=None, data=None):
        """OCR the file (via tesseract_cache) if its mimetype is an image type."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = parameters.get('verbose', False)
        filename = parameters['filename']

        # mimetype from document data, falling back to connector parameters
        mimetype = data['content_type_ss'] if 'content_type_ss' in data \
            else parameters['content_type_ss']
        # a connector may deliver a single-entry list; use its first value
        if isinstance(mimetype, list):
            mimetype = mimetype[0]

        lang = parameters.get('ocr_lang', 'eng')

        if "image" in mimetype.lower():
            if verbose:
                print("Mimetype seems image ({}), starting OCR"
                      .format(mimetype))
            ocr_txt = tesseract_cache.get_ocr_text(
                filename=filename, lang=lang,
                cache_dir=parameters.get("ocr_cache"))
            if ocr_txt:
                data['ocr_t'] = ocr_txt

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_path.py
================================================
import os.path
#
# Build and add path facets from filename
#
class enhance_path(object):
    """Build and add path facets (directory levels, basename, filename
    extension) from the document ID / filename."""

    def process(self, parameters=None, data=None):
        """Derive 'filename_extension_s', 'path<N>_s' and 'path_basename_s'
        facets from parameters['id'].

        parameters: must contain 'id' (file path or URL); may contain
                    'facet_path_strip_prefix' (list of prefixes to strip).
        Returns the (parameters, data) tuple with facets added to data.
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        docid = parameters['id']

        # filename extension facet (lowercased, without the dot)
        filename_extension = os.path.splitext(docid)[1][1:].lower()
        if filename_extension:
            data['filename_extension_s'] = filename_extension

        if 'facet_path_strip_prefix' in parameters:
            facet_path_strip_prefix = parameters['facet_path_strip_prefix']
        else:
            facet_path_strip_prefix = ['file://', 'http://', 'https://']

        # if begins with unwanted path prefix strip it
        if facet_path_strip_prefix:
            for prefix in facet_path_strip_prefix:
                if docid.startswith(prefix):
                    docid = docid[len(prefix):]
                    break

        # replace backslash (i.e. windows filenames) with unix path separator
        docid = docid.replace("\\", '/')
        # replace # (i.e. uri fragment) with unix path separator
        docid = docid.replace("#", '/')
        # collapse any run of multiple slashes to a single one
        # (bugfix: a single str.replace("//", "/") pass leaves "///" as "//",
        # so loop until no double slash remains)
        while "//" in docid:
            docid = docid.replace("//", '/')

        # split into path components
        path = docid.split('/')

        if (len(path) == 1) or (len(path) == 2 and docid.endswith('/')):
            # it's only a domain / single component
            data['path0_s'] = path[0]
        else:
            # it's a path
            # a leading / on unix paths makes the first split element empty, so delete it
            if not path[0]:
                del path[0]
            for i, subpath in enumerate(path):
                if i == len(path) - 1:
                    # last element, so basename/pure filename without path
                    if subpath:  # skip empty last part of a trailing /
                        data['path_basename_s'] = subpath
                else:
                    # not last path element (=filename), so part of the directory path
                    data['path' + str(i) + '_s'] = subpath

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_pdf_ocr.py
================================================
import os.path
import sys
import subprocess
import hashlib
import tempfile
import json
import etl_plugin_core
from tesseract_cache import tesseract_cache
# Extract text from all extracted images from pdf
# if splitpages is off, return one txt instead of page based list of texts
def pdfimages2text(filename, lang='eng', verbose=False,
                   pdf_ocr=True,
                   cache=None):
    # Extract all embedded images from a PDF and OCR them page by page.
    #
    # filename: path of the PDF file
    # lang: Tesseract OCR language
    # pdf_ocr: if falsy, images are extracted and deleted but not OCRed
    # cache: optional OCR cache directory (see load_cache)
    #
    # Returns a dict mapping page number (int) -> OCR text of that page;
    # empty dict if image extraction failed or nothing was OCRed.
    ocr_txt = {}

    # try the cache first, fall back to fresh OCR on a cache miss
    if cache is not None:
        try:
            return load_cache(filename, cache, lang, pdf_ocr)
        except (FileNotFoundError, KeyError):
            if verbose:
                print('Not in PDF OCR cache, starting OCR for {}'.format(filename))

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_pdf_ocr_")

    # Extract all images of the pdf to tempdir with commandline tool
    # "pdfimages" from poppler pdf toolbox
    # -j = export as JPEG
    # -p = write page name in image filename
    result = subprocess.call(
        ['pdfimages', '-p', '-j', filename,
         ocr_temp_dirname + os.path.sep + 'image'])
    if result != 0:
        sys.stderr.write(
            "Error: Extracting images from PDF failed for {} {}"
            .format(filename, result))
        # bugfix: callers expect a single page->text dict; the old
        # "return {}, {}" tuple (leftover of an older two-value signature)
        # crashed callers at ocr_txt.items()
        return {}

    images = os.listdir(ocr_temp_dirname)
    images.sort()

    for image in images:
        imagefilename = ocr_temp_dirname + os.path.sep + image
        if pdf_ocr:
            try:
                result = tesseract_cache.get_ocr_text(
                    filename=imagefilename, lang=lang, cache_dir=cache)
                if result:
                    # page number from the extracted image filename
                    # (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])
                    append_page(ocr_txt, pagenumber, result)
            except BaseException as e:
                sys.stderr.write("Exception while OCR of PDF: {} - "
                                 "maybe corrupt image: {} - exception: {}\n"
                                 .format(filename, imagefilename, e))
        # remove the extracted image in any case so rmdir below succeeds
        os.remove(imagefilename)

    os.rmdir(ocr_temp_dirname)

    return ocr_txt
def load_cache(filename, cache, lang='eng',
               pdf_ocr=True):
    # Load previously cached OCR results for a PDF.
    #
    # The cache entry is a JSON file named "<lang>-<md5 of PDF content>.json"
    # inside the cache directory, with the page texts under key "ocr_txt".
    #
    # Raises FileNotFoundError if no cache entry exists (callers catch this
    # and fall back to fresh OCR).
    # Returns a dict page number (1-based int) -> OCR text, or None if
    # pdf_ocr is falsy.

    # context managers close the handles even if reading fails
    # (bugfix: the PDF handle was previously left open on a read error)
    with open(filename, 'rb') as pdffile:
        md5hash = hashlib.md5(pdffile.read()).hexdigest()

    ocr_cache_filename = cache + os.path.sep + \
        "{}-{}.json".format(lang, md5hash)
    with open(ocr_cache_filename) as f:
        dct = json.load(f)

    ocr_txt = None
    if pdf_ocr:
        # pages are stored as a list; re-key them as 1-based page numbers
        ocr_txt = dict(enumerate(dct["ocr_txt"], 1))
    return ocr_txt
def append_page(dct, n, page):
    # Accumulate OCR text for page n in dct; multiple image results for the
    # same page are joined with a newline.
    dct[n] = (dct[n] + '\n' + page) if n in dct else page
#
# Process plugin
#
# check if content type PDF, if so start enrich pdf process for OCR
#
class enhance_pdf_ocr(etl_plugin_core.Plugin):
    # OCR of images embedded in PDF files.
    #
    # Normally OCR of PDF images is done by the Apache Tika plugin; this
    # plugin runs as fallback (Tika exception) or when Tika OCR of PDFs is
    # disabled by config (parameter 'ocr_pdf_tika').

    # process plugin, if one of the filters matches
    filter_filename_suffixes = ['.pdf']
    filter_mimetype_prefixes = ['application/pdf']

    # how to find uris which are not enriched yet?
    # (if not enhanced on indexing but later)
    # this plugin needs to read the field id as a parameters
    # to enrich unenriched docs
    fields = ['id', 'content_type']

    # query to find documents, that were not enriched by this plugin yet
    # (since we marked documents which were OCRd with ocr_b = true
    query = ("(content_type:application/pdf*) "
             "AND NOT (etl_enhance_pdf_ocr_b:true)")

    def process(self, parameters=None, data=None):
        # Run OCR of embedded PDF images if needed.
        #
        # parameters: ETL config/state; reads 'filename', 'ocr_pdf_tika',
        #   'etl_tika_exception', 'etl_nothing_for_ocr', 'ocr_lang',
        #   'ocr_cache'; writes per-page results to parameters['enhance_pdf_ocr'].
        # data: document fields; writes concatenated OCR text to data['ocr_t'].
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = parameters.get('verbose', False)

        # no further processing, if plugin filters like for content type do not match
        if self.filter(parameters, data):
            return parameters, data

        filename = parameters['filename']

        # is OCR of embedded images by Tika enabled or disabled by config?
        ocr_pdf_tika = parameters.get('ocr_pdf_tika', True)

        # was there a Tika exception?
        tika_exception = parameters.get('etl_tika_exception', False)
        if 'etl_error_plugins_ss' in data:
            if 'enhance_extract_text_tika_server' in data['etl_error_plugins_ss']:
                tika_exception = True

        # OCR is done by Apache Tika plugin
        # If standard OCR by Tika is disabled or Tika Exception, do it here
        pdf_ocr = False

        # Do not run if no images (detected by Tika plugin)
        nothing_for_ocr = parameters.get('etl_nothing_for_ocr', False)

        if nothing_for_ocr:
            if verbose:
                print('Not running OCR for PDF, since no image(s) detected by Apache Tika')
            pdf_ocr = False
        elif tika_exception or ocr_pdf_tika == False:
            pdf_ocr = True

        if pdf_ocr:
            if verbose:
                print('Mimetype is PDF or file ending is .pdf, running OCR of embedded images')

            # explain why the fallback OCR runs
            if not ocr_pdf_tika:
                print ('OCR of embedded images in PDF by Apache Tika is disabled, so doing OCR for PDF by plugin enhance_pdf_ocr')
            elif tika_exception:
                print ('Because of Apache Tika exception, adding / trying fallback OCR for PDF by plugin enhance_pdf_ocr')

            lang = parameters.get('ocr_lang', 'eng')

            ocr_txt = {}
            try:
                # page number -> OCR text of that page
                ocr_txt = pdfimages2text(
                    filename=filename, lang=lang, verbose=verbose,
                    pdf_ocr=pdf_ocr,
                    cache=parameters.get("ocr_cache"))
            except BaseException as e:
                sys.stderr.write(
                    "Exception while OCR the PDF {} - {}\n".format(filename, e))

            # keep the per-page results for following plugins (e.g. page segmentation)
            parameters['enhance_pdf_ocr'] = ocr_txt

            # create text field ocr_t with all OCR results of all pages
            pages_content = [value for (key, value) in sorted(ocr_txt.items())]
            data['ocr_t'] = "\n".join(pages_content)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_pdf_page.py
================================================
import os
import sys
import subprocess
import tempfile
import hashlib
import etl_plugin_core
from etl import ETL
#
# by split to pages (so we have links to pages instead of documents) and get text from OCR from previous running plugin enhance_pdf_ocr and run plugins for splitting results into paragraphs and sentences
#
class enhance_pdf_page(etl_plugin_core.Plugin):
    # Split a PDF into single pages and index every page as an own document
    # (so search results can link to pages instead of whole documents).
    # Page text comes from pdftotext; per page, further plugins (language
    # detection, entity linking, multilinguality, optional NER) are run.

    # process plugin, if one of the filters matches
    filter_filename_suffixes = ['.pdf']
    filter_mimetype_prefixes = ['application/pdf']

    # how to find uris which are not enriched yet?
    # (if not enhanced on indexing but later)
    # this plugin needs to read the field id as a parameters to enrich unenriched docs
    fields = ['id', 'content_type']

    # query to find documents, that were not enriched by this plugin yet
    # (since we marked documents which were OCRd with ocr_b = true
    query = "content_type: application\/pdf* AND NOT enhance_pdf_page_b:true"

    def process(self, parameters=None, data=None):
        # Extract each page of the PDF and index it as an own document.
        #
        # parameters: reads 'id', 'filename', 'plugins'; data: page count is
        # written to data['pages_i'].
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = parameters.get('verbose', False)

        # no further processing, if plugin filters like for content type do not match
        if self.filter(parameters, data):
            return parameters, data

        if verbose:
            print('Mimetype or filename suffix is PDF, extracting single pages for segmentation')

        docid = data['id'] if 'id' in data else parameters['id']
        filename = parameters['filename']

        # defaults, if pdfinfo will not detect them
        pages = 1
        title = 'No title'
        author = None

        # get pagecount, title and author with the pdfinfo command line tool
        # (output is a text with one line per parameter)
        pdfinfo = subprocess.check_output(
            ['pdfinfo', '-enc', 'UTF-8', filename])
        pdfinfo = pdfinfo.decode(encoding='UTF-8')
        for line in pdfinfo.splitlines():
            line = line.strip()
            if line.startswith('Pages:'):
                pages = int(line.split()[1])
            if line.startswith('Title:'):
                title = line.replace("Title:", '', 1).strip()
            if line.startswith('Author:'):
                author = line.replace("Author:", '', 1).strip()

        etl = ETL()

        # temp filenames are based on the MD5 of the PDF filename
        # (loop-invariant, so computed once)
        md5hash = hashlib.md5(filename.encode('utf-8')).hexdigest()

        # export and index each page
        for pagenumber in range(1, pages + 1):
            if verbose:
                print("Extracting PDF page {} of {}".format(pagenumber, pages))

            temp_filename = tempfile.gettempdir() + os.path.sep + \
                "opensemanticetl_pdftotext_" + md5hash + "_" + str(pagenumber)

            # call pdftotext to write the text of this single page into the tempfile
            try:
                subprocess.check_call(['pdftotext', '-enc', 'UTF-8', '-f', str(
                    pagenumber), '-l', str(pagenumber), filename, temp_filename])
            except BaseException as e:
                sys.stderr.write(
                    "Exception extracting text from PDF page {}: {}\n".format(pagenumber, e))
                # bugfix: without the tempfile there is no text to read, so
                # skip this page instead of crashing on open() below
                continue

            # read text from tempfile (handle closed even if reading fails)
            with open(temp_filename, "r", encoding="utf-8") as f:
                text = f.read()
            os.remove(temp_filename)

            partdocid = docid + '#page=' + str(pagenumber)

            partparameters = parameters.copy()
            partparameters['plugins'] = ['enhance_path', 'enhance_detect_language_tika_server',
                                         'enhance_entity_linking', 'enhance_multilingual']
            # optional NER plugins only if enabled for the whole document
            if 'enhance_ner_spacy' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_spacy')
            if 'enhance_ner_stanford' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_stanford')

            pagedata = {}
            pagedata['id'] = partdocid
            pagedata['page_i'] = pagenumber
            pagedata['pages_i'] = pages
            pagedata['container_s'] = docid
            pagedata['title_txt'] = title
            if author:
                pagedata['author_ss'] = author
            pagedata['content_type_group_ss'] = "Page"
            pagedata['content_type_ss'] = "PDF page"
            pagedata['content_txt'] = text

            if verbose:
                print("Indexing extracted page {}".format(pagenumber))

            # index page
            try:
                partparameters, pagedata = etl.process(
                    partparameters, pagedata)
            except BaseException as e:
                sys.stderr.write(
                    "Exception adding PDF page {} : {}".format(pagenumber, e))

        data['pages_i'] = pages

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_pdf_page_preview.py
================================================
import sys
import subprocess
from pathlib import Path
import hashlib
import etl_plugin_core
# generate single page PDF for each page of the full PDF for preview so client has not to load full pdf for previewing a page
class enhance_pdf_page_preview(etl_plugin_core.Plugin):
    # Generate a single page PDF for each page of the full PDF for preview,
    # so the client does not have to load the full PDF to preview one page.

    # process plugin, if one of the filters matches
    filter_filename_suffixes = ['.pdf']
    filter_mimetype_prefixes = ['application/pdf']

    def process(self, parameters=None, data=None):
        # Burst the PDF into per-page PDFs below the thumbnail directory.
        #
        # Writes the thumbnail subdirectory name to data['etl_thumbnails_s']
        # on success; on failure only logs to stderr (best effort).
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = parameters.get('verbose', False)

        # no further processing, if plugin filters like for content type do not match
        if self.filter(parameters, data):
            return parameters, data

        if verbose:
            print('Mimetype or filename suffix is PDF, extracting single pages for preview')

        docid = data['id'] if 'id' in data else parameters['id']
        filename = parameters['filename']

        thumbnail_dir = '/var/opensemanticsearch/media/thumbnails'
        if not thumbnail_dir.endswith('/'):
            thumbnail_dir += '/'

        # per-document subdirectory named by the MD5 of the document ID
        md5hash = hashlib.md5(docid.encode('utf-8')).hexdigest()
        thumbnail_subdir = md5hash
        Path(thumbnail_dir + thumbnail_subdir).mkdir(parents=True, exist_ok=True)

        if verbose:
            print("Generating single page PDF for previews from {} for {} to {}".format(
                filename, docid, thumbnail_dir + thumbnail_subdir))

        # call pdftk burst to write one PDF per page (1.pdf, 2.pdf, ...)
        try:
            subprocess.check_call(
                ['pdftk', filename, 'burst', 'output', thumbnail_dir + thumbnail_subdir + '/%d.pdf'])
            data['etl_thumbnails_s'] = thumbnail_subdir
        except BaseException as e:
            # bugfix: include the exception in the message (was unused) and
            # fix the "genarating" typo
            sys.stderr.write(
                "Exception while generating single page PDFs by pdftk burst: {}\n".format(e))

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_pst.py
================================================
import sys
import hashlib
import tempfile
import os
import shutil
import subprocess
import etl_plugin_core
from etl_file import Connector_File
#
# Extract emails from Outlook PST file
#
class enhance_pst(etl_plugin_core.Plugin):
    # Extract emails (and their attachments) from an Outlook PST file and
    # index every extracted file as an own document.

    # process plugin, if one of the filters matches
    filter_filename_suffixes = ['.pst']
    filter_mimetype_prefixes = ['application/vnd.ms-outlook-pst']

    def process(self, parameters=None, data=None):
        # Unpack the PST with the external tool "readpst" into a temp dir,
        # then index each extracted mail/attachment via Connector_File.
        #
        # parameters: reads 'filename', 'id', optional 'tmp' (temp base dir)
        # and 'container'; data is returned unchanged.
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        # no further processing, if plugin filters like for content type do not match
        if self.filter(parameters, data):
            return parameters, data

        if verbose:
            print("Mimetype or file ending seems Outlook PST file, starting extraction of emails")

        pstfilename = parameters['filename']

        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            if not os.path.exists(system_temp_dirname):
                os.mkdir(system_temp_dirname)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # temp dir name made unique per process and per document id (MD5)
        h = hashlib.md5(parameters['id'].encode('UTF-8'))
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_pst_" + \
            str(os.getpid()) + "_" + h.hexdigest()
        if not os.path.exists(temp_dirname):
            os.mkdir(temp_dirname)

        # start external PST extractor / converter
        result = subprocess.call(
            ['readpst', '-S', '-D', '-o', temp_dirname, pstfilename])
        if not result == 0:
            sys.stderr.write(
                "Error: readpst failed for {}".format(pstfilename))

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a ZIP or PST before (if this PST is inside another ZIP or PST)
        if not 'container' in connector.config:
            connector.config['container'] = pstfilename

        # index every extracted file from the temp dir
        for dirName, subdirList, fileList in os.walk(temp_dirname):
            if verbose:
                print('Scanning directory: %s' % dirName)
            for fileName in fileList:
                if verbose:
                    print('Scanning file: %s' % fileName)
                try:
                    # replace temp dirname from indexed id
                    contained_dirname = dirName.replace(temp_dirname, '', 1)

                    # build a virtual filename pointing to original PST file
                    if contained_dirname:
                        contained_dirname = contained_dirname + os.path.sep
                    else:
                        contained_dirname = os.path.sep

                    connector.config['id'] = parameters['id'] + \
                        contained_dirname + fileName

                    contained_filename = dirName + os.path.sep + fileName

                    # E-mails filenames are pure number
                    # Attachment file names are number-filename
                    # if temp_filename without - in filename, its a mail file
                    # rename to suffix .eml so Tika will extract more metadata like from and to
                    if not '-' in fileName:
                        os.rename(contained_filename,
                                  contained_filename + '.eml')
                        contained_filename += '.eml'
                        connector.config['id'] += '.eml'

                    try:
                        connector.index_file(filename=contained_filename)
                    except KeyboardInterrupt:
                        raise KeyboardInterrupt
                    except BaseException as e:
                        sys.stderr.write("Exception while indexing contained content {} from {} : {}\n".format(
                            fileName, connector.config['container'], e.args[0]))

                    # extracted file was indexed (or failed), delete it
                    os.remove(contained_filename)

                except BaseException as e:
                    sys.stderr.write(
                        "Exception while indexing file {} : {}\n".format(fileName, e.args[0]))

        # delete the temp dir with all remaining extracted files
        shutil.rmtree(temp_dirname)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_rdf.py
================================================
import sys
import logging
import rdflib
import etl_plugin_core
# define used ontologies / standards / properties
skos = rdflib.Namespace('http://www.w3.org/2004/02/skos/core#')
owl = rdflib.Namespace('http://www.w3.org/2002/07/owl#')
import etl
from etl import ETL
# Import RDF graph file granular, not only as a whole single file:
# for every entity (subject) own document with properties (predicates) as facets and its objects as values
class enhance_rdf(etl_plugin_core.Plugin):
def __init__(self, verbose=False):
self.verbose = verbose
self.labelProperties = (rdflib.term.URIRef(u'http://www.w3.org/2004/02/skos/core#prefLabel'), rdflib.term.URIRef(u'http://www.w3.org/2000/01/rdf-schema#label'),
rdflib.term.URIRef(u'http://www.w3.org/2004/02/skos/core#altLabel'), rdflib.term.URIRef(u'http://www.w3.org/2004/02/skos/core#hiddenLabel'))
#
# get all labels, alternate labels / synonyms for the URI/subject, if not there, use subject (=URI) as default
#
def get_labels(self, subject):
labels = []
# append RDFS.label
# get all labels for this obj
for label in self.graph.objects(subject=subject, predicate=rdflib.RDFS.label):
labels.append(str(label))
#
# append SKOS labels
#
# append SKOS prefLabel
skos = rdflib.Namespace('http://www.w3.org/2004/02/skos/core#')
for label in self.graph.objects(subject=subject, predicate=skos['prefLabel']):
labels.append(str(label))
# append SKOS altLabels
for label in self.graph.objects(subject=subject, predicate=skos['altLabel']):
labels.append(str(label))
# append SKOS hiddenLabels
for label in self.graph.objects(subject=subject, predicate=skos['hiddenLabel']):
labels.append(str(label))
return labels
#
# Get indexable full text(s) / label(s) instead of URI references
#
def get_values(self, obj):
values = []
# since we want full text search we want not to use ID/URI but all labels for indexing
# if type not literal but URI reference, add label(s)
if type(obj) == rdflib.URIRef:
# get labels of this object, therefore it is the subject parameter for getlabels()
values = self.get_labels(subject=obj)
if not values:
if self.verbose:
print("No label for this object, using URI {}".format(obj))
values = str(obj)
elif type(obj) == rdflib.term.Literal:
values = str(obj)
# if no values or labels, use the object / URI
if not values:
if self.verbose:
print("No label or URI for this object, using object {}".format(obj))
print("Data type of RDF object: {}".format(type(obj)))
values = str(obj)
return values
# best/preferred label as title
def get_preferred_label(self, subject, lang='en'):
preferred_label = self.graph.preferredLabel(
subject=subject, lang=lang, labelProperties=self.labelProperties)
# if no label in preferred language, try with english, if not preferred lang is english yet)
if not preferred_label and not lang == 'en':
preferred_label = self.graph.preferredLabel(
subject=subject, lang='en', labelProperties=self.labelProperties)
# use label from some other language
if not preferred_label:
preferred_label = self.graph.preferredLabel(
subject=subject, labelProperties=self.labelProperties)
# if no label, use URI
if preferred_label:
# since return is tuple with type and label take only the label
preferred_label = preferred_label[0][1]
else:
preferred_label = subject
return str(preferred_label)
#
# ETL knowledge graph to full text search index
#
# Index each entity / subject with all its properties/predicates as facets and objects (dereference URIs by their labels) as values
def etl_graph(self, parameters):
if self.verbose:
print("Graph has {} triples.".format(len(self.graph)))
count_triple = 0
count_subjects = 0
part_parameters = {}
part_parameters['plugins'] = []
part_parameters['export'] = parameters['export']
property2facet = {}
if 'property2facet' in parameters:
property2facet = parameters['property2facet']
etl_processor = ETL()
etl_processor.verbose = self.verbose
class_properties = []
class_properties.append(rdflib.term.URIRef(
u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
class_properties.append(rdflib.term.URIRef(
u'http://www.wikidata.org/prop/direct/P31'))
# since there can be multiple triples/values for same property in/from different graphs or graph describes existing other file/document,
# do not overwrite document but add value to existent document & values of the facet/field/property
part_parameters['add'] = True
# use SPARQL query with distinct to get subjects only once
res = self.graph.query(
"""SELECT DISTINCT ?subject
WHERE {
?subject ?predicate ?object .
}""")
for row in res:
count_subjects += 1
if self.verbose:
print("Importing entity / subject {}".format(count_subjects))
# get subject of the concept from first column
subj = row[0]
if self.verbose:
print("Processing RDF subject {}".format(subj))
part_data = {}
part_data['content_type_group_ss'] = 'Knowledge graph'
# subject as URI/ID
part_parameters['id'] = str(subj)
preferred_label = self.get_preferred_label(subject=subj)
part_data['title_txt'] = preferred_label
count_subject_triple = 0
# get all triples for this subject
for pred, obj in self.graph.predicate_objects(subject=subj):
count_triple += 1
count_subject_triple += 1
if self.verbose:
print("Importing subjects triple {}".format(
count_subject_triple))
print("Predicate / property: {}".format(pred))
print("Object / value: {}".format(obj))
try:
# if class add preferredlabel of this entity to facet of its class (RDF rdf:type or Wikidata "instance of" (Property:P31)),
# so its name (label) will be available in entities view and as filter for faceted search
if pred in class_properties:
class_facet = str(obj)
# map class to facet, if mapping for class exist
if class_facet in property2facet:
class_facet = property2facet[class_facet]
if class_facet in parameters['facets']:
part_data['content_type_ss'] = 'Knowledge graph class {}'.format(
parameters['facets'][class_facet]['label'])
etl.append(data=part_data, facet=class_facet, values=preferred_label)
#
# Predicate/property to facet/field
#
# set Solr datatype strings so facets not available yet in Solr schema can be inserted automatically (dynamic fields) with right datatype
facet = str(pred) + '_ss'
facet_uri = facet + '_uri_ss'
facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'
if self.verbose:
print("Facet: {}".format(facet))
#
# get values or labels of this object
#
values = self.get_values(obj=obj)
if self.verbose:
print("Values: {}".format(values))
# insert or append value (object of triple) to data
etl.append(data=part_data, facet=facet, values=values)
# if object is reference/URI append URI
if type(obj) == rdflib.URIRef:
uri = str(obj)
etl.append(data=part_data, facet=facet_uri, values=uri)
# append mixed field with preferred label and URI of the object for disambiguation of different Entities/IDs/URIs with same names/labels in faceted search
preferredlabel_and_uri = "{} <{}>".format(
self.get_preferred_label(subject=obj), str(obj))
else:
preferredlabel_and_uri = self.get_preferred_label(
subject=obj)
etl.append(
data=part_data, facet=facet_preferred_label_and_uri, values=preferredlabel_and_uri)
except KeyboardInterrupt:
raise KeyboardInterrupt
except BaseException as e:
sys.stderr.write("Exception while triple {} of subject {}: {}\n".format(
count_subject_triple, subj, e))
# index subject
etl_processor.process(part_parameters, part_data)
def etl_graph_file(self, docid, filename, parameters=None):
    # Parse an RDF file into a fresh graph and run the graph ETL on it.
    #
    # :param docid: document id (not used here; kept for interface compatibility)
    # :param filename: path of the RDF file to parse
    # :param parameters: ETL parameter dict (defaults to an empty dict)
    if parameters is None:
        parameters = {}

    graph = rdflib.Graph()
    graph.parse(filename)

    self.graph = graph
    self.etl_graph(parameters=parameters)
def process(self, parameters=None, data=None):
    """ETL entry point: if the document is an RDF/XML graph, import its triples.

    :param parameters: ETL parameters; must contain 'id' and 'filename',
                       may contain 'verbose' and 'content_type_ss'
    :param data: dict of extracted fields; may contain 'content_type_ss'
    :return: (parameters, data), unchanged apart from the import side effects
    """
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    # set the instance flag directly (a dead local `verbose` was removed:
    # it was assigned False but never read)
    if 'verbose' in parameters:
        if parameters['verbose']:
            self.verbose = True

    # get parameters
    docid = parameters['id']
    filename = parameters['filename']

    # prefer the content type detected in data over the one in parameters
    mimetype = ''
    if 'content_type_ss' in data:
        mimetype = data['content_type_ss']
    elif 'content_type_ss' in parameters:
        mimetype = parameters['content_type_ss']

    # if connector returns a list, use only first value (which is the only entry of the list)
    if isinstance(mimetype, list):
        mimetype = mimetype[0]

    # todo: add other formats like turtle
    # if mimetype is graph, call graph import
    if mimetype.lower() == "application/rdf+xml":
        self.etl_graph_file(docid, filename, parameters=parameters)

    return parameters, data
================================================
FILE: src/opensemanticetl/enhance_rdf_annotations_by_http_request.py
================================================
import os
import sys
import hashlib
import urllib
import rdflib
from rdflib import URIRef
# Do templating of metaserver url for id
def metaserver_url(metaserver, docid):
    # Build the metadata server URL for a document by filling in the
    # [uri] and [uri_md5] placeholders of the configured URL template.
    quoted_id = urllib.parse.quote_plus(docid)
    quoted_md5 = urllib.parse.quote_plus(
        hashlib.md5(docid.encode("utf-8")).hexdigest())
    return metaserver.replace('[uri]', quoted_id).replace('[uri_md5]', quoted_md5)
# get the modification date of meta data
# todo: check all metaservers, not only the last one and return latest date
def getmeta_modified(metaservers, docid, verbose=False):
    """Return the latest metadata modification date for docid, or False if none.

    Queries every configured metadata server (fixing the old behavior, noted
    in the former todo, of checking only the last one) and keeps the latest
    Semantic MediaWiki wikiPageModificationDate found.

    :param metaservers: metadata server URL template, or a list of them
    :param docid: document id/URI used to fill the URL template(s)
    :param verbose: print debug info if True
    :return: latest modification date as string, or False if none found
    """
    # accept a single server (string) or a list of servers
    if isinstance(metaservers, str):
        metaservers = [metaservers]

    moddate = False
    moddate_ref = URIRef(
        "http://semantic-mediawiki.org/swivt/1.0#wikiPageModificationDate")

    for metaserver in metaservers:
        metaurl = metaserver_url(metaserver, docid)

        if verbose:
            print("Getting Meta from {}".format(metaurl))

        try:
            g = rdflib.Graph()
            g.parse(metaurl)

            # if semantic mediawiki modification date field, take this as date
            for subj, pred, obj in g.triples((None, moddate_ref, None)):
                # keep only the latest date; swivt dates are ISO 8601 strings,
                # so lexicographic comparison orders them chronologically
                if moddate is False or str(obj) > moddate:
                    moddate = str(obj)
                    if verbose:
                        print("Extracted modification date: {}".format(moddate))
        except BaseException as e:
            # use e instead of e.args[0], which raises IndexError for
            # exceptions constructed without arguments
            sys.stderr.write(
                "Exception while getting metadata modification time: {}\n".format(e))

    if verbose and not moddate:
        print("No modification date for metadata")

    return moddate
# Get tagging and annotation from metadata server
def getmeta_rdf_from_server(metaserver, data, property2facet, docid, verbose=False):
    """Get tagging and annotations for docid from an RDF metadata server.

    Parses the RDF returned by the metadata server URL and, for every RDF
    property configured in property2facet, appends the triple objects to the
    mapped Solr facet field in data.

    :param metaserver: URL template of the metadata server
    :param data: dict of Solr fields (mutated and returned)
    :param property2facet: mapping RDF property URI -> Solr facet/field name
    :param docid: document id/URI used to fill the URL template
    :param verbose: print debug info if True
    :return: the updated data dict
    """
    moddate = False

    metaurl = metaserver_url(metaserver, docid)

    if verbose:
        print("Getting Meta from {}".format(metaurl))

    g = rdflib.Graph()
    g.parse(metaurl)

    # Print infos
    if verbose:
        print("Meta graph has {} statements.".format(len(g)))
        for subj, pred, obj in g:
            try:
                # bugfix: toPython() was referenced without calling it, so the
                # bound method object was printed instead of the value
                print("{} : {}".format(pred, obj.toPython()))
            except BaseException as e:
                sys.stderr.write(
                    "Exception while printing triple: {}\n".format(e))

    # make a Solr literal for each RDF triple whose predicate is one of the configured properties
    for facet in property2facet:
        # if this predicate is configured as facet, add literal with pred as facet name and object as value
        try:
            if verbose:
                print('Checking Facet {}'.format(facet))

            facetRef = URIRef(facet)
            for subj, pred, obj in g.triples((None, facetRef, None)):
                try:
                    # add the facet with object as value
                    solr_facet = property2facet[facet]
                    if verbose:
                        print("Adding Solr facet {} with the object {}".format(
                            solr_facet, obj))
                    if solr_facet in data:
                        data[solr_facet].append(obj.toPython())
                    else:
                        data[solr_facet] = [obj.toPython()]
                except BaseException as e:
                    # bugfix: message was missing a separator between predicate
                    # and exception text; use e instead of e.args[0] (IndexError
                    # risk for exceptions without arguments)
                    sys.stderr.write(
                        "Exception while checking predicate {}: {}\n".format(pred, e))
        except BaseException as e:
            sys.stderr.write(
                "Exception while checking a part of metadata graph: {}\n".format(e))

    # if semantic mediawiki modification date field, take this as date
    moddateRef = URIRef(
        "http://semantic-mediawiki.org/swivt/1.0#wikiPageModificationDate")
    if (None, moddateRef, None) in g:
        for subj, pred, obj in g.triples((None, moddateRef, None)):
            moddate = obj.toPython()
            # todo: transform date format to date and in exporter date to Solr date string format
            #data['meta_modified_dt'] = str(moddate)
            if verbose:
                print("Extracted modification date: {}".format(moddate))
    elif verbose:
        print("No semantic mediawiki modification date")

    return data
# Get tagging and annotation from metadata server
class enhance_rdf_annotations_by_http_request(object):
    # ETL plugin: fetch tagging and annotations for a document
    # from one or more RDF metadata servers.

    def process(self, parameters=None, data=None):
        """Merge RDF annotations from the configured metadata server(s) into data."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = bool(parameters.get('verbose', False))

        # get parameters
        docid = parameters['id']

        metaserver = parameters['metaserver']
        # environment variable overrides the configured metadata server
        env_metaserver = os.getenv('OPEN_SEMANTIC_ETL_METADATA_SERVER')
        if env_metaserver:
            metaserver = env_metaserver

        property2facet = parameters['property2facet']

        # normalize a single server (string) to a list of servers
        servers = [metaserver] if isinstance(metaserver, str) else metaserver

        for server in servers:
            # get and add metadata
            data = getmeta_rdf_from_server(
                metaserver=server, data=data, property2facet=property2facet,
                docid=docid, verbose=verbose)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_regex.py
================================================
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re
import etl_plugin_core
def regex2facet(data, text, regex, group, facet, verbose=False):
    """Search text with regex (case-insensitive) and append each match group to facet in data.

    :param data: dict of Solr fields (mutated via etl_plugin_core.append)
    :param text: text to be searched
    :param regex: regular expression pattern
    :param group: match group number to extract (0 = whole match)
    :param facet: Solr facet/field name to append matches to
    :param verbose: print debug info if True
    """
    if verbose:
        print("Checking regex {} for facet {}".format(regex, facet))

    # bugfix: the former `if matches:` check was useless — re.finditer()
    # returns an iterator, which is truthy even when there are no matches
    for match in re.finditer(regex, text, re.IGNORECASE):
        # bugfix: predefine value so the except handler cannot hit a
        # NameError when match.group() itself raises
        value = None
        try:
            value = match.group(group)
            if verbose:
                print("Found regex {} with value {} for facet {}".format(
                    regex, value, facet))
            etl_plugin_core.append(data, facet, value)
        except BaseException as e:
            print("Exception while adding value {} from regex {} and group {} to facet {}:".format(
                value, regex, group, facet))
            # use e instead of e.args[0] (IndexError risk for empty args)
            print(e)
# opens a tab with regexes and facets
def readregexesfromfile(data, text, filename, verbose=False):
    """Read a tab-separated regex list file and run each regex on the text.

    Each non-empty, non-comment line has the columns:
    regex [TAB facetname [TAB match group number]]
    Defaults: facet 'tag_ss', group 0 (the whole match).

    :param data: dict of Solr fields (mutated by regex2facet)
    :param text: text to be searched
    :param filename: path of the regex list file
    :param verbose: print debug info if True
    """
    # bugfix: context manager guarantees the file is closed even if an
    # unexpected exception escapes the per-line handling below
    with open(filename) as listfile:
        # search all the lines
        for line in listfile:
            try:
                line = line.strip()
                # ignore empty lines and comment lines (starting with #)
                if line and not line.startswith("#"):
                    facet = 'tag_ss'
                    columns = line.split("\t")
                    regex = columns[0]
                    if len(columns) > 1:
                        facet = columns[1]
                    if len(columns) > 2:
                        group = int(columns[2])
                    else:
                        group = 0
                    regex2facet(data=data, text=text, regex=regex,
                                group=group, facet=facet, verbose=verbose)
            except BaseException as e:
                print("Exception while checking line {} of regexlist {}:".format(
                    line, filename))
                # use e instead of e.args[0] (IndexError risk for empty args)
                print(e)
#
# add to configured facet, if entry in list is in text
#
class enhance_regex(object):
    # ETL plugin: add values to configured facets for every match of the
    # regular expressions listed in the configured regex list files.

    def process(self, parameters=None, data=None):
        """Run every configured regex list file against the document's text."""
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = bool(parameters.get('verbose', False))

        regexlists = parameters.get('regex_lists', {})

        # collect/copy to be analyzed text from all fields
        text = etl_plugin_core.get_text(data=data)

        for regexlistfile in regexlists:
            try:
                readregexesfromfile(data=data, text=text,
                                    filename=regexlistfile, verbose=verbose)
            except BaseException as e:
                print("Exception while checking regex list {}:".format(regexlistfile))
                print(e.args[0])

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_sentence_segmentation.py
================================================
import json
import os
import requests
import sys
import time
from etl import ETL
#
# split text to sentences
#
class enhance_sentence_segmentation(object):
    #
    # Split the document text to sentences via the Spacy REST service and
    # run each sentence through the ETL chain as an own child document.
    #

    def process(self, parameters=None, data=None):
        """Segment the document text into sentences and index each one.

        Posts the concatenated text fields to the Spacy service '/sents'
        endpoint (retrying the connection with exponential backoff), then
        indexes every returned sentence as a separate document with id
        '<docid>#sentence<N>'. Finally sets data['sentences_i'] to the
        number of sentences.

        :param parameters: ETL parameters; must contain 'plugins' and (if
                           data has no 'id') 'id'; may contain 'verbose',
                           'spacy_ner_classifier_default', 'spacy_ner_classifiers'
        :param data: extracted document fields
        :return: (parameters, data)
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        # NOTE(review): `verbose` is computed here but never read in this
        # method — confirm whether it should be passed on or removed
        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        # document id: prefer the one in data, fall back to parameters
        if 'id' in data:
            docid = data['id']
        else:
            docid = parameters['id']

        # default classifier
        classifier = 'en_core_web_sm'
        if 'spacy_ner_classifier_default' in parameters:
            classifier = parameters['spacy_ner_classifier_default']

        # set language specific classifier, if configured and document language detected
        if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
            # is a language specific classifier configured for the detected language?
            if data['language_s'] in parameters['spacy_ner_classifiers']:
                classifier = parameters['spacy_ner_classifiers'][data['language_s']]

        # concatenate all text fields to be analyzed, newline-separated
        analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']

        text = ''
        for field in analyse_fields:
            if field in data:
                text = "{}{}\n".format(text, data[field])

        # extract sentences from text via the Spacy services REST API;
        # the environment variable overrides the default local endpoint
        url = "http://localhost:8080/sents"
        if os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER'):
            url = os.getenv('OPEN_SEMANTIC_ETL_SPACY_SERVER') + '/sents'

        headers = {'content-type': 'application/json'}
        d = {'text': text, 'model': classifier}

        retries = 0
        retrytime = 1
        # wait time until next retry will be doubled until reaching maximum of 120 seconds (2 minutes) until next retry
        retrytime_max = 120

        # retry until the POST to the Spacy service succeeds; only
        # ConnectionError is retried — HTTP error statuses raised by
        # raise_for_status() propagate to the caller
        no_connection = True
        while no_connection:
            try:
                if retries > 0:
                    # sleep first, then double the wait time (capped) for the next round
                    print(
                        'Retrying to connect to Spacy services in {} second(s).'.format(retrytime))
                    time.sleep(retrytime)
                    retrytime = retrytime * 2
                    if retrytime > retrytime_max:
                        retrytime = retrytime_max
                response = requests.post(url, data=json.dumps(d), headers=headers)
                # if bad status code, raise exception
                response.raise_for_status()
                no_connection = False
            except requests.exceptions.ConnectionError as e:
                retries += 1
                sys.stderr.write(
                    "Connection to Spacy services (will retry in {} seconds) failed. Exception: {}\n".format(retrytime, e))

        # assumes the service returns a JSON list of sentence strings — TODO confirm
        sentences = response.json()

        etl = ETL()

        sentencenumber = 0
        for sentence in sentences:
            sentencenumber += 1

            # child document id: parent docid plus sentence number fragment
            partdocid = docid + '#sentence' + str(sentencenumber)

            # run only a reduced plugin chain on each sentence document
            partparameters = parameters.copy()
            partparameters['plugins'] = ['enhance_path', 'enhance_detect_language_tika_server',
                                         'enhance_entity_linking', 'enhance_multilingual']

            # keep NER plugins for the sentence documents if enabled for the parent
            if 'enhance_ner_spacy' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_spacy')
            if 'enhance_ner_stanford' in parameters['plugins']:
                partparameters['plugins'].append('enhance_ner_stanford')

            sentencedata = {}
            sentencedata['id'] = partdocid
            # link the sentence document to its containing document
            sentencedata['container_s'] = docid
            if 'author_ss' in data:
                sentencedata['author_ss'] = data['author_ss']
            sentencedata['content_type_group_ss'] = "Sentence"
            sentencedata['content_type_ss'] = "Sentence"
            sentencedata['content_txt'] = sentence

            # index sentence
            try:
                partparameters, sentencedata = etl.process(
                    partparameters, sentencedata)
            except BaseException as e:
                sys.stderr.write(
                    "Exception adding sentence {} : {}".format(sentencenumber, e))

        # number of sentences extracted from this document
        data['sentences_i'] = sentencenumber

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_warc.py
================================================
import hashlib
import tempfile
import os
import sys
import shutil
import time
from warcio.archiveiterator import ArchiveIterator
import etl_plugin_core
from etl_file import Connector_File
class enhance_warc(etl_plugin_core.Plugin):
    """Extract the archived HTTP responses of a WARC web archive and index each one."""

    # process plugin, if one of the filters matches
    filter_filename_suffixes = ['.warc', '.warc.gz']
    filter_mimetype_prefixes = ['application/warc']

    def process(self, parameters=None, data=None):
        """Unpack every 'response' record of the WARC file to a temp file and index it.

        :param parameters: ETL parameters; must contain 'filename' and 'id',
                           may contain 'verbose', 'tmp' and 'container'
        :param data: extracted document fields (returned unchanged)
        :return: (parameters, data)
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        verbose = False
        if 'verbose' in parameters:
            if parameters['verbose']:
                verbose = True

        # no further processing, if plugin filters like for content type do not match
        if self.filter(parameters, data):
            return parameters, data

        warcfilename = parameters['filename']

        # create temp dir where to unwarc the archive
        if 'tmp' in parameters:
            system_temp_dirname = parameters['tmp']
            # exist_ok avoids the exists()/mkdir() race of the old code
            os.makedirs(system_temp_dirname, exist_ok=True)
        else:
            system_temp_dirname = tempfile.gettempdir()

        # we build temp dirname ourselfes instead of using system_temp_dirname so we can use configurable / external tempdirs
        h = hashlib.md5(parameters['id'].encode('UTF-8'))
        temp_dirname = system_temp_dirname + os.path.sep + \
            "opensemanticetl_enhancer_warc_" + h.hexdigest()
        os.makedirs(temp_dirname, exist_ok=True)

        # prepare document processing
        connector = Connector_File()
        connector.verbose = verbose
        connector.config = parameters.copy()

        # only set container if not yet set by a zip before (if this zip is inside another zip)
        if 'container' not in connector.config:
            connector.config['container'] = warcfilename

        i = 0
        with open(warcfilename, 'rb') as stream:
            for record in ArchiveIterator(stream):
                i += 1
                if record.rec_type == 'response':
                    # bugfix: header dump was an unconditional debug print,
                    # now only emitted in verbose mode
                    if verbose:
                        print(record.rec_headers)

                    # write WARC record content to tempfile
                    tempfilename = temp_dirname + \
                        os.path.sep + 'warcrecord' + str(i)
                    # with-statement guarantees the tempfile is closed
                    with open(tempfilename, 'wb') as tmpfile:
                        tmpfile.write(record.content_stream().read())

                    # set last modification time of the file to WARC-Date
                    try:
                        last_modified = time.mktime(time.strptime(
                            record.rec_headers.get_header('WARC-Date'), '%Y-%m-%dT%H:%M:%SZ'))
                        os.utime(tempfilename, (last_modified, last_modified))
                    except BaseException as e:
                        sys.stderr.write("Exception while reading filedate to warc content {} from {} : {}\n".format(
                            tempfilename, connector.config['container'], e))

                    # set id (URL and WARC Record ID)
                    uri = record.rec_headers.get_header('WARC-Target-URI')
                    if not uri.endswith('/'):
                        uri += '/'
                    connector.config['id'] = uri + record.rec_headers.get_header('WARC-Record-ID')

                    # index the extracted file
                    try:
                        connector.index_file(filename=tempfilename)
                    except KeyboardInterrupt:
                        raise KeyboardInterrupt
                    except BaseException as e:
                        sys.stderr.write("Exception while indexing warc content {} from {} : {}\n".format(
                            tempfilename, connector.config['container'], e))

                    os.remove(tempfilename)

        shutil.rmtree(temp_dirname)

        return parameters, data
================================================
FILE: src/opensemanticetl/enhance_xml.py
================================================
import xml.etree.ElementTree as ElementTree
import os.path
import sys
class enhance_xml(object):
def elements2data(self, element, data, path="xml"):
path += "/" + element.tag
fieldname = path + '_ss'
text = element.text.strip()
if text:
if fieldname in data:
data[fieldname].append(text)
else:
data[fieldname] = [text]
for child in element:
data = self.elements2data(element=child, path=path, data=data)
return data
# get xml filename by mapping configuration
def get_xml_filename(self, filename, mapping):
    """Derive the XML metadata filename for a document from a mapping template.

    :param filename: path of the document file
    :param mapping: template containing %DIRNAME% and/or %BASENAME% placeholders
    :return: the resolved XML filename if that file exists, else False
    """
    dirname = os.path.dirname(filename)
    basename = os.path.basename(filename)

    xmlfilename = mapping
    xmlfilename = xmlfilename.replace('%DIRNAME%', dirname)
    # bugfix: %BASENAME% was replaced with the dirname instead of the basename
    xmlfilename = xmlfilename.replace('%BASENAME%', basename)

    if not os.path.isfile(xmlfilename):
        xmlfilename = False

    return xmlfilename
def process(self, parameters=None, data=None):
if parameters is None:
parameters = {}
if data is None:
data = {}
verbose = F
gitextract_2awl829e/
├── .github/
│ └── FUNDING.yml
├── .gitignore
├── .gitmodules
├── DEBIAN/
│ ├── conffiles
│ ├── control
│ ├── postinst
│ └── prerm
├── Dockerfile
├── LICENSE
├── build-deb
├── docker-compose.test.yml
├── docker-compose.ubuntu.test.yml
├── docker-entrypoint.sh
├── etc/
│ ├── opensemanticsearch/
│ │ ├── blacklist/
│ │ │ ├── blacklist-url
│ │ │ ├── blacklist-url-prefix
│ │ │ ├── blacklist-url-regex
│ │ │ ├── blacklist-url-suffix
│ │ │ ├── enhance_extract_law/
│ │ │ │ └── blacklist-lawcode-if-no-clause
│ │ │ ├── enhance_zip/
│ │ │ │ ├── blacklist-contenttype
│ │ │ │ ├── blacklist-contenttype-prefix
│ │ │ │ ├── blacklist-contenttype-regex
│ │ │ │ ├── blacklist-contenttype-suffix
│ │ │ │ ├── whitelist-contenttype
│ │ │ │ ├── whitelist-contenttype-prefix
│ │ │ │ ├── whitelist-contenttype-regex
│ │ │ │ └── whitelist-contenttype-suffix
│ │ │ ├── textanalysis/
│ │ │ │ ├── blacklist-fieldname
│ │ │ │ ├── blacklist-fieldname-prefix
│ │ │ │ └── blacklist-fieldname-suffix
│ │ │ ├── whitelist-url
│ │ │ ├── whitelist-url-prefix
│ │ │ ├── whitelist-url-regex
│ │ │ └── whitelist-url-suffix
│ │ ├── connector-files
│ │ ├── connector-web
│ │ ├── enhancer-rdf
│ │ ├── etl
│ │ ├── facets
│ │ ├── filemonitoring/
│ │ │ └── files
│ │ ├── ocr/
│ │ │ └── dictionary.txt
│ │ ├── regex/
│ │ │ ├── email.tsv
│ │ │ ├── iban.tsv
│ │ │ └── phone.tsv
│ │ └── task_priorities
│ └── systemd/
│ └── system/
│ ├── opensemanticetl-filemonitoring.service
│ └── opensemanticetl.service
└── src/
└── opensemanticetl/
├── __init__.py
├── clean_title.py
├── enhance_annotations.py
├── enhance_contenttype_group.py
├── enhance_csv.py
├── enhance_detect_language_tika_server.py
├── enhance_entity_linking.py
├── enhance_extract_email.py
├── enhance_extract_hashtags.py
├── enhance_extract_law.py
├── enhance_extract_money.py
├── enhance_extract_phone.py
├── enhance_extract_text_tika_server.py
├── enhance_file_mtime.py
├── enhance_file_size.py
├── enhance_html.py
├── enhance_mapping_id.py
├── enhance_mimetype.py
├── enhance_multilingual.py
├── enhance_ner_spacy.py
├── enhance_ner_stanford.py
├── enhance_ocr.py
├── enhance_path.py
├── enhance_pdf_ocr.py
├── enhance_pdf_page.py
├── enhance_pdf_page_preview.py
├── enhance_pst.py
├── enhance_rdf.py
├── enhance_rdf_annotations_by_http_request.py
├── enhance_regex.py
├── enhance_sentence_segmentation.py
├── enhance_warc.py
├── enhance_xml.py
├── enhance_xmp.py
├── enhance_zip.py
├── etl.py
├── etl_delete.py
├── etl_enrich.py
├── etl_file.py
├── etl_filedirectory.py
├── etl_filemonitoring.py
├── etl_hypothesis.py
├── etl_plugin_core.py
├── etl_rss.py
├── etl_sitemap.py
├── etl_sparql.py
├── etl_twitter_scraper.py
├── etl_web.py
├── etl_web_crawl.py
├── export_elasticsearch.py
├── export_json.py
├── export_neo4j.py
├── export_print.py
├── export_queue_files.py
├── export_solr.py
├── filter_blacklist.py
├── filter_file_not_modified.py
├── move_indexed_file.py
├── requirements.txt
├── tasks.py
├── test_enhance_detect_language_tika_server.py
├── test_enhance_extract_email.py
├── test_enhance_extract_law.py
├── test_enhance_extract_money.py
├── test_enhance_extract_text_tika_server.py
├── test_enhance_mapping_id.py
├── test_enhance_ner_spacy.py
├── test_enhance_path.py
├── test_enhance_pdf_ocr.py
├── test_enhance_regex.py
├── test_enhance_warc.py
├── test_etl_file.py
├── test_move_indexed_files.py
└── testdata/
├── README.md
├── example.warc
├── run_integrationtests.sh
└── run_tests.sh
SYMBOL INDEX (294 symbols across 71 files)
FILE: src/opensemanticetl/clean_title.py
class clean_title (line 6) | class clean_title(object):
method process (line 8) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_annotations.py
class enhance_annotations (line 10) | class enhance_annotations(etl_plugin_core.Plugin):
method process (line 12) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_contenttype_group.py
class enhance_contenttype_group (line 9) | class enhance_contenttype_group(object):
method process (line 46) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_csv.py
class enhance_csv (line 12) | class enhance_csv(object):
method __init__ (line 14) | def __init__(self, verbose=False):
method read_parameters (line 41) | def read_parameters(self, parameters, data):
method add_csv_parameters_from_meta_settings (line 96) | def add_csv_parameters_from_meta_settings(self, metaserver):
method get_csv_dialect (line 109) | def get_csv_dialect(self):
method set_titles (line 164) | def set_titles(self, row):
method export_row_data_to_index (line 177) | def export_row_data_to_index(self, data, rownumber):
method import_row (line 205) | def import_row(self, row, rownumber, docid):
method enhance_csv (line 259) | def enhance_csv(self, parameters, data):
method process (line 377) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_detect_language_tika_server.py
class enhance_detect_language_tika_server (line 9) | class enhance_detect_language_tika_server(object):
method process (line 11) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_entity_linking.py
function taxonomy2fields (line 17) | def taxonomy2fields(taxonomy, field, separator="\t", subfields_suffix="_...
class enhance_entity_linking (line 48) | class enhance_entity_linking(etl_plugin_core.Plugin):
method process (line 50) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_extract_email.py
class enhance_extract_email (line 11) | class enhance_extract_email(object):
method process (line 12) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_extract_hashtags.py
class enhance_extract_hashtags (line 4) | class enhance_extract_hashtags(object):
method process (line 6) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_extract_law.py
function get_taxonomy (line 16) | def get_taxonomy(law_clause, law_code = None):
class enhance_extract_law (line 32) | class enhance_extract_law(etl_plugin_core.Plugin):
method process (line 34) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_extract_money.py
class enhance_extract_money (line 12) | class enhance_extract_money(etl_plugin_core.Plugin):
method process (line 17) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_extract_phone.py
function normalize_phonenumber (line 12) | def normalize_phonenumber(phone):
class enhance_extract_phone (line 31) | class enhance_extract_phone(object):
method process (line 32) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_extract_text_tika_server.py
function in_parsers (line 8) | def in_parsers(parser, parsers):
class enhance_extract_text_tika_server (line 23) | class enhance_extract_text_tika_server(object):
method process (line 33) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_file_mtime.py
class enhance_file_mtime (line 12) | class enhance_file_mtime(object):
method process (line 13) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_file_size.py
class enhance_file_size (line 11) | class enhance_file_size(object):
method process (line 12) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_html.py
class enhance_html (line 8) | class enhance_html(object):
method elements2data (line 10) | def elements2data(self, element, data, path=None, recursive=True):
method process (line 40) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_mapping_id.py
class enhance_mapping_id (line 9) | class enhance_mapping_id(object):
method process (line 11) | def process(self, parameters=None, data=None):
function mapping (line 25) | def mapping(value, mappings=None):
function mapping_reverse (line 53) | def mapping_reverse(value, mappings=None):
FILE: src/opensemanticetl/enhance_mimetype.py
class enhance_mimetype (line 10) | class enhance_mimetype(object):
method process (line 12) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_multilingual.py
class enhance_multilingual (line 10) | class enhance_multilingual(object):
method process (line 23) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_ner_spacy.py
class enhance_ner_spacy (line 14) | class enhance_ner_spacy(object):
method process (line 16) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_ner_stanford.py
class enhance_ner_stanford (line 11) | class enhance_ner_stanford(object):
method multi_word_entities (line 14) | def multi_word_entities(self, entities):
method process (line 53) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_ocr.py
class enhance_ocr (line 7) | class enhance_ocr(object):
method process (line 20) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_path.py
class enhance_path (line 7) | class enhance_path(object):
method process (line 9) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_pdf_ocr.py
function pdfimages2text (line 15) | def pdfimages2text(filename, lang='eng', verbose=False,
function load_cache (line 71) | def load_cache(filename, cache, lang='eng',
function append_page (line 86) | def append_page(dct, n, page):
class enhance_pdf_ocr (line 98) | class enhance_pdf_ocr(etl_plugin_core.Plugin):
method process (line 116) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_pdf_page.py
class enhance_pdf_page (line 15) | class enhance_pdf_page(etl_plugin_core.Plugin):
method process (line 31) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_pdf_page_preview.py
class enhance_pdf_page_preview (line 11) | class enhance_pdf_page_preview(etl_plugin_core.Plugin):
method process (line 18) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_pst.py
class enhance_pst (line 15) | class enhance_pst(etl_plugin_core.Plugin):
method process (line 20) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_rdf.py
class enhance_rdf (line 18) | class enhance_rdf(etl_plugin_core.Plugin):
method __init__ (line 20) | def __init__(self, verbose=False):
method get_labels (line 32) | def get_labels(self, subject):
method get_values (line 65) | def get_values(self, obj):
method get_preferred_label (line 98) | def get_preferred_label(self, subject, lang='en'):
method etl_graph (line 130) | def etl_graph(self, parameters):
method etl_graph_file (line 268) | def etl_graph_file(self, docid, filename, parameters=None):
method process (line 277) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_rdf_annotations_by_http_request.py
function metaserver_url (line 11) | def metaserver_url(metaserver, docid):
function getmeta_modified (line 27) | def getmeta_modified(metaservers, docid, verbose=False):
function getmeta_rdf_from_server (line 68) | def getmeta_rdf_from_server(metaserver, data, property2facet, docid, ver...
class enhance_rdf_annotations_by_http_request (line 145) | class enhance_rdf_annotations_by_http_request(object):
method process (line 147) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_regex.py
function regex2facet (line 8) | def regex2facet(data, text, regex, group, facet, verbose=False):
function readregexesfromfile (line 33) | def readregexesfromfile(data, text, filename, verbose=False):
class enhance_regex (line 71) | class enhance_regex(object):
method process (line 72) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_sentence_segmentation.py
class enhance_sentence_segmentation (line 14) | class enhance_sentence_segmentation(object):
method process (line 16) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_warc.py
class enhance_warc (line 14) | class enhance_warc(etl_plugin_core.Plugin):
method process (line 20) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_xml.py
class enhance_xml (line 6) | class enhance_xml(object):
method elements2data (line 8) | def elements2data(self, element, data, path="xml"):
method get_xml_filename (line 28) | def get_xml_filename(self, filename, mapping):
method process (line 43) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_xmp.py
function get_xmp_filename (line 10) | def get_xmp_filename(filename):
class enhance_xmp (line 32) | class enhance_xmp(object):
method process (line 33) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/enhance_zip.py
class enhance_zip (line 10) | class enhance_zip(object):
method process (line 12) | def process(self, parameters=None, data=None):
method unzip_and_index_files (line 38) | def unzip_and_index_files(self, zipfilename, parameters=None, verbose=...
FILE: src/opensemanticetl/etl.py
class ETL (line 21) | class ETL(object):
method __init__ (line 23) | def __init__(self, plugins=(), verbose=False):
method set_configdefaults (line 33) | def set_configdefaults(self):
method init_exporter (line 48) | def init_exporter(self):
method read_configfile (line 56) | def read_configfile(self, configfile):
method is_plugin_blacklisted_for_contenttype (line 71) | def is_plugin_blacklisted_for_contenttype(self, plugin, parameters, da...
method process (line 147) | def process(self, parameters=None, data=None):
method commit (line 270) | def commit(self):
function append (line 279) | def append(data, facet, values):
function error_message (line 309) | def error_message(docid, data, plugin, e):
function sort_plugins (line 343) | def sort_plugins(plugins):
FILE: src/opensemanticetl/etl_delete.py
class Delete (line 9) | class Delete(ETL):
method __init__ (line 10) | def __init__(self, verbose=False, quiet=True):
method set_configdefaults (line 28) | def set_configdefaults(self):
method read_configfiles (line 39) | def read_configfiles(self):
method delete (line 52) | def delete(self, uri):
method empty (line 62) | def empty(self):
FILE: src/opensemanticetl/etl_enrich.py
class ETL_Enrich (line 19) | class ETL_Enrich(ETL):
method __init__ (line 21) | def __init__(self, plugins=(), verbose=False):
method getfieldnames_from_plugins (line 68) | def getfieldnames_from_plugins(self):
method enrich_document (line 92) | def enrich_document(self, docid):
method enrich (line 161) | def enrich(self):
method enrich_query (line 182) | def enrich_query(self, query):
FILE: src/opensemanticetl/etl_file.py
class Connector_File (line 10) | class Connector_File(ETL):
method __init__ (line 12) | def __init__(self, verbose=False, quiet=True):
method set_configdefaults (line 22) | def set_configdefaults(self):
method read_configfiles (line 77) | def read_configfiles(self):
method clean_filename (line 98) | def clean_filename(self, filename):
method index (line 108) | def index(self, filename):
method index_dir (line 138) | def index_dir(self, rootDir, followlinks=False):
method index_file (line 174) | def index_file(self, filename, additional_plugins=()):
function key_val (line 215) | def key_val(s):
FILE: src/opensemanticetl/etl_filedirectory.py
class Connector_Filedirectory (line 11) | class Connector_Filedirectory(Connector_File):
method __init__ (line 13) | def __init__(self, verbose=False, quiet=False):
FILE: src/opensemanticetl/etl_filemonitoring.py
class EventHandler (line 17) | class EventHandler(pyinotify.ProcessEvent):
method __init__ (line 19) | def __init__(self):
method process_IN_CLOSE_WRITE (line 24) | def process_IN_CLOSE_WRITE(self, event):
method process_IN_MOVED_TO (line 30) | def process_IN_MOVED_TO(self, event):
method process_IN_DELETE (line 39) | def process_IN_DELETE(self, event):
method move_file (line 50) | def move_file(self, src, dest):
method move_dir (line 58) | def move_dir(self, src, dest):
method index_file (line 66) | def index_file(self, filename):
method delete_file (line 73) | def delete_file(self, filename):
class Filemonitor (line 85) | class Filemonitor(ETL):
method __init__ (line 87) | def __init__(self, verbose=False):
method read_configfiles (line 117) | def read_configfiles(self):
method add_watch (line 125) | def add_watch(self, filename):
method add_watches_from_file (line 131) | def add_watches_from_file(filename):
method watch (line 140) | def watch(self):
FILE: src/opensemanticetl/etl_hypothesis.py
class Connector_Hypothesis (line 19) | class Connector_Hypothesis(ETL):
method etl_document (line 45) | def etl_document(self, uri):
method etl_annotation (line 78) | def etl_annotation(self, annotation):
method etl_annotations (line 119) | def etl_annotations(self, last_update="", user=None, group=None, tag=N...
FILE: src/opensemanticetl/etl_plugin_core.py
class Plugin (line 10) | class Plugin(object):
method filter (line 16) | def filter(self, parameters=None, data=None):
function get_text (line 85) | def get_text(data):
function append (line 166) | def append(data, facet, values):
function get_preflabels (line 196) | def get_preflabels(values):
function get_all_matchtexts (line 214) | def get_all_matchtexts(values):
FILE: src/opensemanticetl/etl_rss.py
class Connector_RSS (line 12) | class Connector_RSS(Connector_Web):
method __init__ (line 14) | def __init__(self, verbose=False, quiet=True):
method read_configfiles (line 21) | def read_configfiles(self):
method index (line 47) | def index(self, uri):
FILE: src/opensemanticetl/etl_sitemap.py
class Connector_Sitemap (line 12) | class Connector_Sitemap(Connector_Web):
method __init__ (line 14) | def __init__(self, verbose=False, quiet=True):
method read_configfiles (line 22) | def read_configfiles(self):
method index (line 45) | def index(self, sitemap):
FILE: src/opensemanticetl/etl_sparql.py
function download_rdf_from_sparql_endpoint (line 17) | def download_rdf_from_sparql_endpoint(endpoint, query):
function sparql_select_to_list_file (line 40) | def sparql_select_to_list_file(endpoint, query, filename=None):
class Connector_SPARQL (line 71) | class Connector_SPARQL(ETL):
method __init__ (line 73) | def __init__(self, verbose=False, quiet=True):
method read_configfiles (line 81) | def read_configfiles(self):
method index_rdf (line 99) | def index_rdf(self, endpoint, query):
method index_select (line 116) | def index_select(self, endpoint, query):
method index (line 139) | def index(self, endpoint, query):
FILE: src/opensemanticetl/etl_twitter_scraper.py
function index_tweet (line 19) | def index_tweet(obj, config):
function index (line 64) | def index(search=None, username=None, Profile_full=False, limit=None, In...
FILE: src/opensemanticetl/etl_web.py
class Connector_Web (line 13) | class Connector_Web(Connector_File):
method __init__ (line 15) | def __init__(self, verbose=False, quiet=True):
method set_configdefaults (line 23) | def set_configdefaults(self):
method read_configfiles (line 61) | def read_configfiles(self):
method read_mtime_from_html (line 80) | def read_mtime_from_html(self, tempfilename):
method index (line 125) | def index(self, uri, last_modified=False, downloaded_file=False, downl...
FILE: src/opensemanticetl/etl_web_crawl.py
class OpenSemanticETL_Spider (line 15) | class OpenSemanticETL_Spider(CrawlSpider):
method parse_item (line 19) | def parse_item(self, response):
function index (line 42) | def index(uri, crawler_type="PATH"):
FILE: src/opensemanticetl/export_elasticsearch.py
class export_elasticsearch (line 6) | class export_elasticsearch(object):
method __init__ (line 8) | def __init__(self, config=None):
method process (line 24) | def process(self, parameters=None, data=None):
method update (line 38) | def update(self, docid=None, data=None, parameters=None):
method get_lastmodified (line 56) | def get_lastmodified(self, docid, parameters=None):
method commit (line 76) | def commit(self):
FILE: src/opensemanticetl/export_json.py
class export_json (line 4) | class export_json(object):
method __init__ (line 6) | def __init__(self, config=None):
method process (line 16) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/export_neo4j.py
class export_neo4j (line 10) | class export_neo4j(object):
method __init__ (line 12) | def __init__(self, config=None):
method process (line 17) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/export_print.py
class export_print (line 4) | class export_print(object):
method __init__ (line 6) | def __init__(self, config=None):
method process (line 16) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/export_queue_files.py
class export_queue_files (line 8) | class export_queue_files(object):
method __init__ (line 10) | def __init__(self, config=None):
method process (line 15) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/export_solr.py
class export_solr (line 16) | class export_solr(object):
method __init__ (line 18) | def __init__(self, config=None):
method process (line 45) | def process(self, parameters=None, data=None):
method update (line 98) | def update(self, data, add=False, fields_set=(), commit=None):
method post (line 117) | def post(self, data=None, docid=None, commit=None):
method tag (line 195) | def tag(self, docid=None, field=None, value=None, data=None):
method update_by_query (line 219) | def update_by_query(self, query, field=None, value=None, data=None, qu...
method get_data (line 294) | def get_data(self, docid, fields):
method commit (line 312) | def commit(self):
method get_lastmodified (line 320) | def get_lastmodified(self, docid):
method delete (line 337) | def delete(self, parameters, docid=None, query=None,):
method append_synonyms (line 362) | def append_synonyms(self, resourceid, synonyms):
function solr_mask (line 370) | def solr_mask(string_to_mask, solr_specialchars='\+-&|!(){}[]^"~*?:/'):
FILE: src/opensemanticetl/filter_blacklist.py
function is_in_lists (line 7) | def is_in_lists(listfiles, value, match=None):
function is_in_list (line 28) | def is_in_list(filename, value, match=None):
class filter_blacklist (line 68) | class filter_blacklist(object):
method process (line 70) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/filter_file_not_modified.py
class filter_file_not_modified (line 14) | class filter_file_not_modified(object):
method __init__ (line 16) | def __init__(self):
method process (line 25) | def process(self, parameters=None, data=None):
FILE: src/opensemanticetl/move_indexed_file.py
function move_files (line 9) | def move_files(host: str, moves: dict, prefix=""):
function move_dir (line 37) | def move_dir(host: str, src: str, dest: str, prefix=""):
function change_path (line 57) | def change_path(prefix: str):
function change_dir (line 71) | def change_dir(prefix: str, src: str, dest: str):
function _change_path (line 100) | def _change_path(data: dict, dest_components: tuple, prefix: str = "") -...
function prepare_payload (line 121) | def prepare_payload(adds, delete_ids):
class DuplicateKey (line 131) | class DuplicateKey(str):
method __hash__ (line 134) | def __hash__(self):
function extract_path (line 138) | def extract_path(data: dict) -> str:
function extract_path_components (line 144) | def extract_path_components(data: dict):
function dict_map (line 158) | def dict_map(mapping: dict):
function append_prefix (line 165) | def append_prefix(prefix: str):
function get_files (line 172) | def get_files(host: str, ids: list) -> list:
function get_files_in_dir (line 180) | def get_files_in_dir(host: str, path: str) -> list:
function get (line 190) | def get(host: str, query: str) -> list:
function get_pages (line 194) | def get_pages(host: str, query: str, limit=50):
function post (line 209) | def post(host: str, data: dict):
FILE: src/opensemanticetl/tasks.py
function delete (line 56) | def delete(uri):
function index_file (line 65) | def index_file(filename, additional_plugins=(), wait=0, commit=False, co...
function index_filedirectory (line 89) | def index_filedirectory(filename, config=None):
function index_web (line 110) | def index_web(uri, wait=0, downloaded_file=False, downloaded_headers=None):
function index_web_crawl (line 126) | def index_web_crawl(uri, crawler_type="PATH"):
function index_sitemap (line 138) | def index_sitemap(uri):
function index_rss (line 154) | def index_rss(uri):
function enrich (line 166) | def enrich(plugins, uri, wait=0):
function index_twitter_scraper (line 194) | def index_twitter_scraper(search=None, username=None, Profile_full=False...
FILE: src/opensemanticetl/test_enhance_detect_language_tika_server.py
class Test_enhance_detect_language_tika_server (line 8) | class Test_enhance_detect_language_tika_server(unittest.TestCase):
method test (line 10) | def test(self):
FILE: src/opensemanticetl/test_enhance_extract_email.py
class Test_enhance_extract_email (line 8) | class Test_enhance_extract_email(unittest.TestCase):
method test (line 10) | def test(self):
FILE: src/opensemanticetl/test_enhance_extract_law.py
class Test_enhance_extract_law (line 8) | class Test_enhance_extract_law(unittest.TestCase):
method test (line 10) | def test(self):
method test_blacklist (line 50) | def test_blacklist(self):
FILE: src/opensemanticetl/test_enhance_extract_money.py
class Test_enhance_extract_money (line 8) | class Test_enhance_extract_money(unittest.TestCase):
method test (line 10) | def test(self):
method test_numerizer (line 58) | def test_numerizer(self):
FILE: src/opensemanticetl/test_enhance_extract_text_tika_server.py
class TestEnhanceExtractTextTikaServer (line 9) | class TestEnhanceExtractTextTikaServer(unittest.TestCase):
method delete_ocr_cache_entries (line 12) | def delete_ocr_cache_entries(self):
method setUp (line 23) | def setUp(self):
method tearDown (line 25) | def tearDown(self):
method test_text_extraction_pdf (line 28) | def test_text_extraction_pdf(self):
method test_text_extraction_pdf_ocr (line 53) | def test_text_extraction_pdf_ocr(self):
method test_text_extraction_pdf_ocr_cache (line 78) | def test_text_extraction_pdf_ocr_cache(self):
method test_ocr_png (line 112) | def test_ocr_png(self):
method test_ocr_jpg (line 128) | def test_ocr_jpg(self):
method test_disabled_ocr_png (line 144) | def test_disabled_ocr_png(self):
FILE: src/opensemanticetl/test_enhance_mapping_id.py
class Test_enhance_mapping_id (line 8) | class Test_enhance_mapping_id(unittest.TestCase):
method test (line 10) | def test(self):
method test_reverse (line 33) | def test_reverse(self):
FILE: src/opensemanticetl/test_enhance_ner_spacy.py
class Test_enhance_ner_spacy (line 15) | class Test_enhance_ner_spacy(unittest.TestCase):
method test_en (line 17) | def test_en(self):
method test_de (line 34) | def test_de(self):
FILE: src/opensemanticetl/test_enhance_path.py
class Test_enhance_path (line 8) | class Test_enhance_path(unittest.TestCase):
method test (line 10) | def test(self):
FILE: src/opensemanticetl/test_enhance_pdf_ocr.py
class Test_enhance_pdf_ocr (line 9) | class Test_enhance_pdf_ocr(unittest.TestCase):
method test_pdf_ocr (line 12) | def test_pdf_ocr(self):
FILE: src/opensemanticetl/test_enhance_regex.py
class Test_enhance_regex (line 8) | class Test_enhance_regex(unittest.TestCase):
method test (line 10) | def test(self):
FILE: src/opensemanticetl/test_enhance_warc.py
class Test_enhance_warc (line 11) | class Test_enhance_warc(unittest.TestCase):
method test_warc (line 14) | def test_warc(self):
FILE: src/opensemanticetl/test_etl_file.py
class Test_ETL_file (line 10) | class Test_ETL_file(unittest.TestCase):
method test_pdf_and_ocr_by_tika (line 12) | def test_pdf_and_ocr_by_tika(self):
method test_ocr_by_plugin_enhance_pdf_ocr (line 57) | def test_ocr_by_plugin_enhance_pdf_ocr(self):
FILE: src/opensemanticetl/test_move_indexed_files.py
class TestMove (line 10) | class TestMove(unittest.TestCase):
method test_move_files (line 11) | def test_move_files(self):
method test_move_dir (line 45) | def test_move_dir(self):
method test_get_pages (line 82) | def test_get_pages(self):
function mock_response (line 109) | def mock_response(data):
Condensed preview — 123 files, each showing path, character count, and a content snippet. Download the .json file or copy it to your clipboard to get the full structured content (394K chars).
[
{
"path": ".github/FUNDING.yml",
"chars": 44,
"preview": "custom: ['https://www.paypal.me/MMandalka']\n"
},
{
"path": ".gitignore",
"chars": 45,
"preview": "__pycache__\n.project\n.pydevproject\n.settings\n"
},
{
"path": ".gitmodules",
"chars": 331,
"preview": "[submodule \"src/open-semantic-entity-search-api\"]\n\tpath = src/open-semantic-entity-search-api\n\turl = https://github.com/"
},
{
"path": "DEBIAN/conffiles",
"chars": 1411,
"preview": "/etc/opensemanticsearch/etl\n/etc/opensemanticsearch/filemonitoring/files\n/etc/opensemanticsearch/connector-files\n/etc/op"
},
{
"path": "DEBIAN/control",
"chars": 909,
"preview": "Package: open-semantic-etl\nVersion: 21.10.18\nSection: misc\nPriority: optional\nArchitecture: all\nDepends: tika-server(>=0"
},
{
"path": "DEBIAN/postinst",
"chars": 753,
"preview": "#!/bin/sh\n\nadduser --system --disabled-password opensemanticetl\ngroupadd -r tesseract_cache\nusermod -a -G tesseract_cach"
},
{
"path": "DEBIAN/prerm",
"chars": 182,
"preview": "#!/bin/sh\n\nsystemctl disable opensemanticetl-filemonitoring\n\nsystemctl stop opensemanticetl-filemonitoring\n\nsystemctl di"
},
{
"path": "Dockerfile",
"chars": 1801,
"preview": "ARG FROM=debian:bullseye\nFROM ${FROM}\n\nENV DEBIAN_FRONTEND=noninteractive\nENV CRYPTOGRAPHY_DONT_BUILD_RUST=1\n\nRUN apt-ge"
},
{
"path": "LICENSE",
"chars": 35142,
"preview": " GNU GENERAL PUBLIC LICENSE\n Version 3, 29 June 2007\n\n Copyright (C) 2007 Free "
},
{
"path": "build-deb",
"chars": 1351,
"preview": "#!/bin/sh\n\nVERSION=`date +%y.%m.%d`\nPACKAGE=open-semantic-etl_${VERSION}.deb\nBUILDDIR=/tmp/open-semantic-etl-$$.deb\n\n\n#\n"
},
{
"path": "docker-compose.test.yml",
"chars": 92,
"preview": "sut:\n build: .\n command: /usr/lib/python3/dist-packages/opensemanticetl/test/run_tests.sh\n"
},
{
"path": "docker-compose.ubuntu.test.yml",
"chars": 175,
"preview": "version: '3'\nservices:\n sut:\n build:\n context: .\n args:\n FROM: ubuntu:focal\n command: /usr/lib/p"
},
{
"path": "docker-entrypoint.sh",
"chars": 274,
"preview": "#! /bin/sh\n\n# docker-entrypoint for opensemanticsearch/open-semantic-etl\n\n# wait for the apps container to finish initia"
},
{
"path": "etc/opensemanticsearch/blacklist/blacklist-url",
"chars": 20,
"preview": "# Blacklist of URLs\n"
},
{
"path": "etc/opensemanticsearch/blacklist/blacklist-url-prefix",
"chars": 50,
"preview": "# Blacklist of URL Prefixes like domains or paths\n"
},
{
"path": "etc/opensemanticsearch/blacklist/blacklist-url-regex",
"chars": 67,
"preview": "# Blacklist URLs with text patterns by regular expressions (regex)\n"
},
{
"path": "etc/opensemanticsearch/blacklist/blacklist-url-suffix",
"chars": 60,
"preview": "# Blacklist of URL Suffixes like file endings\n.css\n.CSS\n.Css"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_extract_law/blacklist-lawcode-if-no-clause",
"chars": 722,
"preview": "# Preferred labels of Law codes will be only added to facet \"Law code\",\n# if the following configured (alternate) labels"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype",
"chars": 28,
"preview": "# Blacklist of contenttypes\n"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-prefix",
"chars": 834,
"preview": "# Blacklist of contenttype prefixes\n\n# Open Office / Libreoffice / MS Office\n# Open document format and MS office open x"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-regex",
"chars": 75,
"preview": "# Blacklist contenttypes with text patterns by regular expressions (regex)\n"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/blacklist-contenttype-suffix",
"chars": 36,
"preview": "# Blacklist of contenttype suffixes\n"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype",
"chars": 28,
"preview": "# Whitelist of contenttypes\n"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-prefix",
"chars": 36,
"preview": "# Whitelist of contenttype prefixes\n"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-regex",
"chars": 75,
"preview": "# Whitelist contenttypes with text patterns by regular expressions (regex)\n"
},
{
"path": "etc/opensemanticsearch/blacklist/enhance_zip/whitelist-contenttype-suffix",
"chars": 36,
"preview": "# Whitelist of contenttype suffixes\n"
},
{
"path": "etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname",
"chars": 3461,
"preview": "language_s\ncontent_type_ss\ncontent_type_group_ss\nAEB Bracket Value_ss\nAE Setting_ss\nAF Area Height_ss\nAF Area Width_ss\nA"
},
{
"path": "etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-prefix",
"chars": 176,
"preview": "etl_\nX-TIKA\nAF Point\nChroma \nCompression \nComponent \nDate/Time\nMeasured EV \nPrimary AF Point \nSelf Timer \nUnknown Camera"
},
{
"path": "etc/opensemanticsearch/blacklist/textanalysis/blacklist-fieldname-suffix",
"chars": 73,
"preview": "_i\n_is\n_l\n_ls\n_b\n_bs\n_f\n_fs\n_d\n_ds\n_f\n_fs\n_dt\n_dts\n_uri_ss\n_matchtext_ss\n"
},
{
"path": "etc/opensemanticsearch/blacklist/whitelist-url",
"chars": 20,
"preview": "# Whitelist of URLs\n"
},
{
"path": "etc/opensemanticsearch/blacklist/whitelist-url-prefix",
"chars": 50,
"preview": "# Whitelist of URL Prefixes like domains or paths\n"
},
{
"path": "etc/opensemanticsearch/blacklist/whitelist-url-regex",
"chars": 67,
"preview": "# Whitelist URLs with text patterns by regular expressions (regex)\n"
},
{
"path": "etc/opensemanticsearch/blacklist/whitelist-url-suffix",
"chars": 46,
"preview": "# Whitelist of URL Suffixes like file endings\n"
},
{
"path": "etc/opensemanticsearch/connector-files",
"chars": 1661,
"preview": "# -*- coding: utf-8 -*-\n\n# Config for opensemanticsearch-index-file\n\n\n# print Debug output\n#config['verbose'] = True\n\n\n#"
},
{
"path": "etc/opensemanticsearch/connector-web",
"chars": 1088,
"preview": "# -*- coding: utf-8 -*-\n\n#\n# Config for opensemanticsearch-index-web-crawl\n#\n\n#\n# common file extensions that are not fo"
},
{
"path": "etc/opensemanticsearch/enhancer-rdf",
"chars": 1448,
"preview": "# -*- coding: utf-8 -*-\n\n# Config for RDF metadata server\n\n# URL of the meta data server (RDF)\n# if set to False don't u"
},
{
"path": "etc/opensemanticsearch/etl",
"chars": 8090,
"preview": "# -*- coding: utf-8 -*-\n\n#\n# ETL config for connector(s)\n#\n\n# print debug messages\n#config['verbose'] = True\n\n\n#\n# Langu"
},
{
"path": "etc/opensemanticsearch/facets",
"chars": 3248,
"preview": "# Warning: Do not edit here!\n\n# This config file will be overwritten\n# by web admin user interface after config changes\n"
},
{
"path": "etc/opensemanticsearch/filemonitoring/files",
"chars": 0,
"preview": ""
},
{
"path": "etc/opensemanticsearch/ocr/dictionary.txt",
"chars": 0,
"preview": ""
},
{
"path": "etc/opensemanticsearch/regex/email.tsv",
"chars": 27,
"preview": "[\\w\\.-]+@[\\w\\.-]+\temail_ss\n"
},
{
"path": "etc/opensemanticsearch/regex/iban.tsv",
"chars": 98,
"preview": "\\b[a-zA-Z]{2}(?: ?)[0-9]{2}(?: ?)[a-zA-Z0-9]{4}(?: ?)[0-9]{7}(?: ?)([a-zA-Z0-9]?){0,16}\\b\tiban_ss\n"
},
{
"path": "etc/opensemanticsearch/regex/phone.tsv",
"chars": 44,
"preview": "[\\+\\(]?[1-9][0-9 .\\-\\(\\)]{8,}[0-9]\tphone_ss\n"
},
{
"path": "etc/opensemanticsearch/task_priorities",
"chars": 1417,
"preview": "# Priorities of document processing in task queue\n\n# The higher the additional priority is, the earlier the document wil"
},
{
"path": "etc/systemd/system/opensemanticetl-filemonitoring.service",
"chars": 279,
"preview": "[Unit]\nDescription=Open Semantic ETL filemonitoring\nAfter=network.target\n\n[Service]\nType=simple\nUser=opensemanticetl\nExe"
},
{
"path": "etc/systemd/system/opensemanticetl.service",
"chars": 215,
"preview": "[Unit]\nDescription=Open Semantic ETL\nAfter=network.target\n\n[Service]\nType=simple\nUser=opensemanticetl\nEnvironment=OMP_TH"
},
{
"path": "src/opensemanticetl/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "src/opensemanticetl/clean_title.py",
"chars": 1498,
"preview": "import sys\n\n# Replace empty title with useful info from other fields for better usability\n\n\nclass clean_title(object):\n\n"
},
{
"path": "src/opensemanticetl/enhance_annotations.py",
"chars": 1234,
"preview": "import os\nimport requests\nfrom requests.adapters import HTTPAdapter\nfrom requests.packages.urllib3.util.retry import Ret"
},
{
"path": "src/opensemanticetl/enhance_contenttype_group.py",
"chars": 3047,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\n#\n# Map/aggregate content type to content type group\n#\n\n\nclass enhance_conten"
},
{
"path": "src/opensemanticetl/enhance_csv.py",
"chars": 10396,
"preview": "import sys\nimport os\nimport csv\nimport urllib.request\nfrom etl import ETL\n\n\n# import each row of CSV file to index\n# wri"
},
{
"path": "src/opensemanticetl/enhance_detect_language_tika_server.py",
"chars": 2287,
"preview": "import os\nimport sys\nimport time\nimport requests\n\n# Extract text from filename\n\n\nclass enhance_detect_language_tika_serv"
},
{
"path": "src/opensemanticetl/enhance_entity_linking.py",
"chars": 9118,
"preview": "#\n# Named Entity Extraction by Open Semantic Entity Search API dictionary\n#\n\nimport requests\nimport sys\nimport time\n\nfro"
},
{
"path": "src/opensemanticetl/enhance_extract_email.py",
"chars": 1663,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n#\n# extract email addresses\n#\n\nclass enhanc"
},
{
"path": "src/opensemanticetl/enhance_extract_hashtags.py",
"chars": 553,
"preview": "import etl_plugin_core\n\n# Extract text from filename\nclass enhance_extract_hashtags(object):\n\n def process(self, para"
},
{
"path": "src/opensemanticetl/enhance_extract_law.py",
"chars": 5015,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n\n#\n# get taxonomy for aggregated facets / f"
},
{
"path": "src/opensemanticetl/enhance_extract_money.py",
"chars": 1850,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\nfrom numerizer import numerize\n\n#\n# extract "
},
{
"path": "src/opensemanticetl/enhance_extract_phone.py",
"chars": 1681,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n#\n# normalize phone number (remove all non-"
},
{
"path": "src/opensemanticetl/enhance_extract_text_tika_server.py",
"chars": 10587,
"preview": "import os\nimport tempfile\nimport sys\nimport time\nimport requests\n\n\ndef in_parsers(parser, parsers):\n\n for value in pa"
},
{
"path": "src/opensemanticetl/enhance_file_mtime.py",
"chars": 905,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport os.path\nimport datetime\n\n#\n# Add file modification time\n#\n\n\nclass enha"
},
{
"path": "src/opensemanticetl/enhance_file_size.py",
"chars": 667,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport os.path\n\n#\n# add file size\n#\n\n\nclass enhance_file_size(object):\n de"
},
{
"path": "src/opensemanticetl/enhance_html.py",
"chars": 2375,
"preview": "#\n# Extracts text within configured HTML tags / XML tags\n#\n\nfrom lxml import etree\n\n\nclass enhance_html(object):\n\n de"
},
{
"path": "src/opensemanticetl/enhance_mapping_id.py",
"chars": 2077,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\n\n#\n# Map paths or domains\n#\n\nclass enhance_mapping_id(object):\n\n def proce"
},
{
"path": "src/opensemanticetl/enhance_mimetype.py",
"chars": 771,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport magic\n\n\n#\n# Get MimeType (Which kind of file is this?)\n#\nclass enhance"
},
{
"path": "src/opensemanticetl/enhance_multilingual.py",
"chars": 6042,
"preview": "#\n# Multilinguality\n#\n# Copy content language specific dynamic fields for language specific analysis like stemming, gram"
},
{
"path": "src/opensemanticetl/enhance_ner_spacy.py",
"chars": 5048,
"preview": "import etl\nimport requests\nimport json\nimport os\nimport sys\nimport time\n\n#\n# SpaCy Named Entity Recognizer (NER)\n#\n\n# Ap"
},
{
"path": "src/opensemanticetl/enhance_ner_stanford.py",
"chars": 5149,
"preview": "import etl\nfrom nltk.tag.stanford import StanfordNERTagger\n\n\n#\n# Stanford Named Entitiy Recognizer (NER)\n#\n\n# Appends cl"
},
{
"path": "src/opensemanticetl/enhance_ocr.py",
"chars": 1561,
"preview": "from tesseract_cache import tesseract_cache\n\n\n#\n# If image add ocr text\n#\nclass enhance_ocr(object):\n\n # how to find "
},
{
"path": "src/opensemanticetl/enhance_path.py",
"chars": 2155,
"preview": "import os.path\n\n#\n# Build and add path facets from filename\n#\n\nclass enhance_path(object):\n\n def process(self, parame"
},
{
"path": "src/opensemanticetl/enhance_pdf_ocr.py",
"chars": 6098,
"preview": "import os.path\nimport sys\nimport subprocess\nimport hashlib\nimport tempfile\nimport json\n\nimport etl_plugin_core\nfrom tess"
},
{
"path": "src/opensemanticetl/enhance_pdf_page.py",
"chars": 5157,
"preview": "import os\nimport sys\nimport subprocess\nimport tempfile\nimport hashlib\n\nimport etl_plugin_core\nfrom etl import ETL\n\n#\n# b"
},
{
"path": "src/opensemanticetl/enhance_pdf_page_preview.py",
"chars": 2118,
"preview": "import sys\nimport subprocess\nfrom pathlib import Path\nimport hashlib\n\nimport etl_plugin_core\n\n\n# generate single page PD"
},
{
"path": "src/opensemanticetl/enhance_pst.py",
"chars": 4558,
"preview": "import sys\nimport hashlib\nimport tempfile\nimport os\nimport shutil\nimport subprocess\n\nimport etl_plugin_core\nfrom etl_fil"
},
{
"path": "src/opensemanticetl/enhance_rdf.py",
"chars": 10925,
"preview": "import sys\nimport logging\nimport rdflib\n\nimport etl_plugin_core\n\n# define used ontologies / standards / properties\nskos "
},
{
"path": "src/opensemanticetl/enhance_rdf_annotations_by_http_request.py",
"chars": 5296,
"preview": "import os\nimport sys\nimport hashlib\nimport urllib\nimport rdflib\nfrom rdflib import URIRef\n\n# Do templating of metaserver"
},
{
"path": "src/opensemanticetl/enhance_regex.py",
"chars": 2834,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport re\nimport etl_plugin_core\n\n\ndef regex2facet(data, text, regex, group,"
},
{
"path": "src/opensemanticetl/enhance_sentence_segmentation.py",
"chars": 4286,
"preview": "import json\nimport os\nimport requests\nimport sys\nimport time\n\nfrom etl import ETL\n\n#\n# split text to sentences\n#\n\n\nclass"
},
{
"path": "src/opensemanticetl/enhance_warc.py",
"chars": 4001,
"preview": "import hashlib\nimport tempfile\nimport os\nimport sys\nimport shutil\nimport time\n\nfrom warcio.archiveiterator import Archiv"
},
{
"path": "src/opensemanticetl/enhance_xml.py",
"chars": 2394,
"preview": "import xml.etree.ElementTree as ElementTree\nimport os.path\nimport sys\n\n\nclass enhance_xml(object):\n\n def elements2dat"
},
{
"path": "src/opensemanticetl/enhance_xmp.py",
"chars": 4568,
"preview": "import xml.etree.ElementTree as ElementTree\nimport os.path\nimport sys\n\n\n#\n# is there a xmp sidecar file?\n#\n\ndef get_xmp_"
},
{
"path": "src/opensemanticetl/enhance_zip.py",
"chars": 4121,
"preview": "import zipfile\nimport sys\nimport hashlib\nimport tempfile\nimport os\nimport shutil\nfrom etl_file import Connector_File\n\n\nc"
},
{
"path": "src/opensemanticetl/etl.py",
"chars": 13404,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport datetime\nimport importlib\nimport os\nimport sys\n\nimport filter_blackli"
},
{
"path": "src/opensemanticetl/etl_delete.py",
"chars": 3242,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport importlib\n\nfrom etl import ETL\nimport enhance_mapping_id\n\nclass Delet"
},
{
"path": "src/opensemanticetl/etl_enrich.py",
"chars": 14145,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nfrom etl import ETL\nimport pysolr\nimport export_solr\nimport importlib\nimport"
},
{
"path": "src/opensemanticetl/etl_file.py",
"chars": 10351,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport os.path\nimport sys\n\nfrom etl import ETL\n\n\nclass Connector_File(ETL):\n"
},
{
"path": "src/opensemanticetl/etl_filedirectory.py",
"chars": 2183,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nfrom etl_file import Connector_File\n\n#\n# Parallel processing of files by add"
},
{
"path": "src/opensemanticetl/etl_filemonitoring.py",
"chars": 5079,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nfrom argparse import ArgumentParser\n\nimport pyinotify\n\nfrom tasks import ind"
},
{
"path": "src/opensemanticetl/etl_hypothesis.py",
"chars": 7568,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\n#\n# Import annotations from Hypothesis - https://hypothes.is\n#\n\nimport reque"
},
{
"path": "src/opensemanticetl/etl_plugin_core.py",
"chars": 7081,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport itertools\n\n#\n# Core functions used by multiple plugins, so they can b"
},
{
"path": "src/opensemanticetl/etl_rss.py",
"chars": 4400,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport feedparser\nimport sys\n\nfrom etl_web import Connector_Web\n\nimport expo"
},
{
"path": "src/opensemanticetl/etl_sitemap.py",
"chars": 5456,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport sys\nimport urllib.request\nimport xml.etree.ElementTree as ElementTree"
},
{
"path": "src/opensemanticetl/etl_sparql.py",
"chars": 5293,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport os\nimport tempfile\n\nfrom etl import ETL\nfrom enhance_rdf import enhan"
},
{
"path": "src/opensemanticetl/etl_twitter_scraper.py",
"chars": 3184,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport twint\nimport sys\nfrom etl import ETL\nfrom tasks import index_web\n\nmod"
},
{
"path": "src/opensemanticetl/etl_web.py",
"chars": 8449,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport time\nimport urllib.request\nimport os\nfrom lxml import etree\nfrom date"
},
{
"path": "src/opensemanticetl/etl_web_crawl.py",
"chars": 3671,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport tempfile\nimport re\n\nfrom scrapy.crawler import CrawlerProcess\n\nfrom s"
},
{
"path": "src/opensemanticetl/export_elasticsearch.py",
"chars": 2035,
"preview": "from elasticsearch import Elasticsearch\n\n\n# Connect to Elastic Search\n\nclass export_elasticsearch(object):\n\n def __in"
},
{
"path": "src/opensemanticetl/export_json.py",
"chars": 713,
"preview": "import json\n\n\nclass export_json(object):\n\n def __init__(self, config=None):\n if config is None:\n co"
},
{
"path": "src/opensemanticetl/export_neo4j.py",
"chars": 3591,
"preview": "import os\n\nfrom py2neo import Graph, Node, Relationship\n\n#\n# Export entities and connections to neo4j\n#\n\n\nclass export_n"
},
{
"path": "src/opensemanticetl/export_print.py",
"chars": 429,
"preview": "import pprint\n\n\nclass export_print(object):\n\n def __init__(self, config=None):\n if config is None:\n "
},
{
"path": "src/opensemanticetl/export_queue_files.py",
"chars": 1851,
"preview": "#\n# Write filename to Celery queue for batching and parallel processing\n#\n\nfrom tasks import index_file\n\n\nclass export_q"
},
{
"path": "src/opensemanticetl/export_solr.py",
"chars": 11105,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport os\nimport json\nimport requests\nimport sys\nimport time\n\nimport urllib."
},
{
"path": "src/opensemanticetl/filter_blacklist.py",
"chars": 3620,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport re\n\n\ndef is_in_lists(listfiles, value, match=None):\n\n result = Fals"
},
{
"path": "src/opensemanticetl/filter_file_not_modified.py",
"chars": 12145,
"preview": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport os\nimport datetime\nimport sys\nimport importlib\n\n#\n# do not index (set "
},
{
"path": "src/opensemanticetl/move_indexed_file.py",
"chars": 7041,
"preview": "#!/usr/bin/env python3\n\nimport urllib.request\nimport urllib.parse\nimport json\nfrom itertools import starmap\n\n\ndef move_f"
},
{
"path": "src/opensemanticetl/requirements.txt",
"chars": 135,
"preview": "celery\nfeedparser\nlxml\nnumerizer\npy2neo\npycurl\npyinotify\npysolr\npython-dateutil\nrequests\nrdflib\nscrapy\nSPARQLWrapper\ntik"
},
{
"path": "src/opensemanticetl/tasks.py",
"chars": 5312,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\n#\n# Queue tasks for batch processing and parallel processing\n#\n\nimport os\nim"
},
{
"path": "src/opensemanticetl/test_enhance_detect_language_tika_server.py",
"chars": 709,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_detect_language_tika_server\n\nclass Test_enha"
},
{
"path": "src/opensemanticetl/test_enhance_extract_email.py",
"chars": 1212,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_extract_email\n\nclass Test_enhance_extract_em"
},
{
"path": "src/opensemanticetl/test_enhance_extract_law.py",
"chars": 2519,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nfrom etl import ETL\n\nclass Test_enhance_extract_law(unittes"
},
{
"path": "src/opensemanticetl/test_enhance_extract_money.py",
"chars": 2772,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nfrom etl import ETL\n\nclass Test_enhance_extract_money(unitt"
},
{
"path": "src/opensemanticetl/test_enhance_extract_text_tika_server.py",
"chars": 7301,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nimport enhance_extract_text_tika_server\n\nclass Te"
},
{
"path": "src/opensemanticetl/test_enhance_mapping_id.py",
"chars": 1910,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_mapping_id\n\nclass Test_enhance_mapping_id(un"
},
{
"path": "src/opensemanticetl/test_enhance_ner_spacy.py",
"chars": 1435,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_ner_spacy\n\nconfig = {\n 'spacy_ner_classif"
},
{
"path": "src/opensemanticetl/test_enhance_path.py",
"chars": 943,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_path\n\nclass Test_enhance_path(unittest.TestC"
},
{
"path": "src/opensemanticetl/test_enhance_pdf_ocr.py",
"chars": 884,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nimport enhance_pdf_ocr\n\nclass Test_enhance_pdf_oc"
},
{
"path": "src/opensemanticetl/test_enhance_regex.py",
"chars": 849,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\n\nimport enhance_regex\n\nclass Test_enhance_regex(unittest.Tes"
},
{
"path": "src/opensemanticetl/test_enhance_warc.py",
"chars": 1418,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nfrom etl_file import Connector_File\nfrom etl_dele"
},
{
"path": "src/opensemanticetl/test_etl_file.py",
"chars": 4119,
"preview": "#!/usr/bin/python3\n# -*- coding: utf-8 -*-\n\nimport unittest\nimport os\n\nfrom etl_file import Connector_File\nfrom etl_dele"
},
{
"path": "src/opensemanticetl/test_move_indexed_files.py",
"chars": 4504,
"preview": "import unittest\nfrom unittest import mock\n\nimport json\nimport itertools\n\nimport move_indexed_file\n\n\nclass TestMove(unitt"
},
{
"path": "src/opensemanticetl/testdata/README.md",
"chars": 1490,
"preview": "Automated tests by unittest\n===========================\n\nAutomated tests are implemented using the Python library unitte"
},
{
"path": "src/opensemanticetl/testdata/run_integrationtests.sh",
"chars": 171,
"preview": "#!/bin/sh\n\npython3 -m unittest discover -s /usr/lib/python3/dist-packages/entity_linking/\n\npython3 -m unittest discover "
},
{
"path": "src/opensemanticetl/testdata/run_tests.sh",
"chars": 160,
"preview": "#!/bin/sh\n\ncd /usr/lib/python3/dist-packages/opensemanticetl\n\npython3 -m unittest \\\n test_enhance_extract_email \\\n test_"
}
]
// ... and 1 more files (download for full content)
About this extraction
This page contains the full source code of the opensemanticsearch/open-semantic-etl GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 123 files (363.3 KB), approximately 84.6k tokens, and a symbol index with 294 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub repo-to-text converter for AI. Built by Nikandr Surkov.