Repository: dalab/end2end_neural_el
Branch: master
Commit: 1464bcf4cd2b
Files: 169
Total size: 8.6 MB
Directory structure:
gitextract_uokgx3gu/
├── .gitignore
├── Examples _ End-to-End Neural Entity Linking.ipynb
├── LICENSE
├── code/
│ ├── __init__.py
│ ├── evaluation/
│ │ ├── .swm
│ │ ├── .swn
│ │ ├── .swo
│ │ ├── metrics.py
│ │ ├── metrics_old.py
│ │ ├── print_predictions (copy).py
│ │ ├── print_predictions.py
│ │ └── summarize_all_experiments.py
│ ├── gerbil/
│ │ ├── build_entity_universe.py
│ │ ├── gerbil_recall_calculation.py
│ │ ├── nn_processing.py
│ │ ├── nn_processing_correct.py
│ │ └── server.py
│ ├── model/
│ │ ├── base_model.py
│ │ ├── config.py
│ │ ├── ed_model_original.py
│ │ ├── ensemble_eval.py
│ │ ├── evaluate.py
│ │ ├── model.py
│ │ ├── model_ablations.py
│ │ ├── reader.py
│ │ ├── train.py
│ │ └── util.py
│ ├── preprocessing/
│ │ ├── __init__.py
│ │ ├── aida_insight.py
│ │ ├── bridge_code_lua/
│ │ │ ├── ent_vecs_from_txt_to_npy.py
│ │ │ └── ent_vecs_to_txt.lua
│ │ ├── extra.py
│ │ ├── old/
│ │ │ ├── old_code.py
│ │ │ ├── prepro_datasets1_once.py
│ │ │ ├── preprocess1.py
│ │ │ └── preprocessv2.py
│ │ ├── p_e_m.py
│ │ ├── prepro_aida.py
│ │ ├── prepro_aida_tokenize.py
│ │ ├── prepro_gerbil_datasets.py
│ │ ├── prepro_other_datasets.py
│ │ ├── prepro_util.py
│ │ ├── prepro_wikidump.py
│ │ └── util.py
│ └── script
├── create_entity_vectors.md
├── deep-ed/
│ └── deep-ed-master/
│ ├── LICENSE
│ ├── README.md
│ ├── data_gen/
│ │ ├── gen_p_e_m/
│ │ │ ├── gen_p_e_m_from_wiki.lua
│ │ │ ├── gen_p_e_m_from_yago.lua
│ │ │ ├── merge_crosswikis_wiki.lua
│ │ │ └── unicode_map.lua
│ │ ├── gen_test_train_data/
│ │ │ ├── gen_ace_msnbc_aquaint_csv.lua
│ │ │ ├── gen_aida_test.lua
│ │ │ ├── gen_aida_train.lua
│ │ │ └── gen_all.lua
│ │ ├── gen_wiki_data/
│ │ │ ├── gen_ent_wiki_w_repr.lua
│ │ │ └── gen_wiki_hyp_train_data.lua
│ │ ├── indexes/
│ │ │ ├── wiki_disambiguation_pages_index.lua
│ │ │ ├── wiki_redirects_index.lua
│ │ │ └── yago_crosswikis_wiki.lua
│ │ └── parse_wiki_dump/
│ │ └── parse_wiki_dump_tools.lua
│ ├── ed/
│ │ ├── args.lua
│ │ ├── ed.lua
│ │ ├── loss.lua
│ │ ├── minibatch/
│ │ │ ├── build_minibatch.lua
│ │ │ └── data_loader.lua
│ │ ├── models/
│ │ │ ├── SetConstantDiag.lua
│ │ │ ├── linear_layers.lua
│ │ │ ├── model.lua
│ │ │ ├── model_global.lua
│ │ │ └── model_local.lua
│ │ ├── test/
│ │ │ ├── check_coref.lua
│ │ │ ├── coref_persons.lua
│ │ │ ├── ent_freq_stats_test.lua
│ │ │ ├── ent_p_e_m_stats_test.lua
│ │ │ ├── test.lua
│ │ │ └── test_one_loaded_model.lua
│ │ └── train.lua
│ ├── ent_vecs_scores.txt
│ ├── entities/
│ │ ├── ent_name2id_freq/
│ │ │ ├── e_freq_gen.lua
│ │ │ ├── e_freq_index.lua
│ │ │ └── ent_name_id.lua
│ │ ├── learn_e2v/
│ │ │ ├── 4EX_wiki_words.lua
│ │ │ ├── batch_dataset_a.lua
│ │ │ ├── e2v_a.lua
│ │ │ ├── learn_a.lua
│ │ │ ├── minibatch_a.lua
│ │ │ └── model_a.lua
│ │ ├── pretrained_e2v/
│ │ │ ├── check_ents.lua
│ │ │ ├── e2v.lua
│ │ │ └── e2v_txt_reader.lua
│ │ └── relatedness/
│ │ ├── filter_wiki_canonical_words_RLTD.lua
│ │ ├── filter_wiki_hyperlink_contexts_RLTD.lua
│ │ └── relatedness.lua
│ ├── log_train_entity_vecs
│ ├── our_system_annotations.txt
│ ├── utils/
│ │ ├── logger.lua
│ │ ├── optim/
│ │ │ ├── adadelta_mem.lua
│ │ │ ├── adagrad_mem.lua
│ │ │ └── rmsprop_mem.lua
│ │ └── utils.lua
│ └── words/
│ ├── load_w_freq_and_vecs.lua
│ ├── stop_words.lua
│ ├── w2v/
│ │ ├── glove_reader.lua
│ │ ├── w2v.lua
│ │ └── word2vec_reader.lua
│ └── w_freq/
│ ├── w_freq_gen.lua
│ └── w_freq_index.lua
├── gerbil-SpotWrapNifWS4Test/
│ ├── .gitignore
│ ├── Dockerfile
│ ├── LICENSE
│ ├── Makefile
│ ├── README.md
│ ├── curlExample.sh
│ ├── docker-compose.yml
│ ├── example.ttl
│ ├── my_notes/
│ │ ├── messages_format
│ │ ├── python_server_format
│ │ ├── python_server_format_ed
│ │ └── python_server_format_el
│ ├── pom.xml
│ ├── repository/
│ │ └── org/
│ │ ├── aksw/
│ │ │ └── gerbil.nif.transfer/
│ │ │ ├── 1.1.0-SNAPSHOT/
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT.jar
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT.pom
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5
│ │ │ │ ├── gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1
│ │ │ │ ├── maven-metadata-local.xml
│ │ │ │ ├── maven-metadata-local.xml.md5
│ │ │ │ └── maven-metadata-local.xml.sha1
│ │ │ ├── maven-metadata-local.xml
│ │ │ ├── maven-metadata-local.xml.md5
│ │ │ └── maven-metadata-local.xml.sha1
│ │ └── restlet/
│ │ ├── org.restlet/
│ │ │ ├── 2.2.1/
│ │ │ │ ├── org.restlet-2.2.1.jar
│ │ │ │ ├── org.restlet-2.2.1.jar.md5
│ │ │ │ ├── org.restlet-2.2.1.jar.sha1
│ │ │ │ ├── org.restlet-2.2.1.pom
│ │ │ │ ├── org.restlet-2.2.1.pom.md5
│ │ │ │ └── org.restlet-2.2.1.pom.sha1
│ │ │ ├── maven-metadata-local.xml
│ │ │ ├── maven-metadata-local.xml.md5
│ │ │ └── maven-metadata-local.xml.sha1
│ │ └── org.restlet.ext.servlet/
│ │ ├── 2.2.1/
│ │ │ ├── org.restlet.ext.servlet-2.2.1.jar
│ │ │ ├── org.restlet.ext.servlet-2.2.1.jar.md5
│ │ │ ├── org.restlet.ext.servlet-2.2.1.jar.sha1
│ │ │ ├── org.restlet.ext.servlet-2.2.1.pom
│ │ │ ├── org.restlet.ext.servlet-2.2.1.pom.md5
│ │ │ └── org.restlet.ext.servlet-2.2.1.pom.sha1
│ │ ├── maven-metadata-local.xml
│ │ ├── maven-metadata-local.xml.md5
│ │ └── maven-metadata-local.xml.sha1
│ └── src/
│ └── main/
│ ├── java/
│ │ └── org/
│ │ └── aksw/
│ │ └── gerbil/
│ │ └── ws4test/
│ │ ├── EDResource.java
│ │ ├── LocalIntermediateWebserver.java
│ │ ├── MyResource.java
│ │ ├── SpotlightClient.java
│ │ ├── SpotlightResource.java
│ │ ├── TestApplication.java
│ │ └── data_format
│ ├── resources/
│ │ └── log4j.properties
│ └── webapp/
│ └── WEB-INF/
│ └── web.xml
├── readme.md
└── requirements.txt
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
*.aux
*.bbl
*.bcf
*.blg
*.log
*.out
*.run.xml
*.toc
*.synctex.gz
data/
**.idea/
**__pycache__/
*.swp
*.~lock.*ods#
end2end_neural_el_env/
deep-ed/data/
local/
================================================
FILE: Examples _ End-to-End Neural Entity Linking.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Try on your own input\n",
"The following notebook shows how to practice with the code repository with you example text.\n",
"**Pre-requisites**\n",
"- Make sure you have installed and downloaded everything as the [README](https://github.com/dalab/end2end_neural_el#trying-the-system-on-random-user-input-text) mentions on the github page.\n",
"- Once you follow the instructions you will have the server running at https://localhost:5555.\n",
"- To quickly. run the following cell.- "
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Response [200]>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import requests, json\n",
"requests.post(\"http://localhost:5555\") ## if Response is [200] then it means the server is running"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[17, 7, \"Germany\"], [49, 6, \"Angela_Merkel\"], [0, 5, \"Barack_Obama\"]]"
]
}
],
"source": [
"## Check using curl. (it is not nessasary to run the code)\n",
"!curl -X POST --header 'Content-Type: application/json' --header 'Accept: application/json' -d \"{ \\\"text\\\": \\\"Obama will visit Germany and have a meeting with Merkel tomorrow.\\\", \\\"spans\\\": [] }\" 'http://localhost:5555'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Utility Functions"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import pprint\n",
"import pandas as pd\n",
"from IPython.display import Markdown\n",
"\n",
"\n",
"def query(text):\n",
" ## Takes the input string and passes it to the service and gets the reponse back.\n",
" myjson = { \"text\": text, \"spans\": [] }\n",
" r = requests.post(\"http://localhost:5555\", json=myjson)\n",
" return json.loads(r.content)\n",
"\n",
"def printmd(string):\n",
" ## displays the annotated/tagged input text in jupyter's Markdown format\n",
" display(Markdown(string))\n",
" \n",
" \n",
"def format_index_output(text):\n",
" ## main function which sends the input text to the service, gets the response back and formats the output\n",
" ## in a presentable form to evaluate.\n",
" \n",
" ents = query(text)\n",
" ents.sort(key=lambda tup: tup[0], reverse=True) \n",
" for i, ent in enumerate(ents):\n",
" text = text[:ent[0]] + '['+text[ent[0]:ent[0]+ent[1]]+'](https://en.wikipedia.org/wiki/'+ ent[2] +')' + text[ent[0]+ent[1]:]\n",
" \n",
" # Replace $ sign : Quick fix since $ sign is a keyword in jupyter markdowns\n",
" text = text.replace(\"$\",\"\\\\$\")\n",
" \n",
" printmd(text)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"text = [\"Eminem is the best rapper of all time. MMLP was his best album, Eminem and Dre have produced this album\",\n",
" \"Pakistan has one of the best teams in cricket. The pakistani squad for ICC Cricket World Cup has Shahid Afridi and Shoaib Akhtar\",\n",
" \"KIEV: Separatist rebels have not fulfilled conditions like handing back border posts or laying down their weapons, Ukraine’s president said on Monday in a phone call with the leaders of Russia, Germany and France as he pondered whether to extend a ceasefire.The call between President Petro Poroshenko, Russia’s Vladimir Putin, Germany’s Chancellor Angela Merkel and France’s Francois Hollande took place as an expiration deadline neared for Ukraine’s shaky, unilateral ceasefire\"\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/markdown": [
"[Eminem](https://en.wikipedia.org/wiki/Eminem) is the best rapper of all time. [MMLP](https://en.wikipedia.org/wiki/The_Marshall_Mathers_LP) was his best album, [Eminem](https://en.wikipedia.org/wiki/Eminem) and [Dre](https://en.wikipedia.org/wiki/Dr._Dre) have produced this album"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"Pakistan has one of the best teams in cricket. The pakistani squad for [ICC Cricket World Cup](https://en.wikipedia.org/wiki/Cricket_World_Cup) has [Shahid Afridi](https://en.wikipedia.org/wiki/Shahid_Afridi) and Shoaib Akhtar"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/markdown": [
"[KIEV](https://en.wikipedia.org/wiki/Kiev): Separatist rebels have not fulfilled conditions like handing back border posts or laying down their weapons, [Ukraine](https://en.wikipedia.org/wiki/Ukraine)’s president said on Monday in a phone call with the leaders of [Russia](https://en.wikipedia.org/wiki/Russia), [Germany](https://en.wikipedia.org/wiki/Germany) and [France](https://en.wikipedia.org/wiki/France) as he pondered whether to extend a ceasefire.The call between President [Petro Poroshenko](https://en.wikipedia.org/wiki/Petro_Poroshenko), [Russia](https://en.wikipedia.org/wiki/Russia)’s [Vladimir Putin](https://en.wikipedia.org/wiki/Vladimir_Putin), [Germany](https://en.wikipedia.org/wiki/Germany)’s Chancellor [Angela Merkel](https://en.wikipedia.org/wiki/Angela_Merkel) and [France](https://en.wikipedia.org/wiki/France)’s [Francois Hollande](https://en.wikipedia.org/wiki/François_Hollande) took place as an expiration deadline neared for [Ukraine](https://en.wikipedia.org/wiki/Ukraine)’s shaky, unilateral ceasefire"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"for t in text:\n",
" format_index_output(t)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:end2end_nel]",
"language": "python",
"name": "conda-env-end2end_nel-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: code/__init__.py
================================================
================================================
FILE: code/evaluation/metrics.py
================================================
import numpy as np
from collections import defaultdict
from operator import itemgetter
import tensorflow as tf
class Evaluator(object):
def __init__(self, threshold, name):
self.threshold = threshold
self.name = name
self.TP = defaultdict(int) # docid -> counter
self.FP = defaultdict(int) # docid -> counter
self.FN = defaultdict(int) # docid -> counter
self.docs = set() # set with all the docid encountered
self.gm_num = 0
def gm_add(self, gm_in_batch):
self.gm_num += gm_in_batch
def check_tp(self, score, docid):
if score >= self.threshold:
self.docs.add(docid)
self.TP[docid] += 1
return True
return False
def check_fp(self, score, docid):
if score >= self.threshold:
self.docs.add(docid)
self.FP[docid] += 1
return True
return False
def check_fn(self, score, docid):
if score < self.threshold:
self.docs.add(docid)
self.FN[docid] += 1
return True
return False
def _score_computation(self, el_mode):
micro_tp, micro_fp, micro_fn = 0, 0, 0
macro_pr, macro_re = 0, 0
for docid in self.docs:
tp, fp, fn = self.TP[docid], self.FP[docid], self.FN[docid]
micro_tp += tp
micro_fp += fp
micro_fn += fn
doc_precision = tp / (tp + fp + 1e-6)
macro_pr += doc_precision
doc_recall = tp / (tp + fn + 1e-6)
macro_re += doc_recall
if el_mode is False:
assert(self.gm_num == micro_tp + micro_fn)
micro_pr = 100 * micro_tp / (micro_tp + micro_fp + 1e-6)
micro_re = 100 * micro_tp / (micro_tp + micro_fn + 1e-6)
micro_f1 = 2*micro_pr*micro_re / (micro_pr + micro_re + 1e-6)
macro_pr = 100 * macro_pr / len(self.docs)
macro_re = 100 * macro_re / len(self.docs)
macro_f1 = 2*macro_pr*macro_re / (macro_pr + macro_re + 1e-6)
return micro_pr, micro_re, micro_f1, macro_pr, macro_re, macro_f1
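    # Illustrative note (added commentary, not part of the original code): micro scores pool the
    # TP/FP/FN counts over all documents, while macro scores average per-document precision/recall.
    # A minimal sketch, assuming two documents:
    #   doc A: TP=8, FP=2, FN=0  -> P=0.80, R=1.00
    #   doc B: TP=1, FP=0, FN=9  -> P=1.00, R=0.10
    #   micro: P = 100*9/11 ~ 81.8, R = 100*9/18 = 50.0, F1 ~ 62.1
    #   macro: P = 100*(0.80+1.00)/2 = 90.0, R = 100*(1.00+0.10)/2 = 55.0, F1 ~ 68.3
    # The 1e-6 terms above only guard against division by zero and barely change these values.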
def print_log_results(self, tf_writer, eval_cnt, el_mode):
micro_pr, micro_re, micro_f1, macro_pr, macro_re, macro_f1 = self._score_computation(el_mode)
print("micro", "P: %.1f" % micro_pr, "\tR: %.1f" % micro_re, "\tF1: %.1f" % micro_f1)
print("macro", "P: %.1f" % macro_pr, "\tR: %.1f" % macro_re, "\tF1: %.1f" % macro_f1)
if tf_writer is None:
return micro_f1, macro_f1
name = self.name+" macro"
writer_name = "el_" if el_mode else "ed_"
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=macro_f1)])
tf_writer[writer_name+"f1"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=macro_pr)])
tf_writer[writer_name+"pr"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=macro_re)])
tf_writer[writer_name+"re"].add_summary(summary, eval_cnt)
name = self.name+" micro"
writer_name = "el_" if el_mode else "ed_"
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=micro_f1)])
tf_writer[writer_name+"f1"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=micro_pr)])
tf_writer[writer_name+"pr"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=micro_re)])
tf_writer[writer_name+"re"].add_summary(summary, eval_cnt)
return micro_f1, macro_f1
def print_log_results_old(self, tf_writer, eval_cnt, el_mode):
micro_tp, micro_fp, micro_fn = 0, 0, 0
macro_pr, macro_re = 0, 0
try:
valid_macro_prec_cnt = 0
valid_macro_recall_cnt = 0
for docid in self.docs:
tp, fp, fn = self.TP[docid], self.FP[docid], self.FN[docid]
micro_tp += tp
micro_fp += fp
micro_fn += fn
if tp + fp > 0:
doc_precision = tp / (tp + fp)
macro_pr += doc_precision
valid_macro_prec_cnt += 1
if tp + fn > 0:
doc_recall = tp / (tp + fn)
macro_re += doc_recall
valid_macro_recall_cnt += 1
if el_mode is False:
assert(self.gm_num == micro_tp + micro_fn)
micro_pr = 100 * micro_tp / (micro_tp + micro_fp) if (micro_tp + micro_fp) > 0 else 0
micro_re = 100 * micro_tp / (micro_tp + micro_fn) if (micro_tp + micro_fn) > 0 else 0
micro_f1 = 2*micro_pr*micro_re / (micro_pr + micro_re) if (micro_pr + micro_re) > 0 else 0
macro_pr = 100 * macro_pr / valid_macro_prec_cnt if valid_macro_prec_cnt > 0 else 0
macro_re = 100 * macro_re / valid_macro_recall_cnt if valid_macro_recall_cnt > 0 else 0
macro_f1 = 2*macro_pr*macro_re / (macro_pr + macro_re) if (macro_pr + macro_re) > 0 else 0
except ZeroDivisionError:
print("Exception! ZeroDivisionError in print results!\nmicro_tp, micro_fp, micro_fn = ", micro_tp,
micro_fp, micro_fn)
print("micro", "P: %.1f" % micro_pr, "\tR: %.1f" % micro_re, "\tF1: %.1f" % micro_f1)
print("macro", "P: %.1f" % macro_pr, "\tR: %.1f" % macro_re, "\tF1: %.1f" % macro_f1)
if tf_writer is None:
print("len(self.docs)={}\tvalid_macro_prec_cnt={}\tvalid_macro_recall_cnt={}".format(
len(self.docs), valid_macro_prec_cnt, valid_macro_recall_cnt))
return micro_f1, macro_f1
name = self.name+" macro"
writer_name = "el_" if el_mode else "ed_"
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=macro_f1)])
tf_writer[writer_name+"f1"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=macro_pr)])
tf_writer[writer_name+"pr"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=macro_re)])
tf_writer[writer_name+"re"].add_summary(summary, eval_cnt)
name = self.name+" micro"
writer_name = "el_" if el_mode else "ed_"
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=micro_f1)])
tf_writer[writer_name+"f1"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=micro_pr)])
tf_writer[writer_name+"pr"].add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name, simple_value=micro_re)])
tf_writer[writer_name+"re"].add_summary(summary, eval_cnt)
return micro_f1, macro_f1
class StrongMatcher(object):
"""is initialized with the gm_gt_list i.e. a list of tuples
(begin_idx, end_idx, gt) and from the list of tuples it builds a set of tuples
that will help us answer if our prediction matches with a tuple from the
ground truth"""
def __init__(self, b_e_gt_iterator):
self.data = set() # of tuples (begin_idx, end_idx, gt)
for t in b_e_gt_iterator:
self.data.add(t)
def check(self, t):
"""returns True if tuple matches with ground truth else False"""
return True if t in self.data else False
class WeakMatcher(object):
"""is initialized with the gm_gt_list i.e. a list of tuples
(begin_idx, end_idx, gt) and from the list of tuples it builds a data structure
that will help us answer if our prediction matches with a tuple from the
ground truth.
structure used: a dict with key the gt and value a list of tuples
    (begin_idx, end_idx). So I compare the predicted triplet (b, e, ent_id)
with all the ground truth triplets and check
if they overlap (weak matching) and return True or False.
e.g. 4 -> [(5,7), (13,14)] """
def __init__(self, b_e_gt_iterator):
self.data = defaultdict(list)
for b, e, gt in b_e_gt_iterator:
self.data[gt].append((b, e))
def check(self, t):
        # here t comes from filtered_spans[1:], i.e. (begin_idx, end_idx, best_cand_id), but it is named gt in the code
s, e, gt = t
if gt in self.data:
for s2, e2 in self.data[gt]:
if s<=s2 and e<=e2 and s2<e:
return True
elif s>=s2 and e>=e2 and s<e2:
return True
elif s<=s2 and e>=e2:
return True
elif s>=s2 and e<=e2:
return True
return False
class FNStrongMatcher(object):
"""when initialized it takes our algorithms predictions
(score, begin_idx, end_idx, ent_id) list and builds a dictionary.
later we use it to check what score we have given to the ground truth i.e.
gold mention plus the correct entity.
structure used: a dict with key (begin_idx, end_idx, ent_id) --> given_score
by my algorithm"""
def __init__(self, filtered_spans):
self.data = dict()
for score, b, e, ent_id in filtered_spans:
self.data[(b, e, ent_id)] = score
def check(self, t):
"""t are tuples (begin_idx, end_idx, gt) from gm_gt_list. I check
if the ground truth is in my predictions and return the given score."""
return self.data[t] if t in self.data else -10000
class FNWeakMatcher(object):
"""when initialized it takes our algorithms predictions
(score, begin_idx, end_idx, ent_id) list and builds a data structure.
later we use it to check what score we have given to the ground truth i.e.
gold mention plus the correct entity.
    structure used: a dict with key the gt and value a list of tuples
    (begin_idx, end_idx, given_score). So I compare the ground truth triplet (s, e, gt)
    with all the spans that my algorithm has linked to the same entity (gt) and check
    if they overlap (weak matching) and return the highest score.
e.g. 4 -> [(5,7, 0.2), (13,14, 0.3)] """
def __init__(self, filtered_spans):
self.data = defaultdict(list)
for score, b, e, ent_id in filtered_spans:
self.data[ent_id].append((b, e, score))
def check(self, t):
"""t are tuples (begin_idx, end_idx, gt) from gm_gt_list. I check
if the ground truth has overlap with some of my predictions and return
the highest given score."""
s, e, gt = t
best_score = -10000
if gt in self.data:
for s2, e2, score in self.data[gt]:
if s<=s2 and e<=e2 and s2<e:
best_score = max(best_score, score)
elif s>=s2 and e>=e2 and s<e2:
best_score = max(best_score, score)
elif s<=s2 and e>=e2:
best_score = max(best_score, score)
elif s>=s2 and e<=e2:
best_score = max(best_score, score)
return best_score
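# Illustrative note (added commentary, not part of the original code): a minimal sketch of how
# the four matchers above behave, assuming word-level indices and integer entity ids.
#   gold = [(5, 8, 42)]                       # gold mention over words 5..8 linked to entity 42
#   StrongMatcher(gold).check((5, 8, 42))     # True  (exact span and entity)
#   StrongMatcher(gold).check((6, 8, 42))     # False (boundaries differ)
#   WeakMatcher(gold).check((6, 9, 42))       # True  (overlapping span, same entity)
#   WeakMatcher(gold).check((6, 9, 13))       # False (overlap but different entity)
#   preds = [(0.9, 5, 8, 42), (0.1, 9, 12, 7)]  # our (score, begin, end, entity) predictions
#   FNStrongMatcher(preds).check((5, 8, 42))  # 0.9  (score we gave to the exact gold tuple)
#   FNWeakMatcher(preds).check((10, 12, 7))   # 0.1  (highest score of any overlapping span linked to entity 7)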
def _filtered_spans_and_gm_gt_list(b, final_scores, cand_entities_len, cand_entities,
begin_span, end_span, spans_len,
begin_gm, end_gm, ground_truth,
ground_truth_len, words_len):
spans = []
for i in range(spans_len[b]): # candidate span
begin_idx = begin_span[b][i]
end_idx = end_span[b][i]
best_cand_id = -1
best_cand_score = -10000
for j in range(cand_entities_len[b][i]): # how many candidate entities we have for this span
score = final_scores[b][i][j]
if score > best_cand_score:
best_cand_score = score
best_cand_id = cand_entities[b][i][j]
spans.append((best_cand_score, begin_idx, end_idx, best_cand_id))
# now filter this list of spans based on score. from the overlapping ones keep the one
# with the highest score.
spans = sorted(spans, reverse=True) # highest score lowest score
filtered_spans = []
claimed = np.full(words_len[b], False, dtype=bool) # initially all words are free to select
for span in spans:
best_cand_score, begin_idx, end_idx, best_cand_id = span
if not np.any(claimed[begin_idx:end_idx]) and best_cand_id > 0:
# nothing is claimed so take it TODO this > 0 condition is it correct???
claimed[begin_idx:end_idx] = True
filtered_spans.append(span)
gm_gt_list = [(begin_gm[b][i], end_gm[b][i], ground_truth[b][i]) for i in range(ground_truth_len[b])]
return filtered_spans, gm_gt_list
def threshold_calculation(final_scores, cand_entities_len, cand_entities,
begin_span, end_span, spans_len, begin_gm, end_gm, ground_truth,
ground_truth_len, words_len, chunk_id, el_mode):
tp_fp_batch_scores = []
fn_batch_scores = []
if el_mode is False:
begin_gm = begin_span
end_gm = end_span
for b in range(final_scores.shape[0]): # batch
filtered_spans, gm_gt_list = _filtered_spans_and_gm_gt_list(b, final_scores, cand_entities_len, cand_entities,
begin_span, end_span, spans_len, begin_gm, end_gm, ground_truth, ground_truth_len, words_len)
matcher = WeakMatcher(gm_gt_list) if el_mode else StrongMatcher(gm_gt_list)
for t in filtered_spans:
if matcher.check(t[1:]):
tp_fp_batch_scores.append((t[0], 1)) # (score, TP)
else:
tp_fp_batch_scores.append((t[0], 0)) # (score, FP)
# now check for the fn
matcher = FNWeakMatcher(filtered_spans) if el_mode else FNStrongMatcher(filtered_spans)
for t in gm_gt_list:
score = matcher.check(t)
fn_batch_scores.append(score)
return tp_fp_batch_scores, fn_batch_scores
def metrics_calculation(evaluator, final_scores, cand_entities_len, cand_entities,
begin_span, end_span, spans_len,
begin_gm, end_gm, ground_truth,
ground_truth_len, words_len, chunk_id, el_mode):
if el_mode is False:
begin_gm = begin_span
end_gm = end_span
# for each candidate span find which is the cand entity with the highest score
for b in range(final_scores.shape[0]): # batch
filtered_spans, gm_gt_list = _filtered_spans_and_gm_gt_list(b, final_scores, cand_entities_len, cand_entities,
begin_span, end_span, spans_len, begin_gm, end_gm, ground_truth, ground_truth_len, words_len)
matcher = WeakMatcher(gm_gt_list) if el_mode else StrongMatcher(gm_gt_list)
docid = chunk_id[b].split(b"&*", 1)[0] # b'947testa_CRICKET&*0&*0' to b'947testa_CRICKET'
# TODO remove this and the assertion
evaluator.gm_add(len(gm_gt_list))
for t in filtered_spans:
if matcher.check(t[1:]):
evaluator.check_tp(t[0], docid)
else:
evaluator.check_fp(t[0], docid)
# now check for the fn
matcher = FNWeakMatcher(filtered_spans) if el_mode else FNStrongMatcher(filtered_spans)
for t in gm_gt_list:
score = matcher.check(t)
evaluator.check_fn(score, docid)
def metrics_calculation_and_prediction_printing(evaluator, final_scores,
cand_entities_len, cand_entities,
begin_span, end_span, spans_len,
begin_gm, end_gm, ground_truth,
ground_truth_len, words_len, chunk_id,
words, chars, chars_len,
scores_l, global_pairwise_scores, scores_names_l,
el_mode, printPredictions=None):
if el_mode is False:
begin_gm = begin_span
end_gm = end_span
# for each candidate span find which is the cand entity with the highest score
for b in range(final_scores.shape[0]): # batch
spans = []
for i in range(spans_len[b]): # candidate span
begin_idx = begin_span[b][i]
end_idx = end_span[b][i]
best_cand_id = -1
best_cand_score = -10000
best_cand_similarity_score = -10000
best_cand_position = -1
scores_text = "invalid"
for j in range(cand_entities_len[b][i]): # how many candidate entities we have for this span
score = final_scores[b][i][j]
if score > best_cand_score:
best_cand_score = score
best_cand_id = cand_entities[b][i][j]
scores_text = ' '.join([scores_name + "=" + str(score[b][i][j]) for scores_name, score in zip(scores_names_l, scores_l)])
# best_cand_similarity_score = similarity_scores[b][i][j]
best_cand_position = j
span_num = i
spans.append((best_cand_score, begin_idx, end_idx, best_cand_id,
scores_text, best_cand_position, span_num))
# now filter this list of spans based on score. from the overlapping ones keep the one
# with the highest score.
spans = sorted(spans, reverse=True) # highest score lowest score
filtered_spans = []
claimed = np.full(words_len[b], False, dtype=bool) # initially all words are free to select
for span in spans:
best_cand_score, begin_idx, end_idx, best_cand_id = span[:4]
if not np.any(claimed[begin_idx:end_idx]) and best_cand_id > 0:
# nothing is claimed so take it TODO this > 0 condition is it correct???
claimed[begin_idx:end_idx] = True
filtered_spans.append(span)
# now traverse all the filtered spans and compare them with the gold mentions
# for each tuple of filtered_spans check for tp or fp
gm_gt_list = [(begin_gm[b][i], end_gm[b][i], ground_truth[b][i]) for i in range(ground_truth_len[b])]
matcher = WeakMatcher(gm_gt_list) if el_mode else StrongMatcher(gm_gt_list)
docid = chunk_id[b].split(b"&*", 1)[0]
evaluator.gm_add(len(gm_gt_list))
tp_pred = []
fp_pred = []
fn_pred = []
gt_minus_fn_pred = [] # gt_minus_fn_pred + fn_pred create the gm_gt_list
for t in filtered_spans:
if matcher.check(t[1:4]):
if evaluator.check_tp(t[0], docid):
tp_pred.append(t)
else:
if evaluator.check_fp(t[0], docid):
fp_pred.append(t)
# now check for the fn
temp = [t[:4] for t in filtered_spans]
matcher = FNWeakMatcher(temp) if el_mode else FNStrongMatcher(temp)
for gm_num, t in enumerate(gm_gt_list):
score = matcher.check(t)
if evaluator.check_fn(score, docid):
fn_pred.append((gm_num, *t))
else:
gt_minus_fn_pred.append((gm_num, *t))
if printPredictions is not None:
gmask = global_pairwise_scores[0][b] if global_pairwise_scores else None
entity_embeddings = global_pairwise_scores[1][b] if global_pairwise_scores else None
printPredictions.process_sample(str(chunk_id[b]),
tp_pred, fp_pred, fn_pred, gt_minus_fn_pred,
words[b], words_len[b],
chars[b], chars_len[b],
cand_entities[b], cand_entities_len[b],
final_scores[b], filtered_spans,
[score[b] for score in scores_l], scores_names_l,
gmask, entity_embeddings)
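# --- Illustrative usage sketch (added, not part of the original module) ---
# Run this file directly (its imports, including tensorflow, must be installed) to see how
# _filtered_spans_and_gm_gt_list picks, for every candidate span, its best-scoring candidate
# entity and then greedily keeps only non-overlapping spans (highest score first).
# All shapes and ids below are made up for the example.
if __name__ == "__main__":
    final_scores      = np.array([[[0.9, 0.2], [0.5, 0.1]]])   # [batch, span, candidate]
    cand_entities     = np.array([[[3, 4], [5, 6]]])           # candidate entity ids per span
    cand_entities_len = np.array([[2, 2]])
    begin_span        = np.array([[0, 1]])                     # span 0 = words 0..2, span 1 = words 1..3
    end_span          = np.array([[2, 3]])
    spans_len         = np.array([2])
    begin_gm, end_gm  = np.array([[0]]), np.array([[2]])       # one gold mention over words 0..2
    ground_truth      = np.array([[3]])                        # ... linked to entity 3
    ground_truth_len  = np.array([1])
    words_len         = np.array([6])
    filtered, gm_gt = _filtered_spans_and_gm_gt_list(
        0, final_scores, cand_entities_len, cand_entities,
        begin_span, end_span, spans_len, begin_gm, end_gm,
        ground_truth, ground_truth_len, words_len)
    print(filtered)  # only the (0.9, 0, 2, 3) span survives; (0.5, 1, 3, 5) overlaps word 1 which is already claimed
    print(gm_gt)     # one gold tuple: (0, 2, 3)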
================================================
FILE: code/evaluation/metrics_old.py
================================================
import numpy as np
from collections import defaultdict
from operator import itemgetter
import tensorflow as tf
class Evaluator_aux(object):
def __init__(self, threshold, name):
self.threshold = threshold
self.name = name
self.TP = defaultdict(int) # docid -> counter
self.FP = defaultdict(int) # docid -> counter
self.FN = defaultdict(int) # docid -> counter
self.docs = set() # set with all the docid encountered
def check_tp(self, score, docid):
if score >= self.threshold:
self.docs.add(docid)
self.TP[docid] += 1
def check_fp(self, score, docid):
if score >= self.threshold:
self.docs.add(docid)
self.FP[docid] += 1
def check_fn(self, score, docid):
if score < self.threshold:
self.docs.add(docid)
self.FN[docid] += 1
def print_results(self):
micro_tp, micro_fp, micro_fn = 0, 0, 0
macro_pr, macro_re = 0, 0
try:
valid_macro_prec_cnt = 0
valid_macro_recall_cnt = 0
for docid in self.docs:
tp, fp, fn = self.TP[docid], self.FP[docid], self.FN[docid]
micro_tp += tp
micro_fp += fp
micro_fn += fn
if tp + fp > 0:
doc_precision = tp / (tp + fp)
macro_pr += doc_precision
valid_macro_prec_cnt += 1
if tp + fn > 0:
doc_recall = tp / (tp + fn)
macro_re += doc_recall
valid_macro_recall_cnt += 1
micro_pr = micro_tp / (micro_tp + micro_fp) if (micro_tp + micro_fp) > 0 else 0
micro_re = micro_tp / (micro_tp + micro_fn) if (micro_tp + micro_fn) > 0 else 0
micro_f1 = 2*micro_pr*micro_re / (micro_pr + micro_re) if (micro_pr + micro_re) > 0 else 0
macro_pr = macro_pr / valid_macro_prec_cnt if valid_macro_prec_cnt > 0 else 0
macro_re = macro_re / valid_macro_recall_cnt if valid_macro_recall_cnt > 0 else 0
macro_f1 = 2*macro_pr*macro_re / (macro_pr + macro_re) if (macro_pr + macro_re) > 0 else 0
except ZeroDivisionError:
print("Exception! ZeroDivisionError in print results!\nmicro_tp, micro_fp, micro_fn = ", micro_tp,
micro_fp, micro_fn)
print(self.name, "thr", self.threshold)
print("micro", "P:", micro_pr, "\tR:", micro_re, "\tF1:", micro_f1)
print("macro", "P:", macro_pr, "\tR:", macro_re, "\tF1:", macro_f1)
return micro_f1, macro_f1, self.threshold
class Evaluator(object):
def __init__(self, weak_thr=None, strong_thr=None, name=""):
self.weak_evaluators = []
self.strong_evaluators = []
self.name = name
for thr in weak_thr:
self.weak_evaluators.append(Evaluator_aux(thr, "data"))
for thr in strong_thr:
self.strong_evaluators.append(Evaluator_aux(thr, "strong"))
def weak_check_tp(self, score, docid):
#list(map(lambda x: x.check_tp(score, docid), self.weak_evaluators))
for x in self.weak_evaluators:
x.check_tp(score, docid)
def weak_check_fp(self, score, docid):
#map(lambda x: x.check_fp(score, docid), self.weak_evaluators)
for x in self.weak_evaluators:
x.check_fp(score, docid)
def weak_check_fn(self, score, docid):
#map(lambda x: x.check_fn(score, docid), self.weak_evaluators)
for x in self.weak_evaluators:
x.check_fn(score, docid)
def strong_check_tp(self, score, docid):
#map(lambda x: x.check_tp(score, docid), self.strong_evaluators)
for x in self.strong_evaluators:
x.check_tp(score, docid)
def strong_check_fp(self, score, docid):
#map(lambda x: x.check_fp(score, docid), self.strong_evaluators)
for x in self.strong_evaluators:
x.check_fp(score, docid)
def strong_check_fn(self, score, docid):
#map(lambda x: x.check_fn(score, docid), self.strong_evaluators)
for x in self.strong_evaluators:
x.check_fn(score, docid)
def print_log_results(self, writer, eval_cnt):
weak_scores = [x.print_results() for x in self.weak_evaluators]
strong_scores = [x.print_results() for x in self.strong_evaluators]
if writer is not None:
for micro_f1, macro_f1, threshold in weak_scores:
name = self.name+" data " + str(threshold)
summary = tf.Summary(value=[tf.Summary.Value(tag=name+" micro_f1",
simple_value=micro_f1)])
writer.add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name+" macro_f1",
simple_value=macro_f1)])
writer.add_summary(summary, eval_cnt)
for micro_f1, macro_f1, threshold in strong_scores:
name = self.name+" strong " + str(threshold)
summary = tf.Summary(value=[tf.Summary.Value(tag=name+" micro_f1",
simple_value=micro_f1)])
writer.add_summary(summary, eval_cnt)
summary = tf.Summary(value=[tf.Summary.Value(tag=name+" macro_f1",
simple_value=macro_f1)])
writer.add_summary(summary, eval_cnt)
result_list = weak_scores if self.weak_evaluators != [] else strong_scores
return max(result_list, key=itemgetter(0))[0]
class WeakStrongMatching(object):
def __init__(self, b_e_gt_iterator):
self.exact = set() # of tuples (begin_idx, end_idx, gt)
self.weak = defaultdict(list) # a map with key the gt and value a list of tuples
# e.g. 4 -> [(5,7), (13,14)]
        # so in order to check for a weak match I search for my gt in the
        # weak dictionary and, if present, check the overlap with the span
        # one by one
for b, e, gt in b_e_gt_iterator:
self.exact.add((b, e, gt))
self.weak[gt].append((b, e))
def strong_check(self, t):
return True if t in self.exact else False
def weak_check(self, t):
s, e, gt = t
if gt in self.weak:
for s2, e2 in self.weak[gt]:
if s<=s2 and e<=e2 and s2<e:
return True
elif s>=s2 and e>=e2 and s<e2:
return True
elif s<=s2 and e>=e2:
return True
elif s>=s2 and e<=e2:
return True
return False
class FNWeakStrongMatching(object):
def __init__(self, filtered_spans):
self.weak = defaultdict(list) # a map with key the gt and value a list of tuples
# e.g. 4 -> [(5,7, 0.2), (13,14, 0.3)]
        # so in order to check for a weak match I search for my gt in the
        # weak dictionary and, if present, check the overlap with the span
        # one by one
for score, b, e, ent_id in filtered_spans:
self.weak[ent_id].append((b, e, score))
def strong_check(self, t):
s, e, gt = t
best_score = -10000
if gt in self.weak:
for s2, e2, score in self.weak[gt]:
if s==s2 and e==e2:
best_score = max(best_score, score)
return best_score
def weak_check(self, t):
s, e, gt = t
best_score = -10000
if gt in self.weak:
for s2, e2, score in self.weak[gt]:
if s<=s2 and e<=e2 and s2<e:
best_score = max(best_score, score)
elif s>=s2 and e>=e2 and s<e2:
best_score = max(best_score, score)
elif s<=s2 and e>=e2:
best_score = max(best_score, score)
elif s>=s2 and e<=e2:
best_score = max(best_score, score)
return best_score
def validation_scores_calculation(evaluator, final_scores, cand_entities_len, cand_entities,
begin_span, end_span, spans_len,
begin_gm, end_gm, ground_truth,
ground_truth_len, words_len, chunk_id, test_mode):
if test_mode is False:
begin_gm = begin_span
end_gm = end_span
# for each candidate span find which is the cand entity with the highest score
for b in range(final_scores.shape[0]): # batch
spans = []
for i in range(spans_len[b]): # candidate span
begin_idx = begin_span[b][i]
end_idx = end_span[b][i]
best_cand_id = -1
best_cand_score = -10000
for j in range(cand_entities_len[b][i]): # how many candidate entities we have for this span
score = final_scores[b][i][j]
if score > best_cand_score:
best_cand_score = score
best_cand_id = cand_entities[b][i][j]
spans.append((best_cand_score, begin_idx, end_idx, best_cand_id))
# now filter this list of spans based on score. from the overlapping ones keep the one
# with the highest score.
spans = sorted(spans, reverse=True) # highest score lowest score
filtered_spans = []
claimed = np.full(words_len[b], False, dtype=bool) # initially all words are free to select
for span in spans:
best_cand_score, begin_idx, end_idx, best_cand_id = span
if not np.any(claimed[begin_idx:end_idx]) and best_cand_id > 0:
# nothing is claimed so take it TODO this > 0 condition is it correct???
claimed[begin_idx:end_idx] = True
filtered_spans.append(span)
# now traverse all the filtered spans and compare them with the gold mentions
# for each tuple of filtered_spans check for tp or fp
gm_gt_list = [(begin_gm[b][i], end_gm[b][i], ground_truth[b][i]) for i in range(ground_truth_len[b])]
matcher = WeakStrongMatching(gm_gt_list)
# b'947testa_CRICKET&*0&*0' to b'947testa_CRICKET'
docid = chunk_id[b].split(b"&*", 1)[0]
for t in filtered_spans:
if matcher.strong_check(t[1:]):
evaluator.strong_check_tp(t[0], docid)
else:
evaluator.strong_check_fp(t[0], docid)
if matcher.weak_check(t[1:]):
evaluator.weak_check_tp(t[0], docid)
else:
evaluator.weak_check_fp(t[0], docid)
# now check for the fn
matcher = FNWeakStrongMatching(filtered_spans)
for t in gm_gt_list:
score = matcher.strong_check(t)
evaluator.strong_check_fn(score, docid)
score = matcher.weak_check(t)
evaluator.weak_check_fn(score, docid)
def evaluation_scores_calculation(evaluator, final_scores, cand_entities_len, cand_entities,
begin_span, end_span, spans_len,
begin_gm, end_gm, ground_truth,
ground_truth_len, words_len, chunk_id, similarity_scores,
words, chars, chars_len, cand_entities_scores,
test_mode, printPredictions=None):
if test_mode is False:
begin_gm = begin_span
end_gm = end_span
# for each candidate span find which is the cand entity with the highest score
for b in range(final_scores.shape[0]): # batch
spans = []
for i in range(spans_len[b]): # candidate span
begin_idx = begin_span[b][i]
end_idx = end_span[b][i]
best_cand_id = -1
best_cand_score = -10000
best_cand_similarity_score = -10000
best_cand_position = -1
for j in range(cand_entities_len[b][i]): # how many candidate entities we have for this span
score = final_scores[b][i][j]
if score > best_cand_score:
best_cand_score = score
best_cand_id = cand_entities[b][i][j]
best_cand_similarity_score = similarity_scores[b][i][j]
best_cand_position = j
spans.append((best_cand_score, begin_idx, end_idx, best_cand_id,
best_cand_similarity_score, best_cand_position))
# now filter this list of spans based on score. from the overlapping ones keep the one
# with the highest score.
spans = sorted(spans, reverse=True) # highest score lowest score
filtered_spans = []
claimed = np.full(words_len[b], False, dtype=bool) # initially all words are free to select
for span in spans:
best_cand_score, begin_idx, end_idx, best_cand_id, _, _ = span
if not np.any(claimed[begin_idx:end_idx]) and best_cand_id > 0:
# nothing is claimed so take it TODO this > 0 condition is it correct???
claimed[begin_idx:end_idx] = True
filtered_spans.append(span)
# now traverse all the filtered spans and compare them with the gold mentions
# for each tuple of filtered_spans check for tp or fp
gm_gt_list = [(begin_gm[b][i], end_gm[b][i], ground_truth[b][i]) for i in range(ground_truth_len[b])]
matcher = WeakStrongMatching(gm_gt_list)
# b'947testa_CRICKET&*0&*0' to b'947testa_CRICKET'
docid = chunk_id[b].split(b"&*", 1)[0]
tp_pred = []
fp_pred = []
fn_pred = []
thr = printPredictions.thr if printPredictions is not None else 0.2
for t in filtered_spans:
if matcher.strong_check(t[1:-2]):
evaluator.strong_check_tp(t[0], docid)
else:
evaluator.strong_check_fp(t[0], docid)
if matcher.weak_check(t[1:-2]):
evaluator.weak_check_tp(t[0], docid)
if t[0] >= thr:
tp_pred.append(t)
else:
evaluator.weak_check_fp(t[0], docid)
if t[0] >= thr:
fp_pred.append(t)
# now check for the fn
matcher = FNWeakStrongMatching(
[t[:-2] for t in filtered_spans])
for t in gm_gt_list:
score = matcher.strong_check(t)
evaluator.strong_check_fn(score, docid)
score = matcher.weak_check(t)
evaluator.weak_check_fn(score, docid)
if score < thr:
fn_pred.append(t)
if printPredictions is not None:
printPredictions.process_sample(chunk_id[b], gm_gt_list,
tp_pred, fp_pred, fn_pred,
words[b], words_len[b],
chars[b], chars_len[b],
cand_entities[b], cand_entities_scores[b])
================================================
FILE: code/evaluation/print_predictions (copy).py
================================================
from termcolor import colored
import pickle
from preprocessing.util import load_wikiid2nnid, reverse_dict, load_wiki_name_id_map
from collections import defaultdict
import operator
class PrintPredictions(object):
def __init__(self, output_folder, predictions_folder, entity_extension=None):
self.thr = None
self.output_folder = output_folder
self.predictions_folder = predictions_folder
with open(output_folder+"word_char_maps.pickle", 'rb') as handle:
_, self.id2word, _, self.id2char, _, _ = pickle.load(handle)
self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
_, self.wiki_id_name_map = load_wiki_name_id_map()
self.extra_info = ""
def map_entity(self, nnid):
wikiid = self.nnid2wikiid[nnid]
wikiname = self.wiki_id_name_map[wikiid].replace(' ', '_') if wikiid != "<u>" else "<u>"
return "{} {}".format(wikiid, wikiname)
def process_file(self, el_mode, name, opt_thr):
self.thr = opt_thr
self.el_mode = el_mode
filepath = self.predictions_folder + ("el/" if el_mode else "ed/") + name
self.fout = open(filepath, "w")
def file_ended(self):
self.fout.close()
def process_sample(self, chunkid,
tp_pred, fp_pred, fn_pred, gt_minus_fn_pred,
words, words_len, chars, chars_len,
cand_entities, log_cand_entities_scores, cand_entities_len,
final_scores, similarity_scores):
"""words: [None] 1d the words of a sample, words_len: scalar,
chars: [None, None] 2d words, chars of each word, chars_len: [None] for each word
the length in terms of characters.
cand_entities: [None, None] gold_mentions, candidates for each gm,
cand_entitites_len: [None] how many cand ent each gm has."""
reconstructed_words = []
for i in range(words_len):
word = words[i]
if word != 0:
reconstructed_words.append(self.id2word[word])
else: # <wunk>
word_chars = []
for j in range(chars_len[i]):
word_chars.append(self.id2char[chars[i][j]])
reconstructed_words.append(''.join(word_chars))
text_tags = defaultdict(list)
gt_legend = []
for mylist, mycolor in zip([gt_minus_fn_pred, fn_pred], ["green", "red"]):
for i, (gm_num, b, e, gt) in enumerate(mylist, 1):
text_tags[b].append((1, colored("[{}".format(i), mycolor)))
text_tags[e].append((0, colored("]", mycolor)))
gt_text = ""
if self.el_mode is False: # find the position and the score of the ground truth
gt_text = "gt not in candidate entities (recall miss)"
for j in range(cand_entities_len[gm_num]):
if cand_entities[gm_num][j] == gt:
gt_text = "gt_p_e_m_pos={}, gt_logpem_score={}".format(j,
log_cand_entities_scores[gm_num][j])
break
text = colored("{}: {} {}".format(i, self.map_entity(gt), gt_text), mycolor)
gt_legend.append(text)
tp_legend = []
tp_pred = sorted(tp_pred, key=operator.itemgetter(1))
for i, (score, b, e, nnid, sim_score, p_e_m_pos, span_num) in enumerate(tp_pred, 1):
text_tags[b].append((1, colored("[{}".format(i), "blue")))
text_tags[e].append((0, colored("]", "blue")))
text = colored("{}: {}, score={}, sim_score={}, logpem={}, pem_pos={}".format(i,
self.map_entity(nnid), score, sim_score,
log_cand_entities_scores[span_num][p_e_m_pos], p_e_m_pos), "blue")
tp_legend.append(text)
fp_legend = []
fp_pred = sorted(fp_pred, key=operator.itemgetter(1))
if len(fp_pred) > 0:
fpWeakMatcherLogging = FPWeakMatcherLogging(self, fn_pred+gt_minus_fn_pred,
cand_entities, cand_entities_len, log_cand_entities_scores,
final_scores, similarity_scores)
for i, (score, b, e, nnid, sim_score, p_e_m_pos, span_num) in enumerate(fp_pred, 1):
text_tags[b].append((1, colored("[{}".format(i), "magenta")))
text_tags[e].append((0, colored("]", "magenta")))
fp_gt_text = fpWeakMatcherLogging.check(b, e, span_num)
text = colored("{}: {}, score={}, sim_score={}, logpem={}, pem_pos={} {} ".format(i,
self.map_entity(nnid), score, sim_score,
log_cand_entities_scores[span_num][p_e_m_pos], p_e_m_pos,
fp_gt_text), "magenta")
fp_legend.append(text)
final_acc = ["new sample " + chunkid+"\n"]
for i in range(words_len+1):
final_acc.extend([text for _, text in sorted(text_tags[i])])
if i < words_len:
final_acc.append(reconstructed_words[i])
self.fout.write(" ".join(final_acc)+"\n")
self.fout.write("\n".join(gt_legend + tp_legend + fp_legend))
self.fout.write("\n")
class FPWeakMatcherLogging(object):
"""is initialized with the gm_gt_list i.e. a list of tuples
(begin_idx, end_idx, gt) and from the list of tuples it builds a data structure. We already
know that our tuple doesn't match a ground truth. Now we want to find out what exactly happens.
cases: 1)) doesn't overlap with any gm 2)) overlap with one or more gm. In this case for each gm
that it overlaps with find a) which is the gt of this gm, b) final_score, sim_score, p_e_m position
of the gt in my fp tuple.
structure used: just a list of (begin_idx, end_idx, gt) tuples.
This one is used only during evaluation.py from the
metrics_calculation_and_prediction_printing in order to produce logging text
for the fp"""
def __init__(self, printPredictions, b_e_gt_iterator, cand_entities, cand_entities_len,
log_cand_entities_scores, final_scores, similarity_scores):
self.printPredictions = printPredictions
self.data = b_e_gt_iterator
self.cand_entities = cand_entities
self.cand_entities_len = cand_entities_len
self.log_cand_entities_scores = log_cand_entities_scores
self.final_scores = final_scores
self.similarity_scores = similarity_scores
def check(self, s, e, span_num):
# all the above information that i have for my best_cand_id, now i have to find them
# for the gt of the gm that overlap with my fp tuple.
# compare my tuple s, e with all the gm
acc = []
for (gm_num, s2, e2, gt) in self.data:
overlap = False # overlap with this specific gm of the for loop
if s<=s2 and e<=e2 and s2<e:
overlap = True
elif s>=s2 and e>=e2 and s<e2:
overlap = True
elif s<=s2 and e>=e2:
overlap = True
elif s>=s2 and e<=e2:
overlap = True
if not overlap:
continue
# add to the text accumulator the info for this gt
# find gt_score, gt_similarity_score, gt_cand_position
# check all the candidate entities of this span and find where the gt is
# of course we may not find it at all (recall miss)
gt_cand_position = -1
for j in range(self.cand_entities_len[span_num]):
if self.cand_entities[span_num][j] == gt:
gt_cand_position = j
break
if gt_cand_position >= 0:
acc.append("| {}, score={}, sim_score={}, logpem={}, pem_pos={}".format(
self.printPredictions.map_entity(gt),
self.final_scores[span_num][gt_cand_position],
self.similarity_scores[span_num][gt_cand_position],
self.log_cand_entities_scores[span_num][gt_cand_position],
gt_cand_position))
else:
acc.append("| {}, recall miss".format(self.printPredictions.map_entity(gt)))
if acc == []:
acc.append("| no overlap with gm")
return ' '.join(acc)
================================================
FILE: code/evaluation/print_predictions.py
================================================
from termcolor import colored
import pickle
from preprocessing.util import load_wikiid2nnid, reverse_dict, load_wiki_name_id_map
from collections import defaultdict
import operator
import numpy as np
class GMBucketingResults(object):
def __init__(self, gm_bucketing_pempos):
gm_bucketing_pempos.append(200) # [0,1,2,7,200]
self.gm_buckets = gm_bucketing_pempos
self.gm_cnt = defaultdict(int) # how many gold mentions fall in that bucket (one counter for each bucket)
self.fn_cnt = defaultdict(int) # how many false negatives fall in that bucket
self.fn_nowinnermatch_cnt = defaultdict(int) # from the fn we exclude the ones where our winner was identical
# to the gt, even if we decided not to annotate it in the end
self.gm_to_gt_unique_mapping = 0 # counts gold mentions that have only one candidate entity, which is the gt
def reinitialize(self):
self.gm_cnt = defaultdict(int)
self.fn_cnt = defaultdict(int)
self.fn_nowinnermatch_cnt = defaultdict(int)
self.gm_to_gt_unique_mapping = 0
def process_fn(self, pos, match_with_winner, num_of_cand_entities):
if pos == 0 and num_of_cand_entities == 1:
self.gm_to_gt_unique_mapping += 1
for t in self.gm_buckets:
if pos <= t:
self.gm_cnt[t] += 1
self.fn_cnt[t] += 1
if not match_with_winner:
self.fn_nowinnermatch_cnt[t] += 1
break
def process_tp(self, pos, num_of_cand_entities):
if pos == 0 and num_of_cand_entities == 1:
self.gm_to_gt_unique_mapping += 1
for t in self.gm_buckets:
if pos <= t:
self.gm_cnt[t] += 1
break
def print(self):
print("gm_to_gt_unique_mapping =", self.gm_to_gt_unique_mapping)
for t in self.gm_buckets:
print(str(t), "]", "gm_cnt=", str(self.gm_cnt[t]),
"solved=%.1f" % (100*(self.gm_cnt[t] - self.fn_cnt[t])/self.gm_cnt[t]),
"winner_match=%.1f" % (100*(self.gm_cnt[t] - self.fn_nowinnermatch_cnt[t])/self.gm_cnt[t]))
class PrintPredictions(object):
def __init__(self, output_folder, predictions_folder, entity_extension=None, gm_bucketing_pempos=None,
print_global_voters=False, print_global_pairwise_scores=False):
self.thr = None
self.output_folder = output_folder
self.predictions_folder = predictions_folder
with open(output_folder+"word_char_maps.pickle", 'rb') as handle:
_, self.id2word, _, self.id2char, _, _ = pickle.load(handle)
self.nnid2wikiid = reverse_dict(load_wikiid2nnid(entity_extension), unique_values=True)
_, self.wiki_id_name_map = load_wiki_name_id_map()
self.extra_info = ""
self.gm_bucketing = GMBucketingResults(gm_bucketing_pempos) if gm_bucketing_pempos else None
self.print_global_pairwise_scores = print_global_pairwise_scores
self.print_global_voters = print_global_voters
def map_entity(self, nnid, onlyname=False):
wikiid = self.nnid2wikiid[nnid]
wikiname = self.wiki_id_name_map[wikiid].replace(' ', '_') if wikiid != "<u>" else "<u>"
return wikiname if onlyname else "{} {}".format(wikiid, wikiname)
def process_file(self, el_mode, name, opt_thr):
self.thr = opt_thr
self.el_mode = el_mode
filepath = self.predictions_folder + ("el/" if el_mode else "ed/") + name
self.fout = open(filepath, "w")
if self.gm_bucketing:
self.gm_bucketing.reinitialize()
def file_ended(self):
self.fout.close()
if self.gm_bucketing:
self.gm_bucketing.print()
def scores_text(self, scores_l, scores_names_l, i, j):
return ' '.join([scores_name + "=" + str(score[i][j]) for scores_name, score in zip(scores_names_l, scores_l)])
def process_sample(self, chunkid,
tp_pred, fp_pred, fn_pred, gt_minus_fn_pred,
words, words_len, chars, chars_len,
cand_entities, cand_entities_len,
final_scores, filtered_spans, scores_l, scores_names_l, gmask, entity_embeddings):
"""words: [None] 1d the words of a sample, words_len: scalar,
chars: [None, None] 2d words, chars of each word, chars_len: [None] for each word
the length in terms of characters.
cand_entities: [None, None] gold_mentions, candidates for each gm,
cand_entitites_len: [None] how many cand ent each gm has.
filtered_spans = [span1, span2,...] sorted in terms of score. each span is a tuple
(score, begin_idx, end_idx, best_nnid, simil_score, best_position 1-30, span_num)
tp_pred and fp_pred is also a list of spans like above and it is also sorted for score.
fn_pred is a [(gm_num, begin_gm, end_gm, gt)]"""
reconstructed_words = []
for i in range(words_len):
word = words[i]
if word != 0:
reconstructed_words.append(self.id2word[word])
else: # <wunk>
word_chars = []
for j in range(chars_len[i]):
word_chars.append(self.id2char[chars[i][j]])
reconstructed_words.append(''.join(word_chars))
span_num_b_e_gt = sorted(fn_pred+gt_minus_fn_pred)
text_tags = defaultdict(list)
gt_legend = []
if len(fn_pred) > 0:
fnWeakMatcherLogging = FNWeakMatcherLogging(self, filtered_spans, cand_entities,
cand_entities_len, final_scores, scores_l, scores_names_l,
reconstructed_words, self.gm_bucketing, gmask, entity_embeddings,
span_num_b_e_gt)
for mylist, mycolor in zip([gt_minus_fn_pred, fn_pred], ["green", "red"]):
for i, (gm_num, b, e, gt) in enumerate(mylist, 1):
text_tags[b].append((1, colored("[{}".format(i), mycolor)))
text_tags[e].append((0, colored("]", mycolor)))
gt_text = ""
if self.el_mode is False: # find the position and the score of the ground truth
gt_text = "gt not in candidate entities (recall miss)"
for j in range(cand_entities_len[gm_num]):
if cand_entities[gm_num][j] == gt:
gt_text = "gt_p_e_m_pos={}, {}".format(j,
self.scores_text(scores_l, scores_names_l, gm_num, j))
break
text = "{}: {} {}".format(i, self.map_entity(gt), gt_text)
if mycolor == "red":
text += fnWeakMatcherLogging.check(gm_num, b, e, gt)
text = colored(text, mycolor)
gt_legend.append(text)
tp_legend = []
tp_pred = sorted(tp_pred, key=operator.itemgetter(1))
for i, (score, b, e, nnid, scores_text, p_e_m_pos, span_num) in enumerate(tp_pred, 1):
text_tags[b].append((1, colored("[{}".format(i), "blue")))
text_tags[e].append((0, colored("]", "blue")))
text = colored("{}: {}, score={}, {}, pem_pos={}".format(i,
self.map_entity(nnid), score, scores_text, p_e_m_pos), "blue")
tp_legend.append(text)
if self.gm_bucketing:
self.gm_bucketing.process_tp(p_e_m_pos, cand_entities_len[span_num])
fp_legend = []
fp_pairwise_scores_legend = []
fp_pred = sorted(fp_pred, key=operator.itemgetter(1))
if len(fp_pred) > 0:
fpWeakMatcherLogging = FPWeakMatcherLogging(self, span_num_b_e_gt, #fn_pred+gt_minus_fn_pred,
cand_entities, cand_entities_len,
final_scores, scores_l, scores_names_l, reconstructed_words, self.gm_bucketing,
gmask, entity_embeddings)
for i, (score, b, e, nnid, scores_text, p_e_m_pos, span_num) in enumerate(fp_pred, 1):
text_tags[b].append((1, colored("[{}".format(i), "magenta")))
text_tags[e].append((0, colored("]", "magenta")))
fp_gt_text, pairwise_score_text = fpWeakMatcherLogging.check(b, e, span_num, p_e_m_pos)
text = "{}: {}, score={}, {}, pem_pos={} {} ".format(i,
self.map_entity(nnid), score, scores_text, p_e_m_pos,
fp_gt_text)
fp_legend.append(colored(text, "magenta"))
fp_pairwise_scores_legend.append("\n"+text)
fp_pairwise_scores_legend.append(pairwise_score_text)
final_acc = ["new sample " + chunkid+"\n"+self.extra_info+"\n"]
for i in range(words_len+1):
final_acc.extend([text for _, text in sorted(text_tags[i])])
if i < words_len:
final_acc.append(reconstructed_words[i])
self.fout.write(" ".join(final_acc)+"\n")
if self.print_global_voters:
self.fout.write("global score voters and weights:\n")
gmask_print_string = self.print_gmask(gmask, span_num_b_e_gt, reconstructed_words, cand_entities)
self.fout.write(gmask_print_string+"\n")
self.fout.write("\n".join(gt_legend + tp_legend + fp_legend))
if self.print_global_pairwise_scores:
self.fout.write(colored("\n".join(fp_pairwise_scores_legend), "grey"))
self.fout.write("\n")
def print_gmask(self, gmask, span_num_b_e_gt, reconstructed_words, cand_entities):
i = 0
document_gmask_acc = []
for span_num, b, e, gt in span_num_b_e_gt:
assert(i == span_num)
text_acc = ["mention {} {}: ".format(span_num, ' '.join(reconstructed_words[b:e]))]
for cand_ent_pos in range(gmask.shape[1]):
mask_value = gmask[span_num][cand_ent_pos]
assert(mask_value >= 0)
if mask_value > 0:
text_acc.append("{} {:.2f} | ".format(self.map_entity(cand_entities[span_num][cand_ent_pos]),
mask_value))
i += 1
document_gmask_acc.append(' '.join(text_acc))
return '\n'.join(document_gmask_acc)
class FPWeakMatcherLogging(object):
"""is initialized with the gm_gt_list i.e. a list of tuples
(begin_idx, end_idx, gt) and from the list of tuples it builds a data structure. We already
know that our tuple doesn't match a ground truth. Now we want to find out what exactly happens.
cases: 1)) doesn't overlap with any gm 2)) overlap with one or more gm. In this case for each gm
that it overlaps with find a) which is the gt of this gm, b) final_score, sim_score, p_e_m position
of the gt in my fp tuple.
structure used: just a list of (begin_idx, end_idx, gt) tuples.
This one is used only during evaluation.py from the
metrics_calculation_and_prediction_printing in order to produce logging text
for the fp"""
def __init__(self, printPredictions, span_num_b_e_gt, cand_entities, cand_entities_len,
final_scores, scores_l, scores_names_l, reconstructed_words, gm_bucketing=None,
gmask=None, entity_embeddings=None):
self.printPredictions = printPredictions
self.data = span_num_b_e_gt
self.cand_entities = cand_entities
self.cand_entities_len = cand_entities_len
self.final_scores = final_scores
self.scores_l = scores_l
self.scores_names_l = scores_names_l
self.reconstructed_words = reconstructed_words
self.gm_bucketing = gm_bucketing
self.gmask = gmask
self.entity_embeddings = entity_embeddings
def check(self, s, e, span_num, winner_pos=None):
# all the information we have for our best_cand_id must now also be found
# for the gt of each gm that overlaps with our fp tuple.
# compare our tuple (s, e) with all the gm
acc = []
pairwise_scores_text = ""
for (gm_num, s2, e2, gt) in self.data:
overlap = False # overlap with this specific gm of the for loop
if s<=s2 and e<=e2 and s2<e:
overlap = True
elif s>=s2 and e>=e2 and s<e2:
overlap = True
elif s<=s2 and e>=e2:
overlap = True
elif s>=s2 and e<=e2:
overlap = True
if not overlap:
continue
# add to the text accumulator the info for this gt
# find gt_score, gt_similarity_score, gt_cand_position
# check all the candidate entities of this fp span and find where the gt is
# of course we may not find it at all (recall miss)
gt_cand_position = -1
for j in range(self.cand_entities_len[span_num]):
if self.cand_entities[span_num][j] == gt:
gt_cand_position = j
break
if gt_cand_position >= 0:
acc.append("| {}, score={}, {}, pem_pos={}".format(
self.printPredictions.map_entity(gt),
self.final_scores[span_num][gt_cand_position],
self.printPredictions.scores_text(self.scores_l, self.scores_names_l, span_num, gt_cand_position),
gt_cand_position))
else:
acc.append("| {}, recall miss".format(self.printPredictions.map_entity(gt)))
if self.printPredictions.print_global_pairwise_scores:
pairwise_scores_text = print_global_pairwise_voting(self.gmask, self.data, self.reconstructed_words,
self.cand_entities, self.printPredictions,
self.entity_embeddings, span_num, winner_pos,
gt_cand_position)
if acc == []:
acc.append("| no overlap with gm")
return ' '.join(acc), pairwise_scores_text
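# Hedged clarification sketch: for half-open spans [s, e) and [s2, e2) the four overlap branches
# used in check() above collapse to the standard interval-intersection test s < e2 and s2 < e.
# _spans_overlap_sketch is illustrative only and is not called by the pipeline.
def _spans_overlap_sketch(s, e, s2, e2):
    return s < e2 and s2 < e   # e.g. (2, 5) and (4, 7) overlap, (2, 5) and (5, 7) do not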
class FNWeakMatcherLogging(object):
""" This is used to produce text for the FN.
From the filtered spans i.e. the spans that we keep that do not overlap with each other
filtered_spans: [(best_cand_score, begin_idx, end_idx, best_cand_id,
scores_text, best_cand_position, span_num),(),...]"""
def __init__(self, printPredictions, filtered_spans, cand_entities, cand_entities_len,
final_scores, scores_l, scores_names_l, reconstructed_words, gm_bucketing=None,
gmask=None, entity_embeddings=None, span_num_b_e_gt=None):
self.printPredictions = printPredictions
self.data = filtered_spans
self.cand_entities = cand_entities
self.cand_entities_len = cand_entities_len
self.scores_l = scores_l
self.scores_names_l = scores_names_l
self.final_scores = final_scores
self.reconstructed_words = reconstructed_words
self.gm_bucketing = gm_bucketing
self.gmask = gmask
self.entity_embeddings = entity_embeddings
self.span_num_b_e_gt = span_num_b_e_gt
def check(self, gm_num, s, e, gt):
# compare each span of filtered_spans with the fn mention; for each one that overlaps,
# print its winner entity (even if it was below the threshold) and the score assigned
# to the gt (if the gt was among its candidates)
acc = []
for (best_cand_score, s2, e2, best_cand_id, scores_text, best_cand_position, span_num) in self.data:
overlap = False # overlap with this specific filtered span
if s<=s2 and e<=e2 and s2<e:
overlap = True
elif s>=s2 and e>=e2 and s<e2:
overlap = True
elif s<=s2 and e>=e2:
overlap = True
elif s>=s2 and e<=e2:
overlap = True
if not overlap:
continue
# add to the text accumulator the info for this filtered span
# print winner of this span info plus gt info: find gt_score, gt_cand_position
# check all the candidate entities of this span and find where the gt is
# of course we may not find it at all (recall miss)
gt_cand_position = -1
for j in range(self.cand_entities_len[span_num]):
if self.cand_entities[span_num][j] == gt:
gt_cand_position = j
break
assert(abs(best_cand_score - self.final_scores[span_num][best_cand_position]) < 0.001)
acc.append("[span: {} winner: {}, score={}, {}, pem_pos={}".format(
' '.join(self.reconstructed_words[s2:e2]),
self.printPredictions.map_entity(best_cand_id),
best_cand_score,
self.printPredictions.scores_text(self.scores_l, self.scores_names_l, span_num, best_cand_position),
best_cand_position))
if gt_cand_position >= 0:
acc.append(" | gt: {}, score={}, {}, pem_pos={} ]".format(
self.printPredictions.map_entity(gt),
self.final_scores[span_num][gt_cand_position],
self.printPredictions.scores_text(self.scores_l, self.scores_names_l, span_num, gt_cand_position),
gt_cand_position))
if self.gm_bucketing:
self.gm_bucketing.process_fn(gt_cand_position, best_cand_id == gt, self.cand_entities_len[span_num])
else:
acc.append(" | {}, recall miss".format(self.printPredictions.map_entity(gt)))
if False and self.printPredictions.print_global_pairwise_scores:
acc.append(print_global_pairwise_voting(self.gmask, self.span_num_b_e_gt, self.reconstructed_words,
self.cand_entities, self.printPredictions,
self.entity_embeddings, span_num, best_cand_position,
gt_cand_position))
if acc == []:
acc.append(" | no overlap with any filtered span")
return ' '.join(acc)
# TODO: for ED this works well; for EL more spans than just the gold mentions vote.
# Pass the begin_spans, end_spans from metrics.py as parameters.
def print_global_pairwise_voting(gmask, span_num_b_e_gt, reconstructed_words, cand_entities, printPredictions,
entity_embeddings, span_num, winner_pos, gt_pos):
i = 0
return_acc = ["'winner & gt' score given by each global voter"]
winner_score_sum = 0
gt_score_sum = 0
voters_cnt = 0
for other_span, b, e, _ in span_num_b_e_gt:
assert(i == other_span)
if other_span == span_num:
i += 1
continue #only the other spans vote
mention_acc = ["mention {} {}: ".format(other_span, ' '.join(reconstructed_words[b:e]))]
for cand_ent_pos in range(gmask.shape[1]):
mask_value = gmask[other_span][cand_ent_pos]
assert(mask_value >= 0)
if mask_value > 0:
winner_score = np.dot(entity_embeddings[other_span][cand_ent_pos],
entity_embeddings[span_num][winner_pos]) * mask_value
gt_score = np.dot(entity_embeddings[other_span][cand_ent_pos],
entity_embeddings[span_num][gt_pos]) * mask_value
winner_score_sum += winner_score
gt_score_sum += gt_score
voters_cnt += 1
mention_acc.append("{} {:.2f} & {:.2f} |".format(
printPredictions.map_entity(cand_entities[other_span][cand_ent_pos], onlyname=True),
winner_score, gt_score))
i += 1
return_acc.append(' '.join(mention_acc))
return_acc.append("global winner_score_avg = {:.2f} gt_score_avg = {:.2f}".format(
winner_score_sum/voters_cnt, gt_score_sum/voters_cnt))
return '\n'.join(return_acc)
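# Hedged numeric sketch (hypothetical embeddings) of a single global voter's contribution in
# print_global_pairwise_voting above: a voter candidate with mask value m and embedding v adds
# m * dot(v, winner_embedding) to the winner score and m * dot(v, gt_embedding) to the gt score.
def _single_voter_contribution_sketch():
    v = np.array([0.2, 0.4])        # voter candidate embedding (hypothetical values)
    winner = np.array([1.0, 0.0])   # winner entity embedding (hypothetical values)
    gt = np.array([0.0, 1.0])       # ground-truth entity embedding (hypothetical values)
    mask_value = 0.5
    return np.dot(v, winner) * mask_value, np.dot(v, gt) * mask_value   # -> (0.1, 0.2)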
================================================
FILE: code/evaluation/summarize_all_experiments.py
================================================
import argparse
import os
def process_experiment(ed_acc, el_acc, training_name):
if not os.path.exists(os.path.join(training_name, "log.txt")):
print("File doesn't exists: ", os.path.join(training_name, "log.txt"))
return
if file_is_used(os.path.join(training_name, "log.txt")):
print("File is being used by another process. Skip it.", os.path.join(training_name, "log.txt"))
return
with open(os.path.join(training_name, "log.txt"), "r") as fin:
print("file: ", training_name+"/log.txt")
best = dict()
best["ed_dev_f1"] = 0
best["el_dev_f1"] = 0
best["ed_test_f1"] = 0
best["el_test_f1"] = 0
mode = ""
for line in fin:
line = line.rstrip()
if line.startswith("args.eval_cnt"):
eval_cnt = line[line.rfind(' ')+1:]
elif line.startswith("Evaluating ED datasets"):
mode = "ed"
elif line.startswith("Evaluating EL datasets"):
mode = "el"
elif line.startswith(args.dev_set): #("aida_dev.txt"):
try:
micro_line = next(fin)
macro_line = next(fin)
line = macro_line if args.macro_or_micro == "macro" else micro_line
dev_f1 = float(line.split()[-1])
dev_pr = float(line.split()[2])
dev_re = float(line.split()[4])
if dev_f1 > best[mode+"_dev_f1"]:
best[mode+"_dev_f1"] = dev_f1
best[mode+"_dev_pr"] = dev_pr
best[mode+"_dev_re"] = dev_re
best[mode+"_eval_cnt"] = eval_cnt
# now read forward the test results
#assert(next(fin).startswith(args.test_set)) #("aida_test.txt"))
next_line = next(fin)
while not next_line.startswith(args.test_set):
next_line = next(fin)
assert(next_line.startswith(args.test_set)) #("aida_test.txt"))
micro_line = next(fin)
macro_line = next(fin)
line = macro_line if args.macro_or_micro == "macro" else micro_line
best[mode+"_test_f1"] = float(line.split()[-1])
best[mode+"_test_pr"] = float(line.split()[2])
best[mode+"_test_re"] = float(line.split()[4])
except StopIteration:
break
path = training_name[training_name.find(base_folder)+len(base_folder):]
# print the scores for this log file
#fixed_no_wikidump_entvecsl2/checkpoints/model-7 model-30.meta
if "ed_eval_cnt" in best:
checkpoint_text = "checkpoint_yes" if os.path.exists(training_name + "/checkpoints/ed/model-{}.meta".format(best["ed_eval_cnt"])) else "checkpoint_no"
ed_acc.append((best["ed_dev_f1"], best["ed_test_f1"], path,
best["ed_test_pr"], best["ed_test_re"], best["ed_eval_cnt"], checkpoint_text, training_name))
if "el_eval_cnt" in best:
checkpoint_text = "checkpoint_yes" if os.path.exists(training_name + "/checkpoints/el/model-{}.meta".format(best["el_eval_cnt"])) else "checkpoint_no"
el_acc.append((best["el_dev_f1"], best["el_test_f1"], path,
best["el_test_pr"], best["el_test_re"], best["el_eval_cnt"], checkpoint_text, training_name))
def process_folder(ed_acc, el_acc, training_name):
"""training_name may be a folder with one experiment or a group folder containing many experiment. In
the second case do recursion on all the subfolders."""
training_name_suffix = os.path.basename(os.path.normpath(training_name))
if training_name_suffix.startswith("group_") or training_name_suffix.startswith("reduced") or\
training_name_suffix.startswith("ensemble_"):
# then it is a group folder, so recurse into all its subfolders
d = training_name
subfolders = [os.path.join(d, o) for o in os.listdir(d)
if os.path.isdir(os.path.join(d, o))]
for subfolder in subfolders:
process_folder(ed_acc, el_acc, subfolder)
else:
process_experiment(ed_acc, el_acc, training_name)
def file_is_used(filepath):
from subprocess import check_output, Popen, PIPE, DEVNULL, STDOUT
try:
lsout = Popen(['lsof', filepath], stdout=PIPE, shell=False, stderr=DEVNULL)
check_output(["grep", filepath], stdin=lsout.stdout, shell=False)
return True
except Exception:
#check_output will throw an exception here if it doesn't find any process using that file
return False
def main():
ed_acc = []
el_acc = []
if args.group_folder_path:
process_folder(ed_acc, el_acc, args.group_folder_path)
else:
d = base_folder
# if the base folder is itself an experiment i.e. contains a training_folder and an all_spans_training_folder
print(os.listdir(d))
if len([o for o in os.listdir(d) if o in ["all_spans_training_folder", "training_folder"]]) > 0:
experiment_names = [d]
else:
experiment_names = [os.path.join(d, o) for o in os.listdir(d)
if os.path.isdir(os.path.join(d, o))]
print("experiment_names =", experiment_names)
for experiment_name in experiment_names:
training_names = []
for temp in ["training_folder", "all_spans_training_folder"]:
d = os.path.join(experiment_name, temp)
if not os.path.exists(d):
continue
training_names.extend([os.path.join(d, o) for o in os.listdir(d)
if os.path.isdir(os.path.join(d, o))])
#print(training_names)
for training_name in training_names:
process_folder(ed_acc, el_acc, training_name)
ed_acc = sorted(ed_acc, reverse=True)
el_acc = sorted(el_acc, reverse=True)
print("Dev_score, Test_score, path, test_precision, test_recall, eval_cnt, checkp_existence")
print("ED Best Scores:")
for t in ed_acc:
print('\t'.join(map(str, t[:-1])))
print("\n\n\nEL Best Scores:")
for t in el_acc:
print('\t'.join(map(str, t[:-1])))
def _parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_folder", default="../../data/tfrecords/")
parser.add_argument("--macro_or_micro", default="macro")
parser.add_argument("--dev_set", default="aida_dev.txt")
parser.add_argument("--test_set", default="aida_test.txt")
parser.add_argument("--group_folder_path", default=None)
return parser.parse_args()
if __name__ == "__main__":
args = _parse_args()
base_folder = os.path.abspath(args.base_folder)
print("base_folder =", base_folder)
print("group_folder =", args.group_folder_path)
main()
================================================
FILE: code/gerbil/build_entity_universe.py
================================================
import pickle
from nltk.tokenize import word_tokenize
import preprocessing.prepro_util as prepro_util
from preprocessing.util import load_wikiid2nnid, reverse_dict, load_wiki_name_id_map, FetchCandidateEntities
import os
import model.config as config
class BuildEntityUniverse(object):
def __init__(self):
self.entities_universe = set()
self.fetchCandidateEntities = FetchCandidateEntities(Struct())
prepro_util.args = Struct()
def process(self, text, given_spans):
# if we wanted to find entities for ed only then restrict it to given_spans instead of all spans
chunk_words = word_tokenize(text)
myspans = prepro_util.SamplesGenerator.all_spans(chunk_words)
for left, right in myspans:
cand_ent, _ = self.fetchCandidateEntities.process(chunk_words[left:right])
# cand_ent is a list of strings (i.e. wikiids are still strings) not nums
if cand_ent:
self.entities_universe.update(cand_ent)
def flush_entity_universe(self):
print("len(self.entities_universe) =", len(self.entities_universe))
entities_folder = config.base_folder+"data/entities/extension_entities/"
_, wiki_id_name_map = load_wiki_name_id_map()
if not os.path.exists(entities_folder):
os.makedirs(entities_folder)
def dump_entities(entity_set, name):
with open(entities_folder + name+".pickle", 'wb') as handle:
pickle.dump(entity_set, handle)
with open(entities_folder + name+".txt", "w") as fout:
for ent_id in entity_set:
fout.write(ent_id + "\t" + wiki_id_name_map[ent_id].replace(' ', '_') + "\n")
dump_entities(self.entities_universe, "entities_universe")
# now calculate the extension, i.e. from this universe omit the entities we have already trained vectors for
extension_entity_set = set()
wikiid2nnid = load_wikiid2nnid()
for wikiid in self.entities_universe:
if wikiid not in wikiid2nnid:
extension_entity_set.add(wikiid)
print("len(extension_entity_set) =", len(extension_entity_set))
dump_entities(extension_entity_set, "extension_entities")
class Struct(object):
def __init__(self):
self.p_e_m_choice = "yago"
self.cand_ent_num = 30
self.lowercase_p_e_m = False
self.lowercase_spans = False
self.max_mention_width = 10
self.spans_separators = ["."]
if __name__ == "__main__":
pass
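# Hedged sketch (hypothetical wiki ids) of the extension-entity computation in
# flush_entity_universe above: entities seen in the universe but absent from wikiid2nnid,
# i.e. entities we have no trained vector for yet, form the extension set.
def _extension_entities_sketch():
    entities_universe = {"123", "456", "789"}   # hypothetical wiki ids
    wikiid2nnid = {"123": 1, "789": 2}          # hypothetical already-trained entities
    return {wikiid for wikiid in entities_universe if wikiid not in wikiid2nnid}   # -> {"456"}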
================================================
FILE: code/gerbil/gerbil_recall_calculation.py
================================================
import argparse
import os
import preprocessing.util as util
import rdflib
class ProcessDataset(object):
def __init__(self):
self.entityNameIdMap = util.EntityNameIdMap()
self.entityNameIdMap.init_gerbil_compatible_ent_id()
self.unknown_ent_name = dict()
self.no_english_uri = dict()
self.all_gm_cnt = dict()
self.englishuri_gm_cnt = dict()
self.valid_gms = dict()
def process(self, filepath, filename):
#the name of the dataset. just extract the last part of path
unknown_ent_name = 0
print("Dataset", filename, filepath)
g = rdflib.Graph()
dataset = g.parse(filepath)
print("graph has %s statements." % len(g))
print(g.serialize(format='n3'))
def main():
if not os.path.exists("/home/master_thesis_share/data/gerbil/nif_original_datasets/"):
os.makedirs("/home/master_thesis_share/data/gerbil/nif_original_datasets/")
paths_names = [("/home/gerbil/gerbil_data/datasets/N3/Reuters-128.ttl", "n3_reuters_128")]
#paths_names = [("/home/gerbil/gerbil_data/datasets/oke-challenge2016/evaluation-data/task1/evaluation-dataset-task1.ttl", "oke2016eval")]
processDataset = ProcessDataset()
for filepath, filename in paths_names:
processDataset.process(filepath, filename)
if __name__ == "__main__":
main()
================================================
FILE: code/gerbil/nn_processing.py
================================================
from model.model_ablations import Model
from time import sleep
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
import preprocessing.prepro_util as prepro_util
from evaluation.metrics import _filtered_spans_and_gm_gt_list
import numpy as np
from preprocessing.util import load_wikiid2nnid, reverse_dict, load_wiki_name_id_map, FetchCandidateEntities, \
load_persons, FetchFilteredCoreferencedCandEntities
import string
class StreamingSamples(object):
def __init__(self):
# those are not used here
#self.chunk_id, self.ground_truth, self.ground_truth_len, self.begin_gm, self.end_gm
#self.cand_entities_labels,
"""only those are used:
self.words, self.words_len, self.chars, self.chars_len, \
self.begin_span, self.end_span, self.spans_len, \
self.cand_entities, self.cand_entities_scores, \
self.cand_entities_len"""
self.sample = None
self.empty = True
def new_sample(self, sample):
self.sample = sample
self.empty = False
def gen(self):
while True:
if not self.empty:
self.empty = True
yield self.sample
else:
print("sleep")
sleep(0.5)
"""
DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_FLOAT, DT_INT64,
words, words_len, chars, chars_len, begin_span, end_span, spans_len, cand_entities, cand_entities_scores, cand_entities_len,
[?,?], [?], [?,?,?], [?,?], [?,?], [?,?], [?], [?,?,?], [?,?,?], [?,?],
words, words_len, chars, chars_len, begin_span, end_span, spans_len, cand_entities, cand_entities_scores, cand_entities_len,
"""
"""(tf.TensorShape([None, None]), tf.TensorShape([None]), tf.TensorShape([None, None, None]), tf.TensorShape([None, None]),
tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None]),
tf.TensorShape([None, None, None]), tf.TensorShape([None, None, None]), tf.TensorShape([None, None])))
"""
class NNProcessing(object):
def __init__(self, train_args, args):
self.args = args
# input pipeline
self.streaming_samples = StreamingSamples()
ds = tf.data.Dataset.from_generator(
self.streaming_samples.gen, (tf.int64, tf.int64, tf.int64, tf.int64, #words, words_len, chars, chars_len
tf.int64, tf.int64, tf.int64, # begin_span, end_span, span_len
tf.int64, tf.float32, tf.int64), #cand_entities, cand_entities_scores, cand_entities_len
(tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([None, None]), tf.TensorShape([None]),
tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([]),
tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None])))
next_element = ds.make_one_shot_iterator().get_next()
# batch size = 1; expand the dims now to match the training input, which has a batch dimension
next_element = [tf.expand_dims(t, 0) for t in next_element]
next_element = [None, *next_element[:-1], None, next_element[-1],
None, None, None, None]
# restore model
print("loading Model:", train_args.output_folder)
model = Model(train_args, next_element)
model.build()
checkpoint_path = model.restore_session("el" if args.el_mode else "ed")
self.model = model
if args.hardcoded_thr:
self.thr = args.hardcoded_thr
print("threshold used:", self.thr)
else:
# optimal threshold recovery from log files.
# based on the checkpoint selected look at the log file for threshold (otherwise recompute it)
self.thr = retrieve_optimal_threshold_from_logfile(train_args.output_folder, checkpoint_path, args.el_mode)
print("optimal threshold selected = ", self.thr)
if args.running_mode == "el_mode":
args.el_mode = True
elif args.running_mode == "ed_mode":
args.el_mode = False
# convert text to tensors for the NN
with open(args.experiment_folder+"word_char_maps.pickle", 'rb') as handle:
self.word2id, _, self.char2id, _, _, _ = pickle.load(handle)
self.wikiid2nnid = load_wikiid2nnid(extension_name=args.entity_extension)
self.nnid2wikiid = reverse_dict(self.wikiid2nnid, unique_values=True)
_, self.wiki_id_name_map = load_wiki_name_id_map()
with open(args.experiment_folder+"prepro_args.pickle", 'rb') as handle:
self.prepro_args = pickle.load(handle)
if args.lowercase_spans_pem:
self.prepro_args.lowercase_p_e_m = True
self.prepro_args.lowercase_spans = True
print("prepro_args:", self.prepro_args)
self.prepro_args.persons_coreference = args.persons_coreference
self.prepro_args.persons_coreference_merge = args.persons_coreference_merge
self.fetchFilteredCoreferencedCandEntities = FetchFilteredCoreferencedCandEntities(self.prepro_args)
prepro_util.args = self.prepro_args
self.special_tokenized_words = {"``", '"', "''"}
self.special_words_assertion_errors = 0
self.gm_idx_errors = 0
if self.args.el_with_stanfordner_and_our_ed:
from nltk.tag import StanfordNERTagger
self.st = StanfordNERTagger(
'../data/stanford_core_nlp/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
'../data/stanford_core_nlp/stanford-ner-2018-02-27/stanford-ner.jar', encoding='utf-8')
self.from_myspans_to_given_spans_map_errors = 0
def process(self, text, given_spans):
self.given_spans = sorted(given_spans) if not self.args.el_mode else given_spans
self.fetchFilteredCoreferencedCandEntities.init_coref(self.args.el_mode)
original_words, chunk_words, startidx2wordnum, endidx2wordnum, words2charidx = \
self.map_words_to_char_positions(text)
if self.args.el_with_stanfordner_and_our_ed: # el test, use stanford ner to extract spans and decide with our ed system
self.given_spans, myspans = self.stanford_ner_spans(chunk_words, words2charidx)
# convert the given_spans (start, length) in characters to spans in word numbers
elif not self.args.el_mode: # simple ed mode. use the spans provided
myspans = []
for span in self.given_spans:
try:
start, length = span
end = start+length
if start not in startidx2wordnum:
start = self.nearest_idx(start, startidx2wordnum.keys())
if end not in endidx2wordnum:
end = self.nearest_idx(end, endidx2wordnum.keys())
if (start, end-start) != span:
print("given span:", text[span[0]:span[0]+span[1]], " new span:",
text[start:end])
myspans.append((startidx2wordnum[start], endidx2wordnum[end]+1))
except KeyError:
print("Exception KeyError!!!!")
print("original_words =", original_words)
print("chunk_words =", chunk_words)
print("start={}, length={}, left={}, span={}, right={}".format(start, length,
text[start-30:start], text[start:start+length], text[start+length:start+length+30]))
print("text =", text)
print("start= {}".format("in" if start in startidx2wordnum else "out"))
print("end= {}".format("in" if start + length in endidx2wordnum else "out"))
else: # simple el mode so consider all possible given_spans
myspans = prepro_util.SamplesGenerator.all_spans(chunk_words)
# at this point, whether we do ed or el with stanford ner plus our ed, we must have myspans as [word_num_begin, word_num_end)
# and self.given_spans, which are the same spans but in characters (begin_char, length)
begin_spans, end_spans, cand_entities, cand_entities_scores = [], [], [], []
for left, right in myspans:
cand_ent, scores = self.fetchFilteredCoreferencedCandEntities.process(left, right, chunk_words)
cand_ent_filtered, scores_filtered = [], []
if cand_ent is not None and scores is not None:
for e, s in zip(cand_ent, scores):
if e in self.wikiid2nnid:
cand_ent_filtered.append(self.wikiid2nnid[e])
scores_filtered.append(s)
if cand_ent_filtered:
begin_spans.append(left)
end_spans.append(right)
cand_entities.append(cand_ent_filtered)
cand_entities_scores.append(scores_filtered)
if begin_spans == []:
return [] # this document has no annotation
words = []
chars = []
for word in chunk_words:
words.append(self.word2id[word] if word in self.word2id
else self.word2id["<wunk>"])
chars.append([self.char2id[c] if c in self.char2id else self.char2id["<u>"]
for c in word])
chars_len = [len(word) for word in chars]
new_sample = (words, len(words), list_of_lists_to_2darray(chars), chars_len,
begin_spans, end_spans, len(begin_spans),
list_of_lists_to_2darray(cand_entities),
list_of_lists_to_2darray(cand_entities_scores),
[len(t) for t in cand_entities])
self.streaming_samples.new_sample(new_sample)
result_l = self.model.sess.run([self.model.final_scores, self.model.cand_entities_len,
self.model.cand_entities, self.model.begin_span, self.model.end_span,
self.model.spans_len], feed_dict={self.model.dropout: 1})
filtered_spans, _ = _filtered_spans_and_gm_gt_list(0, *result_l, None, None, None, [0], [len(words)])
# based on final_scores and thr return annotations. also translate my given_spans to char given_spans
print("self.special_words_assertion_errors =", self.special_words_assertion_errors)
print("gm_idx_errors =", self.gm_idx_errors)
response = []
for span in filtered_spans:
score, begin_idx, end_idx, nnid = span
if score >= self.thr:
self._add_response_span(response, span, words2charidx)
print("persons_mentions_seen =", self.fetchFilteredCoreferencedCandEntities.persons_mentions_seen)
return response
def map_words_to_char_positions(self, text):
original_words = word_tokenize(text)
words2charidx = []
idx = 0
startidx2wordnum, endidx2wordnum = None, None
if not self.args.el_mode:
startidx2wordnum = dict()
endidx2wordnum = dict()
given_span_pos = -1
span_start, span_len = -100, 0
span_end = -100
chunk_words = [] # correct the special words
word_num = 0
def insert_word(start, end):
chunk_words.append(text[start:end])
words2charidx.append((start, end)) # [..)
if not self.args.el_mode:
startidx2wordnum[start] = word_num
endidx2wordnum[end] = word_num
for word in original_words:
original_word = word
if word in self.special_tokenized_words:
smallest_idx = len(text)
for special_word in self.special_tokenized_words:
start = text.find(special_word, idx)
if start != -1 and start < smallest_idx:
word = special_word
smallest_idx = start
if word != '"':
pass
#print("special word replacement: ", original_words[max(0, word_num-2):word_num+2], "new word:", word)
start = text.find(word, idx)
if start == -1 or start > idx + 10:
print("Assertion Error! in words2charidx. word={}, original_word={}".format(word, original_word),
"near_text={}\nsnippet={}".format(text[idx:idx+20], text[idx-50:idx+50]))
self.special_words_assertion_errors += 1
for special_word in self.special_tokenized_words:
start = text.find(special_word, idx)
print("idx=", idx, "special_word =", special_word, "start=", start)
else:
end = start + len(word)
idx = end
if self.args.el_mode:
insert_word(start, end)
word_num += 1
else:
while span_end <= start and given_span_pos < len(self.given_spans)-1:
given_span_pos += 1
span_start, span_len = self.given_spans[given_span_pos]
span_end = span_start + span_len
inserted_flag = False
for boundary in [span_start, span_end]:
if start < boundary < end:
print("given spans fix. original text: ", text)
print("original word: ", text[start:end], word)
print("new split: ", text[start:boundary], " and ", text[boundary:end])
insert_word(start, boundary)
word_num += 1
insert_word(boundary, end)
word_num += 1
print(words2charidx)
print(startidx2wordnum)
print(endidx2wordnum)
inserted_flag = True
if not inserted_flag:
insert_word(start, end)
word_num += 1
return original_words, chunk_words, startidx2wordnum, endidx2wordnum, words2charidx
def nearest_idx(self, key, values):
self.gm_idx_errors += 1
# find the value in values that is nearest to key
nearest_value = None
min_distance = 1e+6
for value in values:
if abs(key - value) < min_distance:
nearest_value = value
min_distance = abs(key-value)
return nearest_value
def _add_response_span(self, response, span, words2charidx):
score, begin_idx, end_idx, nnid = span
start = words2charidx[begin_idx][0] # the word begin_idx starts at this character
end = words2charidx[end_idx-1][1] # the word end_idx-1 ends at this character
wikiid = self.nnid2wikiid[nnid]
wikiname = self.wiki_id_name_map[wikiid].replace(' ', '_')
if not self.args.el_mode: # try to match it with a given span
start, end = self.nearest_given_span(start, end)
response.append((start, end-start, wikiname))
def nearest_given_span(self, begin_idx, end_idx): # [begin_idx, end_idx) end_idx points to the next character after mention
min_distance = 1e+6
nearest_idxes = (-1, -1)
for (start, length) in self.given_spans:
distance = abs(begin_idx - start) + abs(end_idx - (start + length))
if distance < min_distance:
nearest_idxes = (start, start + length)
min_distance = distance
if min_distance > 0:
self.from_myspans_to_given_spans_map_errors += 1
return nearest_idxes
def stanford_ner_spans(self, words_l, words2charidx):
"""returns a list of tuples (start_idx, length)"""
tags = self.st.tag(words_l)
begin_spans, end_spans, prev_tag = [], [], 'O'
for i, (_, tag) in enumerate(tags):
if tag == 'O' and prev_tag != 'O':
end_spans.append(i)
elif tag == 'O' and prev_tag == 'O':
pass
elif tag != 'O' and prev_tag == 'O':
begin_spans.append(i)
elif tag != 'O' and prev_tag == tag:
pass
elif tag != 'O' and prev_tag != tag: # and prev_tag != 'O'
end_spans.append(i)
begin_spans.append(i)
prev_tag = tag
char_spans = [] # (begin_char, length)
word_spans = [] # [begin_word, end_word)
for bw, ew in zip(begin_spans, end_spans):
word_spans.append((bw, ew))
bc = words2charidx[bw][0]
ec = words2charidx[ew-1][1]
char_spans.append((bc, ec - bc))
return char_spans, word_spans
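# Hedged sketch (hypothetical tagger output) of the tag-to-span conversion in stanford_ner_spans
# above: consecutive identically-tagged non-'O' words form one half-open word span.
def _ner_tags_to_word_spans_sketch():
    tags = ["PERSON", "PERSON", "O", "LOCATION", "O"]   # hypothetical tags, 'O' = outside
    begin_spans, end_spans, prev_tag = [], [], "O"
    for i, tag in enumerate(tags):
        if tag == "O" and prev_tag != "O":
            end_spans.append(i)
        elif tag != "O" and prev_tag == "O":
            begin_spans.append(i)
        elif tag != "O" and prev_tag not in ("O", tag):
            end_spans.append(i)
            begin_spans.append(i)
        prev_tag = tag
    return list(zip(begin_spans, end_spans))   # -> [(0, 2), (3, 4)]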
def list_of_lists_to_2darray(a):
# with padding zeros
b = np.zeros([len(a), len(max(a, key=lambda x: len(x)))])
for i, j in enumerate(a):
b[i][0:len(j)] = j
return b
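# Hedged usage sketch for list_of_lists_to_2darray above (illustrative values): shorter rows are
# right-padded with zeros up to the length of the longest inner list.
def _padding_sketch():
    arr = list_of_lists_to_2darray([[3, 5, 1], [7]])
    assert arr.shape == (2, 3) and arr[1].tolist() == [7.0, 0.0, 0.0]
    return arr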
def retrieve_optimal_threshold_from_logfile(model_folder, checkpoint_path, el_mode):
eval_cnt = checkpoint_path[checkpoint_path.rfind("-")+1:] # fixed_no_wikidump_entvecsl2/checkpoints/model-7
print("eval_cnt from checkpoint_path =", eval_cnt)
with open(model_folder+"log.txt", "r") as fin:
line = next(fin).strip()
while line != "args.eval_cnt = " + eval_cnt:
line = next(fin).strip()
line = next(fin).strip()
while line != "Evaluating {} datasets".format("EL" if el_mode else "ED"):
line = next(fin).strip()
line = next(fin).strip() # Best validation threshold = -0.112 with F1=91.8
line = line.split()
assert line[3] == "=" and line[5] == "with", line
return float(line[4])
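# Hedged sketch of the threshold-line parsing above, using the example line quoted in the
# comment ("Best validation threshold = -0.112 with F1=91.8"): token 4 is the threshold.
def _threshold_line_parsing_sketch():
    line = "Best validation threshold = -0.112 with F1=91.8".split()
    assert line[3] == "=" and line[5] == "with"
    return float(line[4])   # -> -0.112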
if __name__ == "__main__":
pass
================================================
FILE: code/gerbil/nn_processing_correct.py
================================================
from model.model_ablations import Model
from time import sleep
import tensorflow as tf
import pickle
from nltk.tokenize import word_tokenize
from preprocessing.prepro_util import SamplesGenerator
from evaluation.metrics import _filtered_spans_and_gm_gt_list
import numpy as np
from preprocessing.util import load_wikiid2nnid, reverse_dict, load_wiki_name_id_map, FetchCandidateEntities, load_persons
class StreamingSamples(object):
def __init__(self):
# those are not used here
#self.chunk_id, self.ground_truth, self.ground_truth_len, self.begin_gm, self.end_gm
#self.cand_entities_labels,
"""only those are used:
self.words, self.words_len, self.chars, self.chars_len, \
self.begin_span, self.end_span, self.spans_len, \
self.cand_entities, self.cand_entities_scores, \
self.cand_entities_len"""
self.sample = None
self.empty = True
def new_sample(self, sample):
self.sample = sample
self.empty = False
def gen(self):
while True:
if not self.empty:
self.empty = True
yield self.sample
else:
print("sleep")
sleep(0.5)
"""
DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_INT64, DT_FLOAT, DT_INT64,
words, words_len, chars, chars_len, begin_span, end_span, spans_len, cand_entities, cand_entities_scores, cand_entities_len,
[?,?], [?], [?,?,?], [?,?], [?,?], [?,?], [?], [?,?,?], [?,?,?], [?,?],
words, words_len, chars, chars_len, begin_span, end_span, spans_len, cand_entities, cand_entities_scores, cand_entities_len,
"""
"""(tf.TensorShape([None, None]), tf.TensorShape([None]), tf.TensorShape([None, None, None]), tf.TensorShape([None, None]),
tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None]),
tf.TensorShape([None, None, None]), tf.TensorShape([None, None, None]), tf.TensorShape([None, None])))
"""
class NNProcessing(object):
def __init__(self, train_args, args):
self.args = args
# input pipeline
self.streaming_samples = StreamingSamples()
ds = tf.data.Dataset.from_generator(
self.streaming_samples.gen, (tf.int64, tf.int64, tf.int64, tf.int64, #words, words_len, chars, chars_len
tf.int64, tf.int64, tf.int64, # begin_span, end_span, span_len
tf.int64, tf.float32, tf.int64), #cand_entities, cand_entities_scores, cand_entities_len
(tf.TensorShape([None]), tf.TensorShape([]), tf.TensorShape([None, None]), tf.TensorShape([None]),
tf.TensorShape([None]), tf.TensorShape([None]), tf.TensorShape([]),
tf.TensorShape([None, None]), tf.TensorShape([None, None]), tf.TensorShape([None])))
next_element = ds.make_one_shot_iterator().get_next()
# batch size = 1; expand the dims now to match the training input, which has a batch dimension
next_element = [tf.expand_dims(t, 0) for t in next_element]
next_element = [None, *next_element[:-1], None, next_element[-1],
None, None, None, None]
# restore model
print("loading Model:", train_args.output_folder)
model = Model(train_args, next_element)
model.build()
checkpoint_path = model.restore_session("el" if args.el_mode else "ed")
self.model = model
if args.hardcoded_thr:
self.thr = args.hardcoded_thr
print("threshold used:", self.thr)
else:
# optimal threshold recovery from log files.
# based on the checkpoint selected look at the log file for threshold (otherwise recompute it)
self.thr = retrieve_optimal_threshold_from_logfile(train_args.output_folder, checkpoint_path, args.el_mode)
print("optimal threshold selected = ", self.thr)
if args.running_mode == "el_mode":
args.el_mode = True
elif args.running_mode == "ed_mode":
args.el_mode = False
# else it remains as it is
if args.manual_thr:
self.thr = args.manual_thr
print("threshold manually overriden = ", self.thr)
# convert text to tensors for the NN
with open(args.experiment_folder+"word_char_maps.pickle", 'rb') as handle:
self.word2id, _, self.char2id, _, _, _ = pickle.load(handle)
self.wikiid2nnid = load_wikiid2nnid(extension_name=args.entity_extension)
self.nnid2wikiid = reverse_dict(self.wikiid2nnid, unique_values=True)
_, self.wiki_id_name_map = load_wiki_name_id_map()
with open(args.experiment_folder+"prepro_args.pickle", 'rb') as handle:
self.prepro_args = pickle.load(handle)
if args.lowercase_spans_pem:
self.prepro_args.lowercase_p_e_m = True
self.prepro_args.lowercase_spans = True
print("prepro_args:", self.prepro_args)
self.fetchCandidateEntities = FetchCandidateEntities(self.prepro_args)
self.special_tokenized_words = {"``", '"', "''"}
self.special_words_assertion_errors = 0
self.gm_idx_errors = 0
if self.args.el_with_stanfordner_and_our_ed:
from nltk.tag import StanfordNERTagger
self.st = StanfordNERTagger(
'/home/programs/stanford-ner-2018-02-27/classifiers/english.all.3class.distsim.crf.ser.gz',
'/home/programs/stanford-ner-2018-02-27/stanford-ner.jar', encoding='utf-8')
if self.args.persons_coreference:
self.persons_wikiids = load_persons()
self.persons_mentions_seen = set()
def process(self, text, given_spans):
self.persons_mentions_seen = set()
original_words = word_tokenize(text)
#print("tokenized chunk_words =", chunk_words)
words2charidx = []
idx = 0
self.given_spans = given_spans
#chunk_words = parsing_errors(chunk_words)
# do not use this; not useful for ed either. el has slightly better results without using this:
# aida_test 0.8126 without, 0.8114 with.
if not self.args.el_mode:
startidx2wordnum = dict()
endidx2wordnum = dict()
chunk_words = [] # correct the special words
for word_num, word in enumerate(original_words):
original_word = word
if word in self.special_tokenized_words:
smallest_idx = len(text)
for special_word in self.special_tokenized_words:
start = text.find(special_word, idx)
if start != -1 and start < smallest_idx:
word = special_word
smallest_idx = start
if word != '"':
pass
#print("special word replacement: ", original_words[max(0, word_num-2):word_num+2], "new word:", word)
start = text.find(word, idx)
if start == -1 or start > idx + 10:
print("Assertion Error! in words2charidx. word={}, original_word={}".format(word, original_word),
"near_text={}\nsnippet={}".format(text[idx:idx+20], text[idx-50:idx+50]))
self.special_words_assertion_errors += 1
for special_word in self.special_tokenized_words:
start = text.find(special_word, idx)
print("idx=", idx, "special_word =", special_word, "start=", start)
else:
chunk_words.append(word)
end = start + len(word)
idx = end
assert(len(words2charidx) == word_num)
words2charidx.append((start, end)) # [..)
if not self.args.el_mode:
startidx2wordnum[start] = word_num
endidx2wordnum[end] = word_num
if self.args.el_with_stanfordner_and_our_ed: # el test, use stanford ner to extract spans and decide with our ed system
self.given_spans, myspans = self.stanford_ner_spans(chunk_words, words2charidx)
# from given given_spans (start, length) in characters convert them to given_spans in word num
elif not self.args.el_mode: # simple ed mode. use the spans provided
self.given_spans = sorted(self.given_spans)
myspans = []
for span in self.given_spans:
try:
start, length = span
end = start+length
if start not in startidx2wordnum:
start = self.nearest_idx(start, startidx2wordnum.keys())
if end not in endidx2wordnum:
end = self.nearest_idx(end, endidx2wordnum.keys())
if (start, end-start) != span:
print("given span:", text[span[0]:span[0]+span[1]], " new span:",
text[start:end])
myspans.append((startidx2wordnum[start], endidx2wordnum[end]+1))
except KeyError:
print("Exception KeyError!!!!")
print("original_words =", original_words)
print("chunk_words =", chunk_words)
print("start={}, length={}, left={}, span={}, right={}".format(start, length,
text[start-30:start], text[start:start+length], text[start+length:start+length+30]))
print("text =", text)
print("start= {}".format("in" if start in startidx2wordnum else "out"))
print("end= {}".format("in" if start + length in endidx2wordnum else "out"))
else: # simple el mode
# consider all possible given_spans
myspans = SamplesGenerator.all_spans(chunk_words)
# at this point, whether we do ed or el with stanford ner plus our ed, we must have myspans as [word_num_begin, word_num_end)
# and self.given_spans, which are the same spans but in characters (begin_char, length)
begin_spans, end_spans, cand_entities, cand_entities_scores = [], [], [], []
for left, right in myspans:
span_text = ' '.join(chunk_words[left:right])
cand_ent, scores = self.fetchCandidateEntities.process(span_text)
if self.args.persons_coreference:
coreference_supermention = self.find_corefence_person(span_text)
if coreference_supermention:
print("original text:", chunk_words[max(0, left-4):max(len(chunk_words), right+4)])
if not self.args.persons_coreference_merge:
cand_ent, scores = self.fetchCandidateEntities.process(coreference_supermention)
else: # merge with cand_ent and scores
cand_ent2, scores2 = self.fetchCandidateEntities.process(coreference_supermention)
temp1 = list(zip(scores, cand_ent))
temp2 = list(zip(scores2, cand_ent2))
temp3 = sorted(temp1 + temp2, reverse=True)
scores, cand_ent = map(list, zip(*temp3[:self.prepro_args.cand_ent_num]))
# ['Obama_e', 'ent2', 'ent3'] , [0.9, 0.2, 0.8]
# filter out entities that are not in our universe (and their corresponding scores)
# then encode them with wikiid2nnid
# similar to prepro_util._encode_cand_entities_and_labels
cand_ent_filtered, scores_filtered = [], []
if cand_ent is not None and scores is not None:
if self.args.persons_coreference and not coreference_supermention and \
cand_ent[0] in self.persons_wikiids and len(span_text) >= 3:
self.persons_mentions_seen.add(span_text)
for e, s in zip(cand_ent, scores):
if e in self.wikiid2nnid:
cand_ent_filtered.append(self.wikiid2nnid[e])
scores_filtered.append(s)
if cand_ent_filtered:
begin_spans.append(left)
end_spans.append(right)
cand_entities.append(cand_ent_filtered)
cand_entities_scores.append(scores_filtered)
if begin_spans == []:
return [] # this document has no annotation
words = []
chars = []
for word in chunk_words:
words.append(self.word2id[word] if word in self.word2id
else self.word2id["<wunk>"])
chars.append([self.char2id[c] if c in self.char2id else self.char2id["<u>"]
for c in word])
chars_len = [len(word) for word in chars]
new_sample = (words, len(words), list_of_lists_to_2darray(chars), chars_len,
begin_spans, end_spans, len(begin_spans),
list_of_lists_to_2darray(cand_entities),
list_of_lists_to_2darray(cand_entities_scores),
[len(t) for t in cand_entities])
self.streaming_samples.new_sample(new_sample)
result_l = self.model.sess.run([self.model.final_scores, self.model.cand_entities_len,
self.model.cand_entities, self.model.begin_span, self.model.end_span,
self.model.spans_len], feed_dict={self.model.dropout: 1})
filtered_spans, _ = _filtered_spans_and_gm_gt_list(0, *result_l, None, None, None, [0], [len(words)])
# based on final_scores and thr return annotations. also translate my given_spans to char given_spans
print("self.special_words_assertion_errors =", self.special_words_assertion_errors)
print("gm_idx_errors =", self.gm_idx_errors)
if self.args.each_entity_only_once or self.args.each_mention_only_once or \
self.args.omit_first_sentence:
return self.custom_response(filtered_spans, text, words2charidx, chunk_words)
response = []
for span in filtered_spans:
score, begin_idx, end_idx, nnid = span
if score >= self.thr:
self._add_response_span(response, span, words2charidx)
print("self.persons_mentions_seen =", self.persons_mentions_seen)
return response
def find_corefence_person(self, span_text):
"""if span_text is substring of another person mention found before. it should be
substring of words. so check next character and previous character to be non alphanumeric"""
if len(span_text) < 3:
return None
for mention in self.persons_mentions_seen:
idx = mention.find(span_text)
if idx != -1:
#print("find_coreference_person substring initial match")
if len(mention) == len(span_text):
continue # they are identical so no point in substituting them
if idx > 0 and mention[idx-1].isalpha():
continue
if idx < len(mention) - 1 and mention[idx+1].isalpha():
continue
print("persons coreference, before:", span_text, "after:", mention)
return mention
return None
def nearest_idx(self, key, values):
self.gm_idx_errors += 1
# find the value in values that is nearest to key
nearest_value = None
min_distance = 1e+6
for value in values:
if abs(key - value) < min_distance:
nearest_value = value
min_distance = abs(key-value)
return nearest_value
def _add_response_span(self, response, span, words2charidx):
score, begin_idx, end_idx, nnid = span
start = words2charidx[begin_idx][0] # the word begin_idx starts at this character
end = words2charidx[end_idx-1][1] # the word end_idx-1 ends at this character
wikiid = self.nnid2wikiid[nnid]
wikiname = self.wiki_id_name_map[wikiid].replace(' ', '_')
if not self.args.el_mode: # try to match it with a given span
start, end = self.nearest_given_span(start, end)
response.append((start, end-start, wikiname))
def nearest_given_span(self, begin_idx, end_idx): # [begin_idx, end_idx) end_idx points to the next character after mention
min_distance = 1e+6
nearest_idxes = (-1, -1)
for (start, length) in self.given_spans:
distance = abs(begin_idx - start) + abs(end_idx - (start + length))
if distance < min_distance:
nearest_idxes = (start, start + length)
min_distance = distance
return nearest_idxes
def custom_response(self, filtered_spans, text, words2charidx, chunk_words):
from operator import itemgetter
filtered_spans = sorted(filtered_spans, key=itemgetter(1))
response = []
# omit title
if self.args.omit_first_sentence:
start = text.find(self.args.first_sentence_separator)
if start > 100: # this dataset doesn't have a title so do not omit anything
start = 0
print("omit first sentence:", text[:start])
for i, span in enumerate(filtered_spans):
if words2charidx[span[1]][0] > start:
break
print("omitted annotations:", [chunk_words[span[1]:span[2]] for span in filtered_spans[:i] if span[0] > self.thr])
filtered_spans = filtered_spans[i:]
# each entity only once
if self.args.each_entity_only_once:
used_entities = set()
for span in filtered_spans:
score, begin_idx, end_idx, nnid = span
if score >= self.thr and nnid not in used_entities:
self._add_response_span(response, span, words2charidx)
used_entities.add(nnid)
elif self.args.each_mention_only_once:
used_mentions = set()
for span in filtered_spans:
score, begin_idx, end_idx, nnid = span
mention = text[words2charidx[begin_idx][0]:words2charidx[end_idx-1][1]]
if score >= self.thr and mention not in used_mentions:
self._add_response_span(response, span, words2charidx)
used_mentions.add(mention)
return response
def stanford_ner_spans(self, words_l, words2charidx):
"""returns a list of tuples (start_idx, length)"""
tags = self.st.tag(words_l)
begin_spans, end_spans, prev_tag = [], [], 'O'
for i, (_, tag) in enumerate(tags):
if tag == 'O' and prev_tag != 'O':
end_spans.append(i)
elif tag == 'O' and prev_tag == 'O':
pass
elif tag != 'O' and prev_tag == 'O':
begin_spans.append(i)
elif tag != 'O' and prev_tag == tag:
pass
elif tag != 'O' and prev_tag != tag: # and prev_tag != 'O'
end_spans.append(i)
begin_spans.append(i)
prev_tag = tag
char_spans = [] # (begin_char, length)
word_spans = [] # [begin_word, end_word)
for bw, ew in zip(begin_spans, end_spans):
word_spans.append((bw, ew))
bc = words2charidx[bw][0]
ec = words2charidx[ew-1][1]
char_spans.append((bc, ec - bc))
return char_spans, word_spans
""" not used
def parsing_errors(chunk_words):
# check each chunk_word if it is alpha. if not then try to split it
# 'U.S' '.' merge them to U.S.
# alpha non-alpha-character alpha: do not split them
temp = []
i = 0
while i < len(chunk_words):
word = chunk_words[i]
if word == 'U.S.':
temp.append(word)
elif word == 'U.S' and chunk_words[i+1] == '.':
temp.append('U.S.')
i += 1
elif word.isalpha():
temp.append(word)
else:
temp_idx = 0
for c in word:
if not c.isalpha():
break
temp_idx += 1
if 0 < temp_idx < len(word) and not word[temp_idx+1:].isalpha():
temp.append(word[:temp_idx])
temp.append(word[temp_idx:])
else:
temp.append(word)
i += 1
return temp
"""
def list_of_lists_to_2darray(a):
# with padding zeros
b = np.zeros([len(a), len(max(a, key=lambda x: len(x)))])
for i, j in enumerate(a):
b[i][0:len(j)] = j
return b
def retrieve_optimal_threshold_from_logfile(model_folder, checkpoint_path, el_mode):
eval_cnt = checkpoint_path[checkpoint_path.rfind("-")+1:] # fixed_no_wikidump_entvecsl2/checkpoints/model-7
print("eval_cnt from checkpoint_path =", eval_cnt)
with open(model_folder+"log.txt", "r") as fin:
line = next(fin).strip()
while line != "args.eval_cnt = " + eval_cnt:
line = next(fin).strip()
line = next(fin).strip()
while line != "Evaluating {} datasets".format("EL" if el_mode else "ED"):
line = next(fin).strip()
line = next(fin).strip() # Best validation threshold = -0.112 with F1=91.8
line = line.split()
assert line[3] == "=" and line[5] == "with", line
return float(line[4])
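# Illustrative log excerpt (hypothetical values): retrieve_optimal_threshold_from_logfile expects
# log.txt to contain something like
#   args.eval_cnt = 7
#   ...
#   Evaluating EL datasets
#   Best validation threshold = -0.112 with F1=91.8
# in which case it returns -0.112.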
if __name__ == "__main__":
debug = True
================================================
FILE: code/gerbil/server.py
================================================
from http.server import BaseHTTPRequestHandler, HTTPServer
import json
import argparse
import model.config as config
from gerbil.nn_processing import NNProcessing
from model.util import load_train_args
from gerbil.build_entity_universe import BuildEntityUniverse
class GetHandler(BaseHTTPRequestHandler):
def do_POST(self):
content_length = int(self.headers['Content-Length'])
post_data = self.rfile.read(content_length)
self.send_response(200)
self.end_headers()
if args.build_entity_universe:
buildEntityUniverse.process(*read_json(post_data))
response = []
else:
response = nnprocessing.process(*read_json(post_data))
print("response in server.py code:\n", response)
self.wfile.write(bytes(json.dumps(response), "utf-8"))
return
def read_json(post_data):
data = json.loads(post_data.decode("utf-8"))
#print("received data:", data)
text = data["text"]
spans = [(int(j["start"]), int(j["length"])) for j in data["spans"]]
return text, spans
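# Illustrative example (hypothetical payload): the client is expected to POST JSON of the form
# {"text": "Obama visited Paris.", "spans": [{"start": 0, "length": 5}]}
# which read_json turns into ("Obama visited Paris.", [(0, 5)]).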
def _parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--experiment_name", default="per_document_no_wikidump",
help="under folder data/tfrecords/")
parser.add_argument("--training_name", default="doc_fixed_nowiki_evecsl2dropout")
parser.add_argument("--all_spans_training", type=bool, default=False)
parser.add_argument("--el_mode", dest='el_mode', action='store_true')
parser.add_argument("--ed_mode", dest='el_mode', action='store_false')
parser.set_defaults(el_mode=True)
parser.add_argument("--running_mode", default=None, help="el_mode or ed_mode, so"
"we can restore an ed_mode model and run it for el")
parser.add_argument("--lowercase_spans_pem", type=bool, default=False)
parser.add_argument("--entity_extension", default=None, help="extension_entities or extension_entities_all etc")
# those are for building the entity set
parser.add_argument("--build_entity_universe", type=bool, default=False)
parser.add_argument("--hardcoded_thr", type=float, default=None, help="0, 0.2")
parser.add_argument("--el_with_stanfordner_and_our_ed", type=bool, default=False)
parser.add_argument("--persons_coreference", type=bool, default=False)
parser.add_argument("--persons_coreference_merge", type=bool, default=False)
args = parser.parse_args()
if args.persons_coreference_merge:
args.persons_coreference = True
print(args)
if args.build_entity_universe:
return args, None
temp = "all_spans_" if args.all_spans_training else ""
args.experiment_folder = config.base_folder+"data/tfrecords/" + args.experiment_name+"/"
args.output_folder = config.base_folder+"data/tfrecords/" + \
args.experiment_name+"/{}training_folder/".format(temp) + \
args.training_name+"/"
train_args = load_train_args(args.output_folder, "gerbil")
train_args.entity_extension = args.entity_extension
print(train_args)
return args, train_args
def terminate():
tee.close()
if args.build_entity_universe:
buildEntityUniverse.flush_entity_universe()
else:
print("from_myspans_to_given_spans_map_errors:", nnprocessing.from_myspans_to_given_spans_map_errors)
if __name__ == "__main__":
args, train_args = _parse_args()
if args.build_entity_universe:
buildEntityUniverse = BuildEntityUniverse()
else:
nnprocessing = NNProcessing(train_args, args)
server = HTTPServer(('localhost', 5555), GetHandler)
print('Starting server at http://localhost:5555')
from model.util import Tee
tee = Tee('server.txt', 'w')
try:
server.serve_forever()
except KeyboardInterrupt:
terminate()
exit(0)
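# Usage sketch (assumed invocation, not taken from the repository): once the server is running,
# it can be exercised with e.g.
#   curl -X POST http://localhost:5555 -d '{"text": "Obama visited Paris.", "spans": []}'
# and the body of the reply is the JSON-serialized response built by NNProcessing.process.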
================================================
FILE: code/model/base_model.py
================================================
import os
import tensorflow as tf
class BaseModel(object):
def __init__(self, args):
self.args = args
self.sess = None
self.ed_saver = None
self.el_saver = None
def reinitialize_weights(self, scope_name):
"""Reinitializes the weights of a given layer"""
variables = tf.contrib.framework.get_variables(scope_name)
init = tf.variables_initializer(variables)
self.sess.run(init)
def add_train_op(self, lr_method, lr, loss, clip=-1):
"""Defines self.train_op that performs an update on a batch
Args:
lr_method: (string) optimization method, for example "adam"
lr: (tf.placeholder) tf.float32, learning rate
loss: (tensor) tf.float32 loss to minimize
clip: (python float) clipping of gradient. If < 0, no clipping
"""
_lr_m = lr_method.lower() # lower to make sure
with tf.variable_scope("train_step"):
if _lr_m == 'adam': # sgd method
optimizer = tf.train.AdamOptimizer(lr)
elif _lr_m == 'adagrad':
optimizer = tf.train.AdagradOptimizer(lr)
elif _lr_m == 'sgd':
optimizer = tf.train.GradientDescentOptimizer(lr)
elif _lr_m == 'rmsprop':
optimizer = tf.train.RMSPropOptimizer(lr)
else:
raise NotImplementedError("Unknown method {}".format(_lr_m))
if clip > 0: # gradient clipping if clip is positive
grads, vs = zip(*optimizer.compute_gradients(loss))
grads, gnorm = tf.clip_by_global_norm(grads, clip)
self.train_op = optimizer.apply_gradients(zip(grads, vs))
else:
self.train_op = optimizer.minimize(loss)
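# Usage sketch: subclasses call add_train_op from their build() method as
# self.add_train_op(self.args.lr_method, self.lr, self.loss, self.args.clip),
# e.g. with lr_method="adam"; a clip value <= 0 (the default -1) disables gradient clipping.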
def initialize_session(self):
"""Defines self.sess and initialize the variables"""
print("Initializing tf session")
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True
#self.sess = tf.Session(config=config)
self.sess = tf.Session()
#from tensorflow.python import debug as tf_debug
#self.sess = tf_debug.LocalCLIDebugWrapperSession(tf.Session())
self.sess.run(tf.global_variables_initializer())
self.ed_saver = tf.train.Saver(var_list=self.checkpoint_variables(), max_to_keep=self.args.checkpoints_num)
self.el_saver = tf.train.Saver(var_list=self.checkpoint_variables(), max_to_keep=self.args.checkpoints_num)
def restore_session(self, option="latest"):
"""option: 'latest', 'ed', 'el' so it chooses the latest checkpoint for ed for el or from
both of them if it is 'latest' (this is used in continue_training'"""
"""restores from the latest checkpoint of this folder"""
assert(option in ["latest", "ed", "el"])
if hasattr(self.args, 'checkpoint_model_num') and self.args.checkpoint_model_num is not None:
assert(option != "latest") # it is either ed or el
checkpoint_path = self.args.checkpoints_folder + option + "/model-{}".format(self.args.checkpoint_model_num)
else:
if option == "ed":
checkpoint_path = self.my_latest_checkpoint(self.args.checkpoints_folder+"ed/")
elif option == "el":
checkpoint_path = self.my_latest_checkpoint(self.args.checkpoints_folder+"el/")
elif option == "latest":
print("Reloading the latest trained model...(either ed or el)")
ed = self.my_latest_checkpoint(self.args.checkpoints_folder+"ed/")
el = self.my_latest_checkpoint(self.args.checkpoints_folder+"el/")
ed_eval_cnt = int(ed[ed.rfind('-') + 1:])
el_eval_cnt = int(el[el.rfind('-') + 1:])
if ed_eval_cnt >= el_eval_cnt:
checkpoint_path = self.my_latest_checkpoint(self.args.checkpoints_folder+"ed/")
option = "ed"
else:
checkpoint_path = self.my_latest_checkpoint(self.args.checkpoints_folder+"el/")
option = "el"
print("Using checkpoint: {}".format(checkpoint_path))
self.sess = tf.Session()
self.ed_saver = tf.train.Saver(var_list=self.checkpoint_variables(), max_to_keep=self.args.checkpoints_num)
self.el_saver = tf.train.Saver(var_list=self.checkpoint_variables(), max_to_keep=self.args.checkpoints_num)
saver = self.ed_saver if option == "ed" else self.el_saver
saver.restore(self.sess, checkpoint_path)
self.init_embeddings()
print("Finished loading checkpoint.")
return checkpoint_path
def my_latest_checkpoint(self, folder_path): # model-9.meta
files = [name for name in os.listdir(folder_path) if name.startswith("model") and name.endswith("meta")]
max_epoch = max([int(name[len("model-"):-len(".meta")]) for name in files])
return folder_path + "model-" + str(max_epoch)
def save_session(self, eval_cnt, save_ed_flag, save_el_flag):
"""Saves session = weights"""
for save_flag, category in zip([save_ed_flag, save_el_flag], ["ed", "el"]):
if save_flag is False:
continue
checkpoints_folder = self.args.checkpoints_folder + category + "/"
if not os.path.exists(checkpoints_folder):
os.makedirs(checkpoints_folder)
print("saving session checkpoint for {}...".format(category))
checkpoint_prefix = os.path.join(checkpoints_folder, "model")
saver = self.ed_saver if category == "ed" else self.el_saver
save_path = saver.save(self.sess, checkpoint_prefix, global_step=eval_cnt)
print("Checkpoint saved in file: %s" % save_path)
def close_session(self):
"""Closes the session"""
self.sess.close()
def _restore_list(self):
return [n for n in tf.global_variables()
if n.name != 'entities/_entity_embeddings:0']
def checkpoint_variables(self):
"""omit word embeddings and entity embeddings from being stored in checkpoint when they are fixed
in order to save disk space. word emb are always fixed, entity emb are fixed when
args.train_ent_vecs == False"""
omit_variables = ['words/_word_embeddings:0']
if not self.args.train_ent_vecs:
omit_variables.append('entities/_entity_embeddings:0')
variables = [n for n in tf.global_variables() if n.name not in omit_variables]
print("checkpoint variables to restore:", variables)
return variables
def find_variable_handler_by_name(self, var_name):
for n in tf.global_variables():
if n.name == var_name:
return n
================================================
FILE: code/model/config.py
================================================
base_folder = "../"
spans_separators = ["."] #maybe also try ['.', ',', ';']
unk_ent_id = "0"
================================================
FILE: code/model/ed_model_original.py
================================================
import numpy as np
import pickle
import tensorflow as tf
import model.config as config
from .base_model import BaseModel
import model.util as util
class EDModel(BaseModel):
def __init__(self, args, next_element):
super().__init__(args)
self.chunk_id, self.words, self.words_len, self.chars, self.chars_len,\
self.begin_span, self.end_span, self.spans_len,\
self.cand_entities, self.cand_entities_scores, self.cand_entities_labels,\
self.cand_entities_len, self.ground_truth, self.ground_truth_len,\
self.begin_gm, self.end_gm = next_element
self.begin_span = tf.cast(self.begin_span, tf.int32)
self.end_span = tf.cast(self.end_span, tf.int32)
self.words_len = tf.cast(self.words_len, tf.int32) # TODO new command caution if it is breaking old models
with open(config.base_folder +"data/tfrecords/" + self.args.experiment_name+
"/word_char_maps.pickle", 'rb') as handle:
_, id2word, _, id2char, _, _ = pickle.load(handle)
self.nwords = len(id2word)
self.nchars = len(id2char)
if self.args.cand_ent_num_restriction:
self.cand_entities = tf.slice(self.cand_entities, [0, 0, 0], [-1, -1, self.args.cand_ent_num_restriction])
self.cand_entities_scores = tf.slice(self.cand_entities_scores, [0, 0, 0], [-1, -1, self.args.cand_ent_num_restriction])
self.cand_entities_labels = tf.slice(self.cand_entities_labels, [0, 0, 0], [-1, -1, self.args.cand_ent_num_restriction])
self.cand_entities_len = tf.minimum(self.cand_entities_len, self.args.cand_ent_num_restriction)
self.ffnn_l2normalization_op_list = []
if not tf.__version__.startswith("1.4"):
temp_shape = tf.shape(self.cand_entities_len)
temp = tf.sequence_mask(tf.reshape(self.cand_entities_len, [-1]), tf.shape(self.cand_entities_scores)[2], dtype=tf.float32)
self.loss_mask = tf.reshape(temp, [temp_shape[0], temp_shape[1], tf.shape(temp)[-1]])
else:
self.loss_mask = tf.sequence_mask(self.cand_entities_len, tf.shape(self.cand_entities_scores)[2], dtype=tf.float32) # 30 candidates
def add_placeholders(self):
"""Define placeholders = entries to computational graph"""
"""
# shape = (batch size, max length of sentence in batch)
self.words = tf.placeholder(tf.int64, shape=[None, None],
name="words")
# shape = (batch size)
self.words_len = tf.placeholder(tf.int64, shape=[None],
name="words_len")
# shape = (batch size, max length of sentence, max length of word)
self.chars = tf.placeholder(tf.int64, shape=[None, None, None],
name="chars")
# shape = (batch_size, max_length of sentence)
self.chars_len = tf.placeholder(tf.int64, shape=[None, None],
name="chars_len")
# shape = (batch_size, max number of candidate spans in one of the batch sentences)
self.begin_span = tf.placeholder(tf.int64, shape=[None, None],
name="begin_span")
self.end_span = tf.placeholder(tf.int64, shape=[None, None],
name="end_span")
# shape = (batch size)
self.spans_len = tf.placeholder(tf.int64, shape=[None],
name="spans_len")
# shape = (batch size, max number of candidate spans, max number of cand entities)
self.cand_entities = tf.placeholder(tf.int64, shape=[None, None, None],
name="cand_entities")
self.cand_entities_scores = tf.placeholder(tf.float32, shape=[None, None, None],
name="cand_entities_scores")
self.cand_entities_labels = tf.placeholder(tf.int64, shape=[None, None, None],
name="cand_entities_labels")
# shape = (batch_size, max number of candidate spans)
self.cand_entities_len = tf.placeholder(tf.int64, shape=[None, None],
name="cand_entities_len")
# shape = (batch_size, max number of candidate spans)
self.ground_truth = tf.placeholder(tf.int64, shape=[None, None],
name="ground_truth")
# shape = (batch_size)
self.ground_truth_len = tf.placeholder(tf.int64, shape=[None],
name="ground_truth_len")
# the next two placeholders are not useful. TODO remove them
# shape = (batch_size, max number of gold mentions)
self.begin_gm = tf.placeholder(tf.int64, shape=[None, None],
name="begin_gm")
self.end_gm = tf.placeholder(tf.int64, shape=[None, None],
name="end_gm")
"""
# hyper parameters
self.dropout = tf.placeholder(dtype=tf.float32, shape=[], name="dropout")
self.lr = tf.placeholder(dtype=tf.float32, shape=[], name="lr")
def init_embeddings(self):
print("\n!!!! init embeddings !!!!\n")
# read the numpy file
embeddings_nparray = np.load(config.base_folder +"data/tfrecords/" + self.args.experiment_name+
"/embeddings_array.npy")
self.sess.run(self.word_embedding_init, feed_dict={self.word_embeddings_placeholder: embeddings_nparray})
entity_embeddings_nparray = util.load_ent_vecs(self.args)
self.sess.run(self.entity_embedding_init, feed_dict={self.entity_embeddings_placeholder: entity_embeddings_nparray})
def add_embeddings_op(self):
"""Defines self.word_embeddings"""
#with tf.device('/cpu:0'), tf.name_scope("embedding"):
with tf.variable_scope("words"):
_word_embeddings = tf.Variable(
tf.constant(0.0, shape=[self.nwords, 300]),
name="_word_embeddings",
dtype=tf.float32,
trainable=False)
self.word_embeddings_placeholder = tf.placeholder(tf.float32, [self.nwords, 300])
self.word_embedding_init = _word_embeddings.assign(self.word_embeddings_placeholder)
word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
self.words, name="word_embeddings")
self.pure_word_embeddings = word_embeddings
#print("word_embeddings (after lookup) ", word_embeddings)
with tf.variable_scope("chars"):
if self.args.use_chars:
# get char embeddings matrix
_char_embeddings = tf.get_variable(
name="_char_embeddings",
dtype=tf.float32,
shape=[self.nchars, self.args.dim_char], trainable=True)
char_embeddings = tf.nn.embedding_lookup(_char_embeddings,
self.chars, name="char_embeddings")
# put the time dimension on axis=1
s = tf.shape(char_embeddings)
char_embeddings = tf.reshape(char_embeddings,
shape=[s[0] * s[1], s[-2], self.args.dim_char])
char_lengths = tf.reshape(self.chars_len, shape=[s[0] * s[1]])
# bi lstm on chars
cell_fw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_char,
state_is_tuple=True)
cell_bw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_char,
state_is_tuple=True)
_output = tf.nn.bidirectional_dynamic_rnn(
cell_fw, cell_bw, char_embeddings,
sequence_length=char_lengths, dtype=tf.float32)
# read and concat output
_, ((_, output_fw), (_, output_bw)) = _output
output = tf.concat([output_fw, output_bw], axis=-1)
# shape = (batch size, max sentence length, char hidden size)
output = tf.reshape(output,
shape=[s[0], s[1], 2 * self.args.hidden_size_char])
#print("output after char lstm ", output)
word_embeddings = tf.concat([word_embeddings, output], axis=-1)
#print("word_embeddings with char after concatenation ", word_embeddings)
# (batch, words, 300+2*100)
self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)
with tf.variable_scope("entities"):
from preprocessing.util import load_wikiid2nnid
self.nentities = len(load_wikiid2nnid(extension_name=self.args.entity_extension))
_entity_embeddings = tf.Variable(
tf.constant(0.0, shape=[self.nentities, 300]),
name="_entity_embeddings",
dtype=tf.float32,
trainable=self.args.train_ent_vecs)
self.entity_embeddings_placeholder = tf.placeholder(tf.float32, [self.nentities, 300])
self.entity_embedding_init = _entity_embeddings.assign(self.entity_embeddings_placeholder)
self.entity_embeddings = tf.nn.embedding_lookup(_entity_embeddings, self.cand_entities,
name="entity_embeddings")
self.pure_entity_embeddings = self.entity_embeddings
if self.args.ent_vecs_regularization.startswith("l2"): # 'l2' or 'l2dropout'
self.entity_embeddings = tf.nn.l2_normalize(self.entity_embeddings, dim=3)
if self.args.ent_vecs_regularization == "dropout" or \
self.args.ent_vecs_regularization == "l2dropout":
self.entity_embeddings = tf.nn.dropout(self.entity_embeddings, self.dropout)
#print("entity_embeddings = ", self.entity_embeddings)
def add_context_emb_op(self):
with tf.variable_scope("context-bi-lstm"):
cell_fw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm)
cell_bw = tf.contrib.rnn.LSTMCell(self.args.hidden_size_lstm)
(output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
cell_fw, cell_bw, self.word_embeddings,
sequence_length=self.words_len, dtype=tf.float32)
output = tf.concat([output_fw, output_bw], axis=-1)
self.context_emb = tf.nn.dropout(output, self.dropout)
#print("context_emb = ", self.context_emb) # [batch, num_mentions, 600]
def add_span_emb_op(self):
mention_emb_list = []
# span embedding based on boundaries (start, end) and a head-attention mechanism. This can be computed on top of the
# contextual bilstm output or on top of the original word+char embeddings; the flag below determines which.
# By default the head attention is on top of the word+char embeddings instead of the bilstm output.
boundaries_input_vecs = self.word_embeddings if self.args.span_boundaries_from_wordemb else self.context_emb
if self.args.span_emb.find("boundaries") != -1:
mention_start_emb = tf.gather_nd(boundaries_input_vecs, tf.stack(
[tf.tile(tf.expand_dims(tf.range(tf.shape(self.begin_span)[0]), 1), [1, tf.shape(self.begin_span)[1]]),
self.begin_span], 2))
#mention_start_emb = tf.gather(text_outputs, mention_starts) # [num_mentions, emb]
mention_emb_list.append(mention_start_emb)
mention_end_emb = tf.gather_nd(boundaries_input_vecs, tf.stack(
[tf.tile(tf.expand_dims(tf.range(tf.shape(self.begin_span)[0]), 1), [1, tf.shape(self.begin_span)[1]]),
tf.nn.relu(self.end_span-1)], 2)) # -1 because the end of the span is exclusive [start, end)
# relu so that a 0 doesn't become -1
#mention_end_emb = tf.gather(text_outputs, mention_ends) # [num_mentions, emb]
mention_emb_list.append(mention_end_emb)
#print("mention_start_emb = ", mention_start_emb)
#print("mention_end_emb = ", mention_end_emb)
mention_width = self.end_span - self.begin_span # [batch, num_mentions]
# TODO remove the comment code below
"""
if self.args.use_features:
mention_width_index = mention_width - 1 # [num_mentions]
mention_width_emb = tf.gather(tf.get_variable("mention_width_embeddings", [self.args["max_mention_width"],
self.args["feature_size"]]),
mention_width_index) # [batch, num_mentions, emb]
mention_width_emb = tf.nn.dropout(mention_width_emb, self.dropout)
#print("mention_width_emb = ", mention_width_emb)
mention_emb_list.append(mention_width_emb)
"""
if self.args.span_emb.find("head") != -1: # here the attention is computed
self.max_mention_width = tf.minimum(self.args.max_mention_width,
tf.reduce_max(self.end_span - self.begin_span))
mention_indices = tf.range(self.max_mention_width) + \
tf.expand_dims(self.begin_span, 2) # [batch, num_mentions, max_mention_width]
mention_indices = tf.minimum(tf.shape(self.word_embeddings)[1] - 1,
mention_indices) # [batch, num_mentions, max_mention_width]
#print("mention_indices = ", mention_indices)
batch_index = tf.tile(tf.expand_dims(tf.expand_dims(tf.range(tf.shape(mention_indices)[0]), 1), 2),
[1, tf.shape(mention_indices)[1], tf.shape(mention_indices)[2]])
mention_indices = tf.stack([batch_index, mention_indices], 3)
# [batch, num_mentions, max_mention_width, [row,col] ] 4d tensor
# this means that head will be either the same as boundaries or boundaries from bilstm and head from wordemb
head_input_vecs = boundaries_input_vecs if self.args.model_heads_from_bilstm else self.word_embeddings
mention_text_emb = tf.gather_nd(head_input_vecs, mention_indices)
# [batch, num_mentions, max_mention_width, 500 ] 4d tensor
#print("mention_text_emb = ", mention_text_emb)
with tf.variable_scope("head_scores"):
# from [batch, max_sent_len, 600] to [batch, max_sent_len, 1]
self.head_scores = util.projection(boundaries_input_vecs, 1, model=self)
# [batch, num_mentions, max_mention_width, 1]
mention_head_scores = tf.gather_nd(self.head_scores, mention_indices)
#print("mention_head_scores = ", mention_head_scores)
if not tf.__version__.startswith("1.4"):
temp_shape = tf.shape(mention_width)
temp = tf.sequence_mask(tf.reshape(mention_width, [-1]), self.max_mention_width, dtype=tf.float32)
temp_mask = tf.reshape(temp, [temp_shape[0], temp_shape[1], tf.shape(temp)[-1]])
else:
temp_mask = tf.sequence_mask(mention_width, self.max_mention_width, dtype=tf.float32)
mention_mask = tf.expand_dims(temp_mask, 3) # [batch, num_mentions, max_mention_width, 1]
mention_mask = tf.minimum(1.0, tf.maximum(self.args.zero, mention_mask)) # 1e-3
mention_attention = tf.nn.softmax(mention_head_scores + tf.log(mention_mask),
dim=2) # [batch, num_mentions, max_mention_width, 1]
mention_head_emb = tf.reduce_sum(mention_attention * mention_text_emb, 2) # [batch, num_mentions, emb]
#print("mention_head_emb = ", mention_head_emb)
mention_emb_list.append(mention_head_emb)
self.span_emb = tf.concat(mention_emb_list, 2) # [batch, num_mentions, emb i.e. 1700]
#print("span_emb = ", self.span_emb)
def add_lstm_score_op(self):
#print("cand_entities = ", self.cand_entities)
with tf.variable_scope("span_emb_ffnn"):
# [batch, num_mentions, 300]
if self.args.span_emb_ffnn[0] == 0:
span_emb_projected = util.projection(self.span_emb, 300, model=self)
else:
hidden_layers, hidden_size = self.args.span_emb_ffnn[0], self.args.span_emb_ffnn[1]
span_emb_projected = util.ffnn(self.span_emb, hidden_layers, hidden_size, 300,
self.dropout if self.args.ffnn_dropout else None, model=self)
#print("span_emb_projected = ", span_emb_projected)
scores = tf.matmul(tf.expand_dims(span_emb_projected, 2), self.entity_embeddings, transpose_b=True)
#print("scores = ", scores)
self.similarity_scores = tf.squeeze(scores, axis=2) # [batch, num_mentions, 1, 30] -> [batch, num_mentions, 30]
#print("scores = ", self.similarity_scores) # [batch, num_mentions, 30]
def add_local_attention_op(self):
# shape=(b, num_of_spans, 30, 300)
attention_entity_emb = self.pure_entity_embeddings if self.args.attention_ent_vecs_no_regularization else self.entity_embeddings
with tf.variable_scope("attention"):
K = self.args.attention_K
left_mask = self._sequence_mask_v13(self.begin_span, K)
#left_mask = tf.sequence_mask(self.begin_span, K, dtype=tf.float32)
right_mask = self._sequence_mask_v13(tf.expand_dims(self.words_len, 1) - self.end_span, K)
#right_mask = tf.sequence_mask(tf.expand_dims(self.words_len, 1) - self.end_span, # number of words on the right
# K, dtype=tf.float32) # but maximum i get K not more
ctxt_mask = tf.concat([left_mask, right_mask], 2) # [batch, num_of_spans, 2*K]
ctxt_mask = tf.log(tf.minimum(1.0, tf.maximum(self.args.zero, ctxt_mask)))
# T, T, T, F, F | T, T, F, F, F
# -1, -2, -3, -4, -5 +0, +1, +2, +3, +4
leftctxt_indices = tf.maximum(0, tf.range(-1, -K - 1, -1) + \
tf.expand_dims(self.begin_span, 2)) # [batch, num_mentions, K]
rightctxt_indices = tf.minimum(tf.shape(self.pure_word_embeddings)[1] - 1, tf.range(K) + \
tf.expand_dims(self.end_span, 2)) # [batch, num_mentions, K]
ctxt_indices = tf.concat([leftctxt_indices, rightctxt_indices], 2) # [batch, num_mentions, 2*K]
batch_index = tf.tile(tf.expand_dims(tf.expand_dims(tf.range(tf.shape(ctxt_indices)[0]), 1), 2),
[1, tf.shape(ctxt_indices)[1], tf.shape(ctxt_indices)[2]])
ctxt_indices = tf.stack([batch_index, ctxt_indices], 3)
# [batch, num_of_spans, 2*K, 2] the last dimension is row,col for gather_nd
# [batch, num_of_spans, 2*K, [row,col]]
att_x_w = self.pure_word_embeddings # [batch, max_sent_len, 300]
if self.args.attention_on_lstm and self.args.nn_components.find("lstm") != -1:
# [batch, max_sent_len, 600] hidden_size_of_lstm*2 so project it to 300
# TODO maybe omit the projection if already in 300 dimensions? but the projection allows a transformation...
att_x_w = util.projection(self.context_emb, 300, model=self) # if tf.shape(self.context_emb)[-1] != 300 else self.context_emb
ctxt_word_emb = tf.gather_nd(att_x_w, ctxt_indices)
# [batch, num_of_spans, 2K, emb_size] emb_size = 300 only pure word emb used
# and not after we add char emb and dropout
x_c_voters = attention_entity_emb
# restrict the number of entities that participate in the forming of the x_c context vector
if self.args.attention_retricted_num_of_entities:
x_c_voters = tf.slice(attention_entity_emb, [0, 0, 0, 0],
[-1, -1, self.args.attention_retricted_num_of_entities, -1])
if self.args.attention_use_AB:
att_A = tf.get_variable("att_A", [300])
x_c_voters = att_A * x_c_voters
# [b, num_of_spans, 2*K, 300] mul [b, num_of_spans, 30, 300] instead of 30 it can be the reduced number of entities
scores = tf.matmul(ctxt_word_emb, x_c_voters, transpose_b=True) # [b, spans, 2K, 30]
scores = tf.reduce_max(scores, reduction_indices=[-1]) # max score of each word for each span acquired from any cand entity
scores = scores + ctxt_mask # some words are not valid out of window
# so we assign to them very low score
top_values, _ = tf.nn.top_k(scores, self.args.attention_R)
# [batch, num_of_spans, R]
#R_value = tf.reduce_min(top_values, axis=-1)
R_value = top_values[:, :, -1] # [batch, num_of_spans]
# same as above command but probably faster
R_value = tf.maximum(self.args.zero, R_value) # so to avoid keeping words that
# have max score with any of the entities <=0 (also score = 0 can have words with
# padding candidate entities)
threshold = tf.tile(tf.expand_dims(R_value, 2), [1, 1, 2 * K])
# [batch, num_of_spans, 2K]
scores = scores - tf.to_float(((scores - threshold) < 0)) * 50 # 50 where score<thr, 0 where score>=thr
scores = tf.nn.softmax(scores, dim=2) # [batch, num_of_spans, 2K]
scores = tf.expand_dims(scores, 3) # [batch, num_of_spans, 2K, 1]
# [batch, num_of_spans, 2K, 1] * [batch, num_of_spans, 2K, emb_size]
# = [batch, num_of_spans, 2K, emb_size]
x_c = tf.reduce_sum(scores * ctxt_word_emb, 2) # = [batch, num_of_spans, emb_size]
if self.args.attention_use_AB:
att_B = tf.get_variable("att_B", [300])
x_c = att_B * x_c
x_c = tf.expand_dims(x_c, 3) # [batch, num_of_spans, emb_size, 1]
# [batch, num_of_spans, 30, emb_size=300] mul with [batch, num_of_spans, emb_size, 1]
x_e__x_c = tf.matmul(attention_entity_emb, x_c) # [batch, num_of_spans, 30, 1]
x_e__x_c = tf.squeeze(x_e__x_c, axis=3) # [batch, num_of_spans, 30]
self.attention_scores = x_e__x_c
def custom_pem(self, log=True, buckets_boundaries=None, bucketing_name="pem_embeddings"):
if buckets_boundaries:
return self._pem_bucketing_embeddings(buckets_boundaries, bucketing_name)
elif self.args.pem_without_log:
return self.cand_entities_scores
else:
return tf.log(tf.minimum(1.0, tf.maximum(self.args.zero, self.cand_entities_scores)))
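# Illustrative example (hypothetical score): in the default branch above, a p(e|m) score of 0.25
# becomes log(0.25) ~= -1.386, while a score of 0 is first clamped to args.zero (e.g. 1e-3)
# so the log never produces -inf.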
def _pem_bucketing_embeddings(self, buckets_boundaries, bucketing_name):
from tensorflow.python.ops import math_ops
bucketized_pem = math_ops._bucketize(self.cand_entities_scores, boundaries=buckets_boundaries)
with tf.variable_scope(bucketing_name):
_pem_embeddings = tf.get_variable(
name="pem_embeddings_var",
dtype=tf.float32,
shape=[len(buckets_boundaries) + 1, 1], trainable=True)
pem_embeddings = tf.nn.embedding_lookup(_pem_embeddings, bucketized_pem, name="pem_embeddings_lookup")
# pem_embeddings = Tensor("pem_embeddings/pem_embeddings_lookup:0", shape=(?, ?, ?, 1), dtype=float32)
return tf.squeeze(pem_embeddings, axis=3)
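# Illustrative example (hypothetical boundaries): with buckets_boundaries=[0.01, 0.1, 0.5],
# math_ops._bucketize maps a p(e|m) score of 0.3 to bucket index 2 (len(boundaries)+1 buckets in total),
# and the embedding lookup above replaces each score with a learned scalar for its bucket.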
def add_cand_ent_scores_op(self):
"""
# TODO remove this code and this option (without p_e_m) just to experiment once...
if hasattr(self.args, 'no_p_e_m_usage') and self.args.no_p_e_m_usage:
self.final_scores = self.similarity_scores
return
"""
#now add the cand_entity_scores maybe also some extra features and through a simple ffnn
stack_values = []
if self.args.nn_components.find("lstm") != -1:
stack_values.append(self.similarity_scores)
if self.args.nn_components.find("pem") != -1:
# TODO rename to pem_scores
self.log_cand_entities_scores = self.custom_pem(
self.args.pem_without_log, self.args.pem_buckets_boundaries)
stack_values.append(self.log_cand_entities_scores)
if self.args.nn_components.find("attention") != -1:
stack_values.append(self.attention_scores)
"""
if len(stack_values) == 1:
# since only one scalar omit the final ffnn
self.final_scores = stack_values[0]
return
"""
scalar_predictors = tf.stack(stack_values, 3)
#print("scalar_predictors = ", scalar_predictors) #[batch, num_mentions, 30, 2]
with tf.variable_scope("similarity_and_prior_ffnn"):
# [batch, num_mentions, 30, 1] squeeze to [batch, num_mentions, 30]
if self.args.final_score_ffnn[0] == 0:
self.final_scores = util.projection(scalar_predictors, 1, model=self)
else:
hidden_layers, hidden_size = self.args.final_score_ffnn[0], self.args.final_score_ffnn[1]
self.final_scores = util.ffnn(scalar_predictors, hidden_layers, hidden_size, 1,
self.dropout if self.args.ffnn_dropout else None, model=self)
#self.final_scores = tf.squeeze(util.ffnn(scalar_predictors, 1, 100, 1, self.dropout), axis=3)
self.final_scores = tf.squeeze(self.final_scores, axis=3)
#print("final_scores = ", self.final_scores)
def add_global_voting_op(self):
with tf.variable_scope("global_voting"):
#self.final_scores_before_global = self.final_scores
# TODO important change
#self.final_scores_before_global = tf.where(tf.less(self.loss_mask, 1e-3), -3.0, self.final_scores)
self.final_scores_before_global = - (1 - self.loss_mask) * 50 + self.final_scores
if self.args.global_topkfromallspans:
batch_num = tf.shape(self.final_scores)[0]
spans_num = tf.shape(self.final_scores)[1] # num of spans
cand_ent_num = tf.shape(self.final_scores)[2] # 30
new_size = spans_num * cand_ent_num
temp = tf.diag(tf.ones([spans_num]))
temp = tf.tile(tf.expand_dims(temp, axis=2), [1, 1, cand_ent_num])
temp = tf.reshape(temp, [spans_num, new_size])
mask = tf.reshape(tf.tile(tf.expand_dims(temp, axis=1), [1, cand_ent_num, 1]), [new_size, new_size])
mask = 1 - mask
all_entities = tf.reshape(self.pure_entity_embeddings, [batch_num, new_size, 300])
all_scores = tf.matmul(all_entities, all_entities, transpose_b=True) # [batch, new_size, new_size]
filtered_scores = all_scores * mask
top_values, _ = tf.nn.top_k(filtered_scores, self.args.global_topkfromallspans)
# [batch, new_size, K]
if self.args.global_topkfromallspans_onlypositive:
top_values = tf.maximum(top_values, self.args.zero)
# so as to avoid keeping cand entities that have a score below this value even if they are among the top ones
self.global_voting_scores = tf.reduce_mean(top_values, axis=2) # [batch, new_size]
self.global_voting_scores = tf.reshape(self.global_voting_scores, [batch_num, spans_num, cand_ent_num])
else:
if self.args.global_gmask_unambigious:
gmask = self._sequence_mask_v13(tf.equal(self.cand_entities_len, 1), tf.shape(self.final_scores)[2])
elif not self.args.global_topk:
gmask = tf.to_float(((self.final_scores_before_global - self.args.global_thr) >= 0)) # [b,s,30]
else:
top_values, _ = tf.nn.top_k(self.final_scores_before_global, self.args.global_topk)
# [batch, num_of_spans, K]
K_value = top_values[:, :, -1] # [batch, num_of_spans]
#if hasattr(self.args, 'global_topkthr'):
if self.args.global_topkthr:
K_value = tf.maximum(self.args.global_topkthr, K_value)
# so as to avoid keeping cand entities that have a score below this value even if they are the
# top for this span
threshold = tf.tile(tf.expand_dims(K_value, 2), [1, 1, tf.shape(self.final_scores)[-1]])
# [batch, num_of_spans, 30]
gmask = tf.to_float(((self.final_scores_before_global - threshold) >= 0))
gmask = gmask * self.loss_mask
if self.args.global_mask_scale_each_mention_voters_to_one:
temp = tf.reduce_sum(gmask, axis=2, keep_dims=True) # [batch, num_of_spans, 1]
temp = tf.where(tf.less(temp, 1e-4), temp, 1. / (temp + 1e-4))
gmask = gmask * temp
elif self.args.global_gmask_based_on_localscore:
"""
temp_masked_local_scores = gmask * self.final_scores_before_global # [batch, num_of_spans, 30]
# per last dimension (30) add the smallest score if negative otherwise keep it like that to zero
# so we only have positive mask scores
offset = tf.reduce_min(temp_masked_local_scores, axis=2) # [batch, num_of_spans]
offset = tf.expand_dims(tf.minimum(0.0, offset), axis=2) # [batch, num_of_spans, 1]
gmask = (temp_masked_local_scores - offset) * gmask # [batch, num_of_spans, 30]
gmask = tf.maximum(0.0, gmask)
"""
gmask = gmask * tf.nn.softmax(self.final_scores_before_global)
self.gmask = gmask
masked_entity_emb = self.pure_entity_embeddings * tf.expand_dims(gmask, axis=3) # [b,s,30,300] * [b,s,30,1]
if self.args.new_all_voters_emb:
batch_size = tf.shape(masked_entity_emb)[0]
all_voters_emb = tf.reduce_sum(tf.reshape(masked_entity_emb, [batch_size, -1, 300]), axis=1,
keep_dims=True) # [b, 1, 300]
else:
all_voters_emb = tf.reduce_sum(tf.reshape(masked_entity_emb, [-1, 300]), axis=0) # [300]
span_voters_emb = tf.reduce_sum(masked_entity_emb, axis=2) # [batch, num_of_spans, 300]
valid_voters_emb = all_voters_emb - span_voters_emb
#[300] - [batch, spans, 300] = [batch, spans, 300] (broadcasting)
if self.args.global_norm_or_mean == "norm":
valid_voters_emb = tf.nn.l2_normalize(valid_voters_emb, dim=2)
else:
all_voters_num = tf.reduce_sum(gmask) # scalar
span_voters_num = tf.reduce_sum(gmask, axis=2) # [batch, spans]
valid_voters_emb = valid_voters_emb / tf.expand_dims(all_voters_num - span_voters_num, axis=2)
self.global_voting_scores = tf.squeeze(tf.matmul(self.pure_entity_embeddings, tf.expand_dims(valid_voters_emb, axis=3)), axis=3)
# [b,s,30,300] matmul [b,s,300,1] --> [b,s,30,1]-->[b,s,30]
# TODO here i check the self.args.stage2_nn_components
stack_values = []
if self.args.stage2_nn_components.find("pem") != -1:
# TODO rename to pem_scores
self.gpem_scores = self.custom_pem(
self.args.gpem_without_log, self.args.gpem_buckets_boundaries)
stack_values.append(self.gpem_scores)
if self.args.stage2_nn_components.find("local") != -1:
stack_values.append(self.final_scores_before_global)
stack_values.append(self.global_voting_scores)
scalar_predictors = tf.stack(stack_values, 3)
#print("scalar_predictors = ", scalar_predictors) #[b, s, 30, 2]
with tf.variable_scope("psi_and_global_ffnn"):
# [batch, num_mentions, 30, 1] squeeze to [batch, num_mentions, 30]
if self.args.global_score_ffnn[0] == 0:
self.final_scores = util.projection(scalar_predictors, 1, model=self)
else:
hidden_layers, hidden_size = self.args.global_score_ffnn[0], self.args.global_score_ffnn[1]
self.final_scores = util.ffnn(scalar_predictors, hidden_layers, hidden_size, 1,
self.dropout if self.args.ffnn_dropout else None, model=self)
self.final_scores = tf.squeeze(self.final_scores, axis=3)
#print("final_scores = ", self.final_scores)
def add_loss_op(self):
cand_entities_labels = tf.cast(self.cand_entities_labels, tf.float32)
loss1 = cand_entities_labels * tf.nn.relu(self.args.gamma_thr - self.final_scores)
loss2 = (1 - cand_entities_labels) * tf.nn.relu(self.final_scores)
self.loss = loss1 + loss2
if self.args.nn_components.find("global") != -1 and not self.args.global_one_loss:
loss3 = cand_entities_labels * tf.nn.relu(self.args.gamma_thr - self.final_scores_before_global)
loss4 = (1 - cand_entities_labels) * tf.nn.relu(self.final_scores_before_global)
self.loss = loss1 + loss2 + loss3 + loss4
#print("loss_mask = ", loss_mask)
self.loss = self.loss_mask * self.loss
self.loss = tf.reduce_sum(self.loss)
# for tensorboard
#tf.summary.scalar("loss", self.loss)
def build(self):
self.add_placeholders()
self.add_embeddings_op()
if self.args.nn_components.find("lstm") != -1:
self.add_context_emb_op()
self.add_span_emb_op()
self.add_lstm_score_op()
if self.args.nn_components.find("attention") != -1:
self.add_local_attention_op()
self.add_cand_ent_scores_op()
if self.args.nn_components.find("global") != -1:
self.add_global_voting_op()
if self.args.running_mode.startswith("train"):
self.add_loss_op()
# Generic functions that add training op
self.add_train_op(self.args.lr_method, self.lr, self.loss, self.args.clip)
self.merged_summary_op = tf.summary.merge_all()
if self.args.running_mode == "train_continue":
self.restore_session("latest")
elif self.args.running_mode == "train":
self.initialize_session() # now self.sess is defined and vars are init
self.init_embeddings()
# if we run the evaluate.py script then we should call explicitly the model.restore("ed")
# or model.restore("el"). here it doesn't initialize or restore values for the evaluate.py
# case.
def _sequence_mask_v13(self, mytensor, max_width):
"""mytensor is a 2d tensor"""
if not tf.__version__.startswith("1.4"):
temp_shape = tf.shape(mytensor)
temp = tf.sequence_mask(tf.reshape(mytensor, [-1]), max_width, dtype=tf.float32)
temp_mask = tf.reshape(temp, [temp_shape[0], temp_shape[1], max_width]) # tf.shape(temp)[-1]])
else:
temp_mask = tf.sequence_mask(mytensor, max_width, dtype=tf.float32)
return temp_mask
================================================
FILE: code/model/ensemble_eval.py
================================================
import argparse
import pickle
import model.config as config
import os
import tensorflow as tf
from model.model_ablations import Model
import model.train as train
from evaluation.metrics import Evaluator, metrics_calculation_and_prediction_printing, threshold_calculation
import model.reader as reader
from model.util import load_train_args
def validation_loss_calculation(filename, opt_thr, el_mode):
if args.predictions_folder is not None:
printPredictions.process_file(el_mode, filename, opt_thr)
ensemble_fixed = []
ensemble_acc = [] # final_scores and similarity_scores. all the rest are fixed
for model_num, model_folder in enumerate(args.output_folder): # for all ensemble models
model, handles = create_input_pipeline(el_mode, model_folder,
[filename])
retrieve_l = [model.final_scores, model.similarity_scores,
model.cand_entities_len, model.cand_entities,
model.begin_span, model.end_span, model.spans_len,
model.begin_gm, model.end_gm,
model.ground_truth, model.ground_truth_len,
model.words_len, model.chunk_id,
# model.similarity_scores,
model.words, model.chars, model.chars_len,
model.log_cand_entities_scores]
elem_idx = 0
while True:
try:
result_l = model.sess.run(
retrieve_l, feed_dict={model.input_handle_ph: handles[0], model.dropout: 1})
if model_num == 0:
ensemble_fixed.append(result_l[2:])
ensemble_acc.append(result_l[:2])
else:
ensemble_acc[elem_idx][0] += result_l[0]
ensemble_acc[elem_idx][1] += result_l[1]
elem_idx += 1
except tf.errors.OutOfRangeError:
break
model.close_session()
evaluator = Evaluator(opt_thr, name=filename)
number_of_models = len(args.output_folder)
for (final_scores, similarity_scores), fixed in zip(ensemble_acc, ensemble_fixed):
final_scores /= number_of_models
similarity_scores /= number_of_models
metrics_calculation_and_prediction_printing(evaluator,
final_scores, similarity_scores, *fixed, el_mode,
printPredictions=printPredictions)
if printPredictions:
printPredictions.file_ended()
print(filename)
micro_f1, macro_f1 = evaluator.print_log_results(None, -1, el_mode)
return macro_f1
def optimal_thr_calc(el_mode):
filenames = args.el_datasets if el_mode else args.ed_datasets
val_datasets = args.el_val_datasets if el_mode else args.ed_val_datasets
ensemble_fixed = []
ensemble_acc = [] # final_scores and similarity_scores. all the rest are fixed
for model_num, model_folder in enumerate(args.output_folder): # for all ensemble models
model, handles = create_input_pipeline(el_mode, model_folder,
[filenames[i] for i in val_datasets])
retrieve_l = (model.final_scores, model.cand_entities_len, model.cand_entities,
model.begin_span, model.end_span, model.spans_len,
model.begin_gm, model.end_gm,
model.ground_truth, model.ground_truth_len,
model.words_len, model.chunk_id)
elem_idx = 0
for dataset_handle in handles: # 1, 4 for each validation dataset
while True:
try:
result_l = model.sess.run(
retrieve_l, feed_dict={model.input_handle_ph: dataset_handle, model.dropout: 1})
if model_num == 0:
ensemble_fixed.append(result_l[1:])
ensemble_acc.append(result_l[0])
else:
ensemble_acc[elem_idx] += result_l[0]
elem_idx += 1
except tf.errors.OutOfRangeError:
break
model.close_session()
number_of_models = len(args.output_folder)
tp_fp_scores_labels = []
fn_scores = []
for final_scores, fixed in zip(ensemble_acc, ensemble_fixed):
final_scores /= number_of_models
tp_fp_batch, fn_batch = threshold_calculation(final_scores, *fixed, el_mode)
tp_fp_scores_labels.extend(tp_fp_batch)
fn_scores.extend(fn_batch)
return train.optimal_thr_calc_aux(tp_fp_scores_labels, fn_scores)
def create_input_pipeline(el_mode, model_folder, filenames):
tf.reset_default_graph()
folder = config.base_folder+"data/tfrecords/" + args.experiment_name + ("/test/" if el_mode else "/train/")
datasets = []
for file in filenames:
datasets.append(reader.test_input_pipeline([folder+file], args))
input_handle_ph = tf.placeholder(tf.string, shape=[], name="input_handle_ph")
iterator = tf.contrib.data.Iterator.from_string_handle(
input_handle_ph, datasets[0].output_types, datasets[0].output_shapes)
next_element = iterator.get_next()
train_args = load_train_args(args.output_folder, "ensemble_eval")
print("loading Model:", model_folder)
#train_args.evaluation_script = True
train_args.entity_extension = args.entity_extension
model = Model(train_args, next_element)
model.build()
#print("model train_args:", model.args)
#print("model checkpoint_folder:", model.args.checkpoints_folder)
model.input_handle_ph = input_handle_ph
model.restore_session("el" if el_mode else "ed")
#iterators, handles = from_datasets_to_iterators_and_handles(model.sess, datasets)
iterators = []
handles = []
for dataset in datasets:
#iterator = dataset.make_initializable_iterator() # one shot iterators fits better here
iterator = dataset.make_one_shot_iterator()
iterators.append(iterator)
handles.append(model.sess.run(iterator.string_handle()))
return model, handles
def evaluate():
for el_mode in [False, True]:
filenames = args.el_datasets if el_mode else args.ed_datasets
if filenames:
print("Evaluating {} datasets".format("EL" if el_mode else "ED"))
opt_thr, _ = optimal_thr_calc(el_mode)
# TODO check the following lines
results = []
#for test_handle, test_name, test_it in zip(datasets, names):
for filename in filenames:
f1_score = validation_loss_calculation(filename, opt_thr, el_mode=el_mode)
results.append(f1_score)
def _parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--experiment_name", default="alldatasets_perparagr", #"standard",
help="under folder data/tfrecords/")
parser.add_argument("--training_name", help="under folder data/tfrecords/")
parser.add_argument("--predictions_folder", default=None, help="full or relative path. where to print"
"the result files.")
parser.add_argument("--ed_datasets", default="aida_train.txt_z_aida_dev.txt_z_aida_test.txt_z_"
"ace2004.txt_z_aquaint.txt_z_clueweb.txt_z_msnbc.txt_z_wikipedia.txt")
parser.add_argument("--el_datasets", default="aida_train.txt_z_aida_dev.txt_z_aida_test.txt_z_"
"ace2004.txt_z_aquaint.txt_z_clueweb.txt_z_msnbc.txt_z_wikipedia.txt")
parser.add_argument("--ed_val_datasets", default="1")
parser.add_argument("--el_val_datasets", default="1")
parser.add_argument("--all_spans_training", help="y_y_n_y_n")
parser.add_argument("--entity_extension", default=None, help="extension_entities or extension_entities_all etc")
args = parser.parse_args()
args.ed_datasets = args.ed_datasets.split('_z_') if args.ed_datasets != "" else None
args.el_datasets = args.el_datasets.split('_z_') if args.el_datasets != "" else None
args.ed_val_datasets = [int(x) for x in args.ed_val_datasets.split('_')]
args.el_val_datasets = [int(x) for x in args.el_val_datasets.split('_')]
args.training_name = args.training_name.split('_z_')
args.all_spans_training = ["all_spans_" if x == 'y' else "" for x in args.all_spans_training.split('_')]
assert(len(args.training_name) == len(args.all_spans_training))
if args.predictions_folder is not None and not os.path.exists(args.predictions_folder):
os.makedirs(args.predictions_folder)
if args.predictions_folder is not None and not os.path.exists(args.predictions_folder+"ed/"):
os.makedirs(args.predictions_folder+"ed/")
if args.predictions_folder is not None and not os.path.exists(args.predictions_folder+"el/"):
os.makedirs(args.predictions_folder+"el/")
args.output_folder = []
for training_name, prefix in zip(args.training_name, args.all_spans_training):
args.output_folder.append(config.base_folder+"data/tfrecords/" + \
args.experiment_name+"/{}training_folder/".format(prefix) + \
training_name+"/")
args.batch_size = 1
print(args)
return args
if __name__ == "__main__":
args = _parse_args()
printPredictions = None
if args.predictions_folder is not None:
from evaluation.print_predictions import PrintPredictions
printPredictions = PrintPredictions(config.base_folder+"data/tfrecords/"+
args.experiment_name+"/", args.predictions_folder)
evaluate()
================================================
FILE: code/model/evaluate.py
================================================
import argparse
import pickle
import model.config as config
import os
import tensorflow as tf
from model.model_ablations import Model
from evaluation.metrics import Evaluator, metrics_calculation_and_prediction_printing
import model.train as train
from model.util import load_train_args
def validation_loss_calculation(model, iterator, dataset_handle, opt_thr, el_mode, name=""):
if args.print_predictions:
printPredictions.process_file(el_mode, name, opt_thr)
model.sess.run(iterator.initializer)
evaluator = Evaluator(opt_thr, name=name)
while True:
try:
retrieve_l = [model.final_scores,
model.cand_entities_len, model.cand_entities,
model.begin_span, model.end_span, model.spans_len,
model.begin_gm, model.end_gm,
model.ground_truth, model.ground_truth_len,
model.words_len, model.chunk_id,
model.words, model.chars, model.chars_len]
scores_retrieve_l, scores_names_l = [], []
if model.args.nn_components.find("lstm") != -1:
scores_retrieve_l.append(model.similarity_scores)
scores_names_l.append("lstm")
if model.args.nn_components.find("pem") != -1:
scores_retrieve_l.append(model.log_cand_entities_scores)
scores_names_l.append("logpem")
if model.args.nn_components.find("attention") != -1:
scores_retrieve_l.append(model.attention_scores)
scores_names_l.append("attention")
if model.args.nn_components.find("global") != -1:
scores_retrieve_l.append(model.final_scores_before_global)
scores_names_l.append("before_global")
scores_retrieve_l.append(model.global_voting_scores)
scores_names_l.append("global_voting")
global_pairwise_scores = []
if args.print_global_voters:
global_pairwise_scores.append(model.gmask)
global_pairwise_scores.append(model.pure_entity_embeddings)
retrieve_l.append(scores_retrieve_l)
retrieve_l.append(global_pairwise_scores)
result_l = model.sess.run(
retrieve_l, feed_dict={model.input_handle_ph: dataset_handle, model.dropout: 1})
metrics_calculation_and_prediction_printing(evaluator, *result_l, scores_names_l, el_mode,
printPredictions=printPredictions)
except tf.errors.OutOfRangeError:
if args.print_predictions:
printPredictions.file_ended()
print(name)
micro_f1, macro_f1 = evaluator.print_log_results(None, -1, el_mode)
break
return macro_f1
# identical with the train.compute_ed_el_scores
def compute_ed_el_scores(model, handles, names, iterators, el_mode):
if args.hardcoded_thr:
opt_thr = args.hardcoded_thr
print("hardcoded threshold used:", opt_thr)
else:
# first compute the optimal threshold based on validation datasets.
opt_thr, _ = train.optimal_thr_calc(model, handles, iterators, el_mode)
# give the opt_thr and the projection variables to the PrintPredictions for insight
if printPredictions:
printPredictions.extra_info = print_thr_and_ffnn_values(model, opt_thr)
results = []
for test_handle, test_name, test_it in zip(handles, names, iterators):
f1_score = validation_loss_calculation(model, test_it, test_handle, opt_thr,
el_mode=el_mode, name=test_name)
results.append(f1_score)
return results
def evaluate():
ed_datasets, ed_names = train.create_el_ed_pipelines(gmonly_flag=True, filenames=args.ed_datasets, args=args)
el_datasets, el_names = train.create_el_ed_pipelines(gmonly_flag=False, filenames=args.el_datasets, args=args)
input_handle_ph = tf.placeholder(tf.string, shape=[], name="input_handle_ph")
sample_dataset = ed_datasets[0] if ed_datasets != [] else el_datasets[0]
iterator = tf.data.Iterator.from_string_handle(
input_handle_ph, sample_dataset.output_types, sample_dataset.output_shapes)
next_element = iterator.get_next()
model = Model(train_args, next_element)
model.build()
model.input_handle_ph = input_handle_ph # just for convenience so i can access it from everywhere
print(tf.global_variables())
if args.p_e_m_algorithm:
model.final_scores = model.cand_entities_scores
def ed_el_dataset_handles(sess, datasets):
test_iterators = []
test_handles = []
for dataset in datasets:
test_iterator = dataset.make_initializable_iterator()
test_iterators.append(test_iterator)
test_handles.append(sess.run(test_iterator.string_handle()))
return test_iterators, test_handles
for el_mode, datasets, names in zip([False, True], [ed_datasets, el_datasets], [ed_names, el_names]):
if names == []:
continue
model.restore_session("el" if el_mode else "ed")
#print_variables_values(model)
with model.sess as sess:
print("Evaluating {} datasets".format("EL" if el_mode else "ED"))
iterators, handles = ed_el_dataset_handles(sess, datasets)
compute_ed_el_scores(model, handles, names, iterators, el_mode=el_mode)
# TODO delete this function
def print_variables_values(model):
var_names = ['similarity_and_prior_ffnn/output_weights:0',
'similarity_and_prior_ffnn/output_bias:0']
for var_name in var_names:
var_handle = [v for v in tf.global_variables() if v.name == var_name][0]
print(var_name)
print(model.sess.run(var_handle))
def print_thr_and_ffnn_values(model, opt_thr):
result = "opt_thr={}, nn_components={}\n".format(opt_thr, model.args.nn_components)
var_names, var_print_names = [], []
if model.args.final_score_ffnn[0] == 0:
var_names.extend(['similarity_and_prior_ffnn/output_weights:0',
'similarity_and_prior_ffnn/output_bias:0'])
var_print_names.extend(['lstm_pem_attention_weights', 'lstm_pem_attention_bias'])
if model.args.nn_components.find("global") != -1 and model.args.global_score_ffnn[0] == 0:
# print only if simple projection, otherwise it could be too many variables
var_names.extend(['global_voting/psi_and_global_ffnn/output_weights:0',
'global_voting/psi_and_global_ffnn/output_bias:0'])
var_print_names.extend(['psi_globalscore_weights', 'psi_globalscore_bias'])
for var_name, print_name in zip(var_names, var_print_names):
var_handle = [v for v in tf.global_variables() if v.name == var_name][0]
SYMBOL INDEX (394 symbols across 40 files)
FILE: code/evaluation/metrics.py
class Evaluator (line 7) | class Evaluator(object):
method __init__ (line 8) | def __init__(self, threshold, name):
method gm_add (line 17) | def gm_add(self, gm_in_batch):
method check_tp (line 20) | def check_tp(self, score, docid):
method check_fp (line 27) | def check_fp(self, score, docid):
method check_fn (line 34) | def check_fn(self, score, docid):
method _score_computation (line 41) | def _score_computation(self, el_mode):
method print_log_results (line 70) | def print_log_results(self, tf_writer, eval_cnt, el_mode):
method print_log_results_old (line 100) | def print_log_results_old(self, tf_writer, eval_cnt, el_mode):
class StrongMatcher (line 165) | class StrongMatcher(object):
method __init__ (line 170) | def __init__(self, b_e_gt_iterator):
method check (line 175) | def check(self, t):
class WeakMatcher (line 180) | class WeakMatcher(object):
method __init__ (line 190) | def __init__(self, b_e_gt_iterator):
method check (line 195) | def check(self, t):
class FNStrongMatcher (line 211) | class FNStrongMatcher(object):
method __init__ (line 218) | def __init__(self, filtered_spans):
method check (line 223) | def check(self, t):
class FNWeakMatcher (line 229) | class FNWeakMatcher(object):
method __init__ (line 239) | def __init__(self, filtered_spans):
method check (line 244) | def check(self, t):
function _filtered_spans_and_gm_gt_list (line 263) | def _filtered_spans_and_gm_gt_list(b, final_scores, cand_entities_len, c...
function threshold_calculation (line 299) | def threshold_calculation(final_scores, cand_entities_len, cand_entities,
function metrics_calculation (line 326) | def metrics_calculation(evaluator, final_scores, cand_entities_len, cand...
function metrics_calculation_and_prediction_printing (line 354) | def metrics_calculation_and_prediction_printing(evaluator, final_scores,
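
The Evaluator above accumulates per-document TP/FP/FN counts for spans whose linking score clears a fixed threshold and then derives precision/recall/F1 in _score_computation. A minimal sketch of that bookkeeping follows; the class name SimpleEvaluator, the micro_prf helper, and the exact counting rules are illustrative assumptions, not the repository's code.

from collections import defaultdict


class SimpleEvaluator:
    """Hypothetical threshold-based TP/FP/FN evaluator: predictions scoring
    above `threshold` count as TP or FP, missed gold mentions as FN,
    aggregated per document id."""

    def __init__(self, threshold):
        self.threshold = threshold
        self.tp = defaultdict(int)   # docid -> true positives
        self.fp = defaultdict(int)   # docid -> false positives
        self.fn = defaultdict(int)   # docid -> false negatives

    def check_tp(self, score, docid):
        if score >= self.threshold:
            self.tp[docid] += 1
            return True
        return False

    def check_fp(self, score, docid):
        if score >= self.threshold:
            self.fp[docid] += 1
            return True
        return False

    def check_fn(self, score, docid):
        # a gold mention whose best prediction scored below the threshold
        # (or that had no candidate at all) is a miss
        if score is None or score < self.threshold:
            self.fn[docid] += 1
            return True
        return False

    def micro_prf(self):
        tp, fp, fn = sum(self.tp.values()), sum(self.fp.values()), sum(self.fn.values())
        prec = tp / (tp + fp) if tp + fp else 0.0
        rec = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        return prec, rec, f1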
FILE: code/evaluation/metrics_old.py
class Evaluator_aux (line 7) | class Evaluator_aux(object):
method __init__ (line 8) | def __init__(self, threshold, name):
method check_tp (line 16) | def check_tp(self, score, docid):
method check_fp (line 21) | def check_fp(self, score, docid):
method check_fn (line 26) | def check_fn(self, score, docid):
method print_results (line 31) | def print_results(self):
class Evaluator (line 70) | class Evaluator(object):
method __init__ (line 71) | def __init__(self, weak_thr=None, strong_thr=None, name=""):
method weak_check_tp (line 80) | def weak_check_tp(self, score, docid):
method weak_check_fp (line 85) | def weak_check_fp(self, score, docid):
method weak_check_fn (line 90) | def weak_check_fn(self, score, docid):
method strong_check_tp (line 95) | def strong_check_tp(self, score, docid):
method strong_check_fp (line 100) | def strong_check_fp(self, score, docid):
method strong_check_fn (line 105) | def strong_check_fn(self, score, docid):
method print_log_results (line 110) | def print_log_results(self, writer, eval_cnt):
class WeakStrongMatching (line 136) | class WeakStrongMatching(object):
method __init__ (line 137) | def __init__(self, b_e_gt_iterator):
method strong_check (line 149) | def strong_check(self, t):
method weak_check (line 152) | def weak_check(self, t):
class FNWeakStrongMatching (line 168) | class FNWeakStrongMatching(object):
method __init__ (line 169) | def __init__(self, filtered_spans):
method strong_check (line 180) | def strong_check(self, t):
method weak_check (line 190) | def weak_check(self, t):
function validation_scores_calculation (line 206) | def validation_scores_calculation(evaluator, final_scores, cand_entities...
function evaluation_scores_calculation (line 271) | def evaluation_scores_calculation(evaluator, final_scores, cand_entities...
FILE: code/evaluation/print_predictions (copy).py
class PrintPredictions (line 8) | class PrintPredictions(object):
method __init__ (line 9) | def __init__(self, output_folder, predictions_folder, entity_extension...
method map_entity (line 20) | def map_entity(self, nnid):
method process_file (line 25) | def process_file(self, el_mode, name, opt_thr):
method file_ended (line 31) | def file_ended(self):
method process_sample (line 34) | def process_sample(self, chunkid,
class FPWeakMatcherLogging (line 112) | class FPWeakMatcherLogging(object):
method __init__ (line 123) | def __init__(self, printPredictions, b_e_gt_iterator, cand_entities, c...
method check (line 133) | def check(self, s, e, span_num):
FILE: code/evaluation/print_predictions.py
class GMBucketingResults (line 10) | class GMBucketingResults(object):
method __init__ (line 11) | def __init__(self, gm_bucketing_pempos):
method reinitialize (line 20) | def reinitialize(self):
method process_fn (line 26) | def process_fn(self, pos, match_with_winner, num_of_cand_entities):
method process_tp (line 37) | def process_tp(self, pos, num_of_cand_entities):
method print (line 45) | def print(self):
class PrintPredictions (line 54) | class PrintPredictions(object):
method __init__ (line 55) | def __init__(self, output_folder, predictions_folder, entity_extension...
method map_entity (line 70) | def map_entity(self, nnid, onlyname=False):
method process_file (line 75) | def process_file(self, el_mode, name, opt_thr):
method file_ended (line 83) | def file_ended(self):
method scores_text (line 88) | def scores_text(self, scores_l, scores_names_l, i, j):
method process_sample (line 91) | def process_sample(self, chunkid,
method print_gmask (line 191) | def print_gmask(self, gmask, span_num_b_e_gt, reconstructed_words, can...
class FPWeakMatcherLogging (line 208) | class FPWeakMatcherLogging(object):
method __init__ (line 219) | def __init__(self, printPredictions, span_num_b_e_gt, cand_entities, c...
method check (line 234) | def check(self, s, e, span_num, winner_pos=None):
class FNWeakMatcherLogging (line 285) | class FNWeakMatcherLogging(object):
method __init__ (line 290) | def __init__(self, printPredictions, filtered_spans, cand_entities, ca...
method check (line 306) | def check(self, gm_num, s, e, gt):
function print_global_pairwise_voting (line 367) | def print_global_pairwise_voting(gmask, span_num_b_e_gt, reconstructed_w...
FILE: code/evaluation/summarize_all_experiments.py
function process_experiment (line 5) | def process_experiment(ed_acc, el_acc, training_name):
function process_folder (line 72) | def process_folder(ed_acc, el_acc, training_name):
function file_is_used (line 88) | def file_is_used(filepath):
function main (line 99) | def main():
function _parse_args (line 137) | def _parse_args():
FILE: code/gerbil/build_entity_universe.py
class BuildEntityUniverse (line 10) | class BuildEntityUniverse(object):
method __init__ (line 11) | def __init__(self):
method process (line 16) | def process(self, text, given_spans):
method flush_entity_universe (line 26) | def flush_entity_universe(self):
class Struct (line 52) | class Struct(object):
method __init__ (line 53) | def __init__(self):
FILE: code/gerbil/gerbil_recall_calculation.py
class ProcessDataset (line 8) | class ProcessDataset(object):
method __init__ (line 9) | def __init__(self):
method process (line 18) | def process(self, filepath, filename):
function main (line 38) | def main():
FILE: code/gerbil/nn_processing.py
class StreamingSamples (line 14) | class StreamingSamples(object):
method __init__ (line 15) | def __init__(self):
method new_sample (line 27) | def new_sample(self, sample):
method gen (line 31) | def gen(self):
class NNProcessing (line 54) | class NNProcessing(object):
method __init__ (line 55) | def __init__(self, train_args, args):
method process (line 121) | def process(self, text, given_spans):
method map_words_to_char_positions (line 209) | def map_words_to_char_positions(self, text):
method nearest_idx (line 282) | def nearest_idx(self, key, values):
method _add_response_span (line 293) | def _add_response_span(self, response, span, words2charidx):
method nearest_given_span (line 303) | def nearest_given_span(self, begin_idx, end_idx): # [begin_idx, end...
method stanford_ner_spans (line 315) | def stanford_ner_spans(self, words_l, words2charidx):
function list_of_lists_to_2darray (line 343) | def list_of_lists_to_2darray(a):
function retrieve_optimal_threshold_from_logfile (line 351) | def retrieve_optimal_threshold_from_logfile(model_folder, checkpoint_pat...
FILE: code/gerbil/nn_processing_correct.py
class StreamingSamples (line 12) | class StreamingSamples(object):
method __init__ (line 13) | def __init__(self):
method new_sample (line 25) | def new_sample(self, sample):
method gen (line 29) | def gen(self):
class NNProcessing (line 52) | class NNProcessing(object):
method __init__ (line 53) | def __init__(self, train_args, args):
method process (line 122) | def process(self, text, given_spans):
method find_corefence_person (line 275) | def find_corefence_person(self, span_text):
method nearest_idx (line 295) | def nearest_idx(self, key, values):
method _add_response_span (line 306) | def _add_response_span(self, response, span, words2charidx):
method nearest_given_span (line 316) | def nearest_given_span(self, begin_idx, end_idx): # [begin_idx, end...
method custom_response (line 326) | def custom_response(self, filtered_spans, text, words2charidx, chunk_w...
method stanford_ner_spans (line 361) | def stanford_ner_spans(self, words_l, words2charidx):
function list_of_lists_to_2darray (line 420) | def list_of_lists_to_2darray(a):
function retrieve_optimal_threshold_from_logfile (line 428) | def retrieve_optimal_threshold_from_logfile(model_folder, checkpoint_pat...
FILE: code/gerbil/server.py
class GetHandler (line 10) | class GetHandler(BaseHTTPRequestHandler):
method do_POST (line 12) | def do_POST(self):
function read_json (line 29) | def read_json(post_data):
function _parse_args (line 37) | def _parse_args():
function terminate (line 83) | def terminate():
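
server.py exposes the trained model to the GERBIL wrapper through a plain http.server POST endpoint that receives JSON (the document text plus, for ED, the given spans) and replies with predicted annotations. Below is a minimal sketch of that request/response loop, assuming a {"text": ..., "spans": [[start, length], ...]} payload and an arbitrary port; the real handler delegates to NNProcessing.process instead of the dummy annotate used here.

import json
from http.server import BaseHTTPRequestHandler, HTTPServer


def annotate(text, given_spans):
    # Placeholder for the neural model; it just echoes the given spans paired
    # with a dummy entity so the wire format is visible end to end.
    return [(start, length, "Dummy_Entity") for (start, length) in given_spans]


class GetHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers["Content-Length"])
        body = self.rfile.read(length)
        doc = json.loads(body)  # assumed payload: {"text": ..., "spans": [[start, length], ...]}
        response = annotate(doc.get("text", ""), doc.get("spans", []))
        payload = json.dumps(response).encode("utf-8")
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(payload)


if __name__ == "__main__":
    HTTPServer(("localhost", 5555), GetHandler).serve_forever()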
FILE: code/model/base_model.py
class BaseModel (line 5) | class BaseModel(object):
method __init__ (line 7) | def __init__(self, args):
method reinitialize_weights (line 13) | def reinitialize_weights(self, scope_name):
method add_train_op (line 19) | def add_train_op(self, lr_method, lr, loss, clip=-1):
method initialize_session (line 48) | def initialize_session(self):
method restore_session (line 61) | def restore_session(self, option="latest"):
method my_latest_checkpoint (line 97) | def my_latest_checkpoint(self, folder_path): # model-9.meta
method save_session (line 102) | def save_session(self, eval_cnt, save_ed_flag, save_el_flag):
method close_session (line 116) | def close_session(self):
method _restore_list (line 120) | def _restore_list(self):
method checkpoint_variables (line 124) | def checkpoint_variables(self):
method find_variable_handler_by_name (line 135) | def find_variable_handler_by_name(self, var_name):
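
BaseModel.add_train_op(lr_method, lr, loss, clip=-1) selects an optimizer by name and optionally clips gradients before applying them. A sketch of what that signature suggests in TF 1.x graph mode; the optimizer choices and the "clip <= 0 disables clipping" convention are assumptions.

import tensorflow as tf  # TF 1.x style, matching the repository's graph-mode code


def add_train_op(lr_method, lr, loss, clip=-1):
    """Pick an optimizer by name and optionally clip gradients by global norm."""
    optimizers = {
        "adam": tf.train.AdamOptimizer,
        "adagrad": tf.train.AdagradOptimizer,
        "sgd": tf.train.GradientDescentOptimizer,
        "rmsprop": tf.train.RMSPropOptimizer,
    }
    optimizer = optimizers[lr_method.lower()](lr)
    if clip > 0:
        grads, variables = zip(*optimizer.compute_gradients(loss))
        grads, _ = tf.clip_by_global_norm(grads, clip)
        return optimizer.apply_gradients(zip(grads, variables))
    return optimizer.minimize(loss)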
FILE: code/model/ed_model_original.py
class EDModel (line 9) | class EDModel(BaseModel):
method __init__ (line 10) | def __init__(self, args, next_element):
method add_placeholders (line 41) | def add_placeholders(self):
method init_embeddings (line 103) | def init_embeddings(self):
method add_embeddings_op (line 113) | def add_embeddings_op(self):
method add_context_emb_op (line 195) | def add_context_emb_op(self):
method add_span_emb_op (line 208) | def add_span_emb_op(self):
method add_lstm_score_op (line 286) | def add_lstm_score_op(self):
method add_local_attention_op (line 302) | def add_local_attention_op(self):
method custom_pem (line 382) | def custom_pem(self, log=True, buckets_boundaries=None, bucketing_name...
method _pem_bucketing_embeddings (line 391) | def _pem_bucketing_embeddings(self, buckets_boundaries, bucketing_name):
method add_cand_ent_scores_op (line 403) | def add_cand_ent_scores_op(self):
method add_global_voting_op (line 442) | def add_global_voting_op(self):
method add_loss_op (line 549) | def add_loss_op(self):
method build (line 565) | def build(self):
method _sequence_mask_v13 (line 593) | def _sequence_mask_v13(self, mytensor, max_width):
FILE: code/model/ensemble_eval.py
function validation_loss_calculation (line 12) | def validation_loss_calculation(filename, opt_thr, el_mode):
function optimal_thr_calc (line 64) | def optimal_thr_calc(el_mode):
function create_input_pipeline (line 109) | def create_input_pipeline(el_mode, model_folder, filenames):
function evaluate (line 144) | def evaluate():
function _parse_args (line 157) | def _parse_args():
FILE: code/model/evaluate.py
function validation_loss_calculation (line 12) | def validation_loss_calculation(model, iterator, dataset_handle, opt_thr...
function compute_ed_el_scores (line 63) | def compute_ed_el_scores(model, handles, names, iterators, el_mode):
function evaluate (line 82) | def evaluate():
function print_variables_values (line 124) | def print_variables_values(model):
function print_thr_and_ffnn_values (line 133) | def print_thr_and_ffnn_values(model, opt_thr):
function _parse_args (line 151) | def _parse_args():
FILE: code/model/model.py
class Model (line 12) | class Model(BaseModel):
method __init__ (line 14) | def __init__(self, args, next_element):
method add_placeholders (line 51) | def add_placeholders(self):
method init_embeddings (line 56) | def init_embeddings(self):
method add_embeddings_op (line 66) | def add_embeddings_op(self):
method add_context_emb_op (line 145) | def add_context_emb_op(self):
method add_span_emb_op (line 158) | def add_span_emb_op(self):
method add_lstm_score_op (line 231) | def add_lstm_score_op(self):
method add_local_attention_op (line 249) | def add_local_attention_op(self):
method add_cand_ent_scores_op (line 316) | def add_cand_ent_scores_op(self):
method add_global_voting_op (line 339) | def add_global_voting_op(self):
method add_loss_op (line 370) | def add_loss_op(self):
method build (line 385) | def build(self):
method _sequence_mask_v13 (line 413) | def _sequence_mask_v13(self, mytensor, max_width):
FILE: code/model/model_ablations.py
class Model (line 9) | class Model(BaseModel):
method __init__ (line 11) | def __init__(self, args, next_element):
method add_placeholders (line 54) | def add_placeholders(self):
method init_embeddings (line 58) | def init_embeddings(self):
method add_embeddings_op (line 68) | def add_embeddings_op(self):
method add_context_emb_op (line 146) | def add_context_emb_op(self):
method add_span_emb_op (line 159) | def add_span_emb_op(self):
method add_lstm_score_op (line 232) | def add_lstm_score_op(self):
method add_local_attention_op (line 251) | def add_local_attention_op(self):
method custom_pem (line 323) | def custom_pem(self, log=True, buckets_boundaries=None, bucketing_name...
method _pem_bucketing_embeddings (line 331) | def _pem_bucketing_embeddings(self, buckets_boundaries, bucketing_name):
method add_cand_ent_scores_op (line 343) | def add_cand_ent_scores_op(self):
method add_global_voting_op (line 372) | def add_global_voting_op(self):
method add_loss_op (line 463) | def add_loss_op(self):
method build (line 478) | def build(self):
method _sequence_mask_v13 (line 506) | def _sequence_mask_v13(self, mytensor, max_width):
FILE: code/model/reader.py
function parse_sequence_example (line 6) | def parse_sequence_example(serialized):
function count_records_of_one_epoch (line 46) | def count_records_of_one_epoch(trainfiles):
function train_input_pipeline (line 78) | def train_input_pipeline(filenames, args):
function test_input_pipeline (line 89) | def test_input_pipeline(filenames, args):
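
reader.py decodes the serialized tf.SequenceExample records produced by preprocessing and feeds them through tf.data pipelines. The sketch below uses an invented two-feature schema ("chunk_id", "words"); the repository's actual features (candidate entities, ground-truth spans, character ids, and so on) are richer.

import tensorflow as tf  # TF 1.x


def parse_sequence_example(serialized):
    # Hypothetical schema: one string id in the context features, one int64
    # word-id sequence in the feature lists.
    context, sequence = tf.parse_single_sequence_example(
        serialized,
        context_features={"chunk_id": tf.FixedLenFeature([], tf.string)},
        sequence_features={"words": tf.FixedLenSequenceFeature([], tf.int64)})
    return context["chunk_id"], sequence["words"], tf.size(sequence["words"])


def train_input_pipeline(filenames, batch_size=4):
    # TFRecord files -> parsed samples -> shuffled, padded batches
    return (tf.data.TFRecordDataset(filenames)
            .map(parse_sequence_example)
            .shuffle(1000)
            .padded_batch(batch_size, padded_shapes=([], [None], [])))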
FILE: code/model/train.py
function create_training_pipelines (line 12) | def create_training_pipelines(args):
function create_el_ed_pipelines (line 19) | def create_el_ed_pipelines(gmonly_flag, filenames, args):
function tensorboard_writers (line 31) | def tensorboard_writers(graph):
function validation_loss_calculation (line 45) | def validation_loss_calculation(model, iterator, dataset_handle, opt_thr...
function optimal_thr_calc (line 69) | def optimal_thr_calc(model, handles, iterators, el_mode):
function optimal_thr_calc_aux (line 94) | def optimal_thr_calc_aux(tp_fp_scores_labels, fn_scores):
function compute_ed_el_scores (line 136) | def compute_ed_el_scores(model, handles, names, iterators, el_mode):
function train (line 158) | def train():
function _parse_args (line 279) | def _parse_args():
function log_args (line 483) | def log_args(filepath):
function terminate (line 493) | def terminate():
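
optimal_thr_calc / optimal_thr_calc_aux pick the global decision threshold on the validation set: each predicted span contributes a (score, correct?) pair, gold mentions that can never become true positives count as fixed false negatives, and the threshold with the best micro-F1 wins. A self-contained sketch of that sweep; the argument structure and names are assumptions.

def optimal_threshold(tp_fp_scores_labels, fn_scores):
    """Sweep the observed scores as candidate thresholds and keep the one with
    the best micro-F1. `tp_fp_scores_labels` is a list of (score, is_correct)
    pairs for predicted spans; `fn_scores` holds scores of gold mentions that
    are missed regardless of the threshold (illustrative structure only)."""
    best_thr, best_f1 = 0.0, -1.0
    for thr, _ in sorted(tp_fp_scores_labels):
        tp = sum(1 for s, ok in tp_fp_scores_labels if s >= thr and ok)
        fp = sum(1 for s, ok in tp_fp_scores_labels if s >= thr and not ok)
        fn = sum(1 for s, ok in tp_fp_scores_labels if s < thr and ok) + len(fn_scores)
        prec = tp / (tp + fp) if tp + fp else 0.0
        rec = tp / (tp + fn) if tp + fn else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        if f1 > best_f1:
            best_thr, best_f1 = thr, f1
    return best_thr, best_f1


print(optimal_threshold([(0.9, True), (0.7, True), (0.4, False)], fn_scores=[0.2]))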
FILE: code/model/util.py
function projection (line 5) | def projection(inputs, output_size, initializer=None, model=None):
function shape (line 8) | def shape(x, dim):
function ffnn (line 11) | def ffnn(inputs, num_hidden_layers, hidden_size, output_size, dropout, o...
function variable_summaries (line 63) | def variable_summaries(var):
class Tee (line 80) | class Tee(object):
method __init__ (line 82) | def __init__(self, name, mode):
method close (line 87) | def close(self):
method write (line 95) | def write(self, data):
method flush (line 99) | def flush(self):
method __del__ (line 103) | def __del__(self):
function _correct_train_args_leohnard_dalab (line 107) | def _correct_train_args_leohnard_dalab(train_args, model_folder):
function load_train_args (line 119) | def load_train_args(output_folder, running_mode):
function load_ent_vecs (line 182) | def load_ent_vecs(args):
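
util.py's projection/ffnn helpers build the small feed-forward scorers applied on top of the span and entity representations. A rough TF 1.x sketch consistent with the ffnn(...) signature; the layer sizes, activation, and dropout handling here are assumptions.

import tensorflow as tf  # TF 1.x graph mode, like the rest of code/model/


def ffnn(inputs, num_hidden_layers, hidden_size, output_size, dropout_keep_prob):
    """Stack of ReLU layers with dropout, followed by a linear projection."""
    hidden = inputs
    for _ in range(num_hidden_layers):
        hidden = tf.layers.dense(hidden, hidden_size, activation=tf.nn.relu)
        hidden = tf.nn.dropout(hidden, keep_prob=dropout_keep_prob)
    return tf.layers.dense(hidden, output_size)  # no activation: raw scores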
FILE: code/preprocessing/aida_insight.py
function process_file (line 4) | def process_file(filename):
function main (line 25) | def main():
FILE: code/preprocessing/bridge_code_lua/ent_vecs_from_txt_to_npy.py
function keep_only_new_entities (line 6) | def keep_only_new_entities(ent_vecs, folder):
function main (line 23) | def main():
function _parse_args (line 33) | def _parse_args():
FILE: code/preprocessing/extra.py
function vocabulary_count_wiki (line 12) | def vocabulary_count_wiki():
function entity_count_wiki_aux (line 53) | def entity_count_wiki_aux():
function entity_count_wiki (line 74) | def entity_count_wiki():
function get_frequent_entities_set (line 111) | def get_frequent_entities_set(top=None, freq_thr=None, return_freq=False):
function entity_name_id_map_from_dump (line 134) | def entity_name_id_map_from_dump():
function compare_name_id_maps (line 169) | def compare_name_id_maps():
function test_wiki_name_id_map_txt_conflicts_when_lowering (line 200) | def test_wiki_name_id_map_txt_conflicts_when_lowering():
function create_p_e_m (line 220) | def create_p_e_m():
function load_p_e_m (line 296) | def load_p_e_m(): # need 82 % of my memory and real 0m50.133s where...
FILE: code/preprocessing/old/old_code.py
function p_e_m_disamb_redirect_wikinameid_maps (line 4) | def p_e_m_disamb_redirect_wikinameid_maps():
FILE: code/preprocessing/old/prepro_datasets1_once.py
function vocabulary_count_wiki (line 18) | def vocabulary_count_wiki():
function process_aida_aux (line 55) | def process_aida_aux(filepath):
function process_aida (line 77) | def process_aida():
function process_other_datasets (line 82) | def process_other_datasets(folder_path):
function _parse_args (line 89) | def _parse_args():
FILE: code/preprocessing/old/preprocess1.py
function main (line 28) | def main():
function re_faster (line 77) | def re_faster(line):
function profiler (line 81) | def profiler():
FILE: code/preprocessing/old/preprocessv2.py
function build_word_char_maps (line 16) | def build_word_char_maps():
function samples_load (line 68) | def samples_load(filename):
class Encoding (line 79) | class Encoding(object):
method __init__ (line 80) | def __init__(self):
method new_chunk (line 105) | def new_chunk(self):
method process_word (line 112) | def process_word(self, word):
method serialize (line 115) | def serialize(self):
method serialize_tfrecords (line 122) | def serialize_tfrecords(self):
method encode_wikidump (line 128) | def encode_wikidump(self):
FILE: code/preprocessing/p_e_m.py
function tokenize_p_e_m (line 14) | def tokenize_p_e_m():
function print_p_e_m_dictionary_to_file (line 39) | def print_p_e_m_dictionary_to_file(p_e_m, full_filepath):
function tokenize_p_e_m_and_merge_conflicts (line 58) | def tokenize_p_e_m_and_merge_conflicts(filename, yago=False):
function from_freq_to_prob (line 92) | def from_freq_to_prob(filename):
function merge_two_prob_dictionaries (line 114) | def merge_two_prob_dictionaries(filename1, filename2, newfilename):
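
p_e_m.py builds the p(e|m) prior: counts of how often each mention string links to each entity, normalized into probabilities (from_freq_to_prob) and optionally merged across sources. An in-memory sketch of the normalization step with made-up counts; the real functions read and write the dictionary files instead.

from collections import defaultdict


def from_freq_to_prob(p_e_m_counts):
    """Turn raw mention -> {entity: count} hyperlink counts into p(e|m)."""
    p_e_m = defaultdict(dict)
    for mention, entity_counts in p_e_m_counts.items():
        total = sum(entity_counts.values())
        for entity, count in entity_counts.items():
            p_e_m[mention][entity] = count / total
    return dict(p_e_m)


counts = {"Obama": {"Barack_Obama": 908, "Obama,_Fukui": 5}}
print(from_freq_to_prob(counts))  # {'Obama': {'Barack_Obama': 0.994..., 'Obama,_Fukui': 0.005...}}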
FILE: code/preprocessing/prepro_aida.py
function process_aida (line 5) | def process_aida(in_filepath, out_filepath):
function split_dev_test (line 58) | def split_dev_test(in_filepath):
function create_necessary_folders (line 68) | def create_necessary_folders():
function _parse_args (line 72) | def _parse_args():
FILE: code/preprocessing/prepro_aida_tokenize.py
function process_aida (line 6) | def process_aida(in_filepath, out_filepath):
function split_dev_test (line 76) | def split_dev_test(in_filepath):
function create_necessary_folders (line 86) | def create_necessary_folders():
function _parse_args (line 90) | def _parse_args():
FILE: code/preprocessing/prepro_gerbil_datasets.py
class ProcessDataset (line 7) | class ProcessDataset(object):
method __init__ (line 8) | def __init__(self):
method process (line 17) | def process(self, filepath):
method process_readable (line 88) | def process_readable(self, filepath):
function fix_tokenizatVion (line 135) | def fix_tokenizatVion(doc_text):
function create_necessary_folders (line 145) | def create_necessary_folders():
function _parse_args (line 150) | def _parse_args():
function get_immediate_files (line 159) | def get_immediate_files(a_dir):
function main (line 164) | def main():
FILE: code/preprocessing/prepro_other_datasets.py
class ProcessDataset (line 8) | class ProcessDataset(object):
method __init__ (line 9) | def __init__(self):
method process (line 13) | def process(self, dataset_folder):
function create_necessary_folders (line 70) | def create_necessary_folders():
function _parse_args (line 74) | def _parse_args():
function get_immediate_subdirectories (line 83) | def get_immediate_subdirectories(a_dir):
function main (line 87) | def main():
FILE: code/preprocessing/prepro_util.py
class VocabularyCounter (line 13) | class VocabularyCounter(object):
method __init__ (line 17) | def __init__(self, lowercase_emb=False):
method add (line 29) | def add(self, filepath):
method print_statistics (line 47) | def print_statistics(self, word_edges=None,
method serialize (line 65) | def serialize(self, folder=None, name="vocab_freq.pickle"):
method count_datasets_vocabulary (line 73) | def count_datasets_vocabulary(self):
function build_word_char_maps (line 88) | def build_word_char_maps():
function build_word_char_maps_restore (line 140) | def build_word_char_maps_restore():
class Chunker (line 147) | class Chunker(object):
method __init__ (line 148) | def __init__(self):
method new_chunk (line 158) | def new_chunk(self):
method compute_result (line 164) | def compute_result(self, docid):
method process (line 189) | def process(self, filepath):
class SamplesGenerator (line 243) | class SamplesGenerator(object):
method __init__ (line 244) | def __init__(self, mode="allspans"):
method set_gmonly_mode (line 252) | def set_gmonly_mode(self):
method set_allspans_mode (line 255) | def set_allspans_mode(self):
method is_gmonly_mode (line 258) | def is_gmonly_mode(self):
method is_allspans_mode (line 261) | def is_allspans_mode(self):
method process (line 264) | def process(self, filepath):
method _process_allspans (line 270) | def _process_allspans(self, filepath):
method all_spans (line 324) | def all_spans(chunk_words):
method _process_gmonly (line 352) | def _process_gmonly(self, filepath):
class EncoderGenerator (line 408) | class EncoderGenerator(object):
method __init__ (line 412) | def __init__(self):
method set_gmonly_mode (line 418) | def set_gmonly_mode(self):
method set_allspans_mode (line 421) | def set_allspans_mode(self):
method is_gmonly_mode (line 424) | def is_gmonly_mode(self):
method is_allspans_mode (line 427) | def is_allspans_mode(self):
method process (line 430) | def process(self, filepath):
method _encode_cand_entities_and_labels (line 503) | def _encode_cand_entities_and_labels(self, cand_entities_p, cand_entit...
class TFRecordsGenerator (line 532) | class TFRecordsGenerator(object):
method __init__ (line 533) | def __init__(self):
method set_gmonly_mode (line 536) | def set_gmonly_mode(self):
method set_allspans_mode (line 539) | def set_allspans_mode(self):
method is_gmonly_mode (line 542) | def is_gmonly_mode(self):
method is_allspans_mode (line 545) | def is_allspans_mode(self):
method _to_sequence_example (line 549) | def _to_sequence_example(sample):
method process (line 608) | def process(self, filepath):
function create_tfrecords (line 629) | def create_tfrecords():
class PrintSamples (line 643) | class PrintSamples(object):
method __init__ (line 644) | def __init__(self, only_misses=True):
method print_candidates (line 648) | def print_candidates(self, ent_ids_list):
method print_sample (line 657) | def print_sample(self, sample):
function create_entity_universe (line 716) | def create_entity_universe(gmonly_files=None, allspans_files=None, print...
function create_necessary_folders (line 767) | def create_necessary_folders():
function _parse_args (line 772) | def _parse_args():
function log_args (line 806) | def log_args(folderpath):
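
In "allspans" mode the SamplesGenerator does not restrict itself to gold mentions; it enumerates every contiguous span up to a maximum width as a potential mention and later attaches p(e|m) candidates to each. A toy sketch of that enumeration; the max_span_width value and the [left, right) convention are assumptions, and the real all_spans also applies filters such as span separators.

def all_spans(chunk_words, max_span_width=10):
    """Enumerate every contiguous span of up to max_span_width words."""
    spans = []
    for left in range(len(chunk_words)):
        for right in range(left + 1, min(left + max_span_width, len(chunk_words)) + 1):
            spans.append((left, right))  # token interval [left, right)
    return spans


print(all_spans(["Barack", "Obama", "visited", "Paris"], max_span_width=2))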
FILE: code/preprocessing/prepro_wikidump.py
function wikidump_to_new_format (line 9) | def wikidump_to_new_format():
function subset_wikidump_only_relevant_mentions (line 46) | def subset_wikidump_only_relevant_mentions():
function _parse_args (line 142) | def _parse_args():
FILE: code/preprocessing/util.py
function load_entities_universe (line 18) | def load_entities_universe():
function load_wikiid2nnid (line 29) | def load_wikiid2nnid(extension_name=None):
function load_entity_extension (line 47) | def load_entity_extension(wikiid2nnid, extension_name):
function reverse_dict (line 63) | def reverse_dict(d, unique_values=False):
function load_redirections (line 72) | def load_redirections(lowercase=None):
function load_disambiguations (line 93) | def load_disambiguations():
function load_persons (line 111) | def load_persons():
function load_wiki_name_id_map (line 126) | def load_wiki_name_id_map(lowercase=False, filepath=None):
class FetchCandidateEntities (line 170) | class FetchCandidateEntities(object):
method __init__ (line 174) | def __init__(self, args):
method process (line 182) | def process(self, span):
class FetchFilteredCoreferencedCandEntities (line 209) | class FetchFilteredCoreferencedCandEntities(object):
method __init__ (line 210) | def __init__(self, args):
method init_coref (line 218) | def init_coref(self, el_mode):
method process (line 222) | def process(self, left, right, chunk_words):
method find_corefence_person (line 247) | def find_corefence_person(self, span_text, left_right_words):
class EntityNameIdMap (line 273) | class EntityNameIdMap(object):
method __init__ (line 274) | def __init__(self):
method init_compatible_ent_id (line 277) | def init_compatible_ent_id(self):
method init_gerbil_compatible_ent_id (line 280) | def init_gerbil_compatible_ent_id(self):
method init_hyperlink2id (line 284) | def init_hyperlink2id(self):
method hyperlink2id (line 291) | def hyperlink2id(self, line):
method is_valid_entity_id (line 314) | def is_valid_entity_id(self, ent_id):
method compatible_ent_id (line 317) | def compatible_ent_id(self, name=None, ent_id=None):
method gerbil_compatible_ent_id (line 332) | def gerbil_compatible_ent_id(self, uri):
function custom_p_e_m (line 348) | def custom_p_e_m(cand_ent_num=15, allowed_entities_set=None,
function get_immediate_files (line 445) | def get_immediate_files(a_dir):
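
FetchCandidateEntities / FetchFilteredCoreferencedCandEntities map a span's surface form to its top candidate entities using the p(e|m) dictionary, with a lowercased fallback (and, in the coreference variant, a persons heuristic). A minimal sketch of the lookup, with an illustrative data layout of mention -> [(entity, prior), ...]; names and the default candidate count are assumptions.

class FetchCandidateEntities:
    """Look the span text up in a p(e|m) dictionary, fall back to a lowercased
    key, and return the top-k candidates with their priors."""

    def __init__(self, p_e_m, p_e_m_low, cand_ent_num=30):
        self.p_e_m = p_e_m
        self.p_e_m_low = p_e_m_low
        self.cand_ent_num = cand_ent_num

    def process(self, span):
        candidates = self.p_e_m.get(span) or self.p_e_m_low.get(span.lower())
        if not candidates:
            return [], []
        candidates = sorted(candidates, key=lambda x: x[1], reverse=True)[:self.cand_ent_num]
        entities, priors = zip(*candidates)
        return list(entities), list(priors)


fetcher = FetchCandidateEntities({"Paris": [("Paris", 0.92), ("Paris_Hilton", 0.03)]},
                                 {"paris": [("Paris", 0.92), ("Paris_Hilton", 0.03)]})
print(fetcher.process("Paris"))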
FILE: gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/EDResource.java
class EDResource (line 40) | public class EDResource extends ServerResource {
method accept (line 49) | @Post
method sendRequestToCluster (line 65) | public List<Marking> sendRequestToCluster(String text , List<Marking> ...
method queryJson (line 103) | private JsonArray queryJson(String text, List<Marking> markings, Strin...
FILE: gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/LocalIntermediateWebserver.java
class LocalIntermediateWebserver (line 37) | public class LocalIntermediateWebserver extends ServerResource {
method accept (line 47) | @Post
method sendRequestToCluster (line 80) | public List<Marking> sendRequestToCluster(String text , List<Marking> ...
method queryJson (line 106) | private JsonObject queryJson(String text, List<Marking> markings, Stri...
FILE: gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/MyResource.java
class MyResource (line 39) | public class MyResource extends ServerResource {
method accept (line 50) | @Post
method sendRequestToCluster (line 83) | public List<Marking> sendRequestToCluster(String text , List<Marking> ...
method queryJson (line 121) | private JsonArray queryJson(String text, List<Marking> markings, Strin...
FILE: gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/SpotlightClient.java
class SpotlightClient (line 41) | public class SpotlightClient {
method SpotlightClient (line 61) | public SpotlightClient() {
method SpotlightClient (line 80) | public SpotlightClient(final String serviceURL) {
method request (line 89) | protected String request(final String inputText, final String requestU...
method annotateSavely (line 130) | public List<TypedNamedEntity> annotateSavely(final Document document) {
method annotate (line 139) | public List<TypedNamedEntity> annotate(final Document document) throws...
method parseAnnotationResponse (line 144) | protected List<TypedNamedEntity> parseAnnotationResponse(final String ...
method spotSavely (line 193) | public List<Span> spotSavely(final Document document) {
method spot (line 202) | public List<Span> spot(final Document document) throws IOException {
method parseSpottingResponse (line 207) | protected List<Span> parseSpottingResponse(final String response) {
method disambiguate (line 236) | public List<TypedNamedEntity> disambiguate(final Document document) th...
method disambiguateSavely (line 260) | public List<TypedNamedEntity> disambiguateSavely(final Document docume...
method main (line 269) | @SuppressWarnings({"rawtypes", "unchecked"})
FILE: gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/SpotlightResource.java
class SpotlightResource (line 17) | public class SpotlightResource extends ServerResource {
method accept (line 25) | @Post
FILE: gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/TestApplication.java
class TestApplication (line 25) | public class TestApplication extends Application {
method createInboundRoot (line 30) | @Override
Condensed preview — 169 files, each showing path, character count, and a content snippet (9,532K chars of structured content in total).
[
{
"path": ".gitignore",
"chars": 159,
"preview": "*.aux\n*.bbl\n*.bcf\n*.blg\n*.log\n*.out\n*.run.xml\n*.toc\n*.synctex.gz\ndata/\n**.idea/\n**__pycache__/\n*.swp\n*.~lock.*ods#\nend2e"
},
{
"path": "Examples _ End-to-End Neural Entity Linking.ipynb",
"chars": 7032,
"preview": "{\n \"cells\": [\n {\n \"cell_type\": \"markdown\",\n \"metadata\": {},\n \"source\": [\n \"## Try on your own input\\n\",\n \"T"
},
{
"path": "LICENSE",
"chars": 11358,
"preview": "\n Apache License\n Version 2.0, January 2004\n "
},
{
"path": "code/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "code/evaluation/metrics.py",
"chars": 20354,
"preview": "import numpy as np\nfrom collections import defaultdict\nfrom operator import itemgetter\nimport tensorflow as tf\n\n\nclass E"
},
{
"path": "code/evaluation/metrics_old.py",
"chars": 15155,
"preview": "import numpy as np\nfrom collections import defaultdict\nfrom operator import itemgetter\nimport tensorflow as tf\n\n\nclass E"
},
{
"path": "code/evaluation/print_predictions (copy).py",
"chars": 8487,
"preview": "\nfrom termcolor import colored\nimport pickle\nfrom preprocessing.util import load_wikiid2nnid, reverse_dict, load_wiki_na"
},
{
"path": "code/evaluation/print_predictions.py",
"chars": 20490,
"preview": "\nfrom termcolor import colored\nimport pickle\nfrom preprocessing.util import load_wikiid2nnid, reverse_dict, load_wiki_na"
},
{
"path": "code/evaluation/summarize_all_experiments.py",
"chars": 7088,
"preview": "import argparse\nimport os\n\n\ndef process_experiment(ed_acc, el_acc, training_name):\n if not os.path.exists(os.path.joi"
},
{
"path": "code/gerbil/build_entity_universe.py",
"chars": 2549,
"preview": "\nimport pickle\nfrom nltk.tokenize import word_tokenize\nimport preprocessing.prepro_util as prepro_util\nfrom preprocessin"
},
{
"path": "code/gerbil/gerbil_recall_calculation.py",
"chars": 1378,
"preview": "\nimport argparse\nimport os\nimport preprocessing.util as util\nimport rdflib\n\n\nclass ProcessDataset(object):\n def __ini"
},
{
"path": "code/gerbil/nn_processing.py",
"chars": 18201,
"preview": "\nfrom model.model_ablations import Model\nfrom time import sleep\nimport tensorflow as tf\nimport pickle\nfrom nltk.tokenize"
},
{
"path": "code/gerbil/nn_processing_correct.py",
"chars": 21705,
"preview": "\nfrom model.model_ablations import Model\nfrom time import sleep\nimport tensorflow as tf\nimport pickle\nfrom nltk.tokenize"
},
{
"path": "code/gerbil/server.py",
"chars": 3897,
"preview": "from http.server import BaseHTTPRequestHandler, HTTPServer\nimport json\nimport argparse\nimport model.config as config\nfro"
},
{
"path": "code/model/base_model.py",
"chars": 6816,
"preview": "import os\nimport tensorflow as tf\n\n\nclass BaseModel(object):\n\n def __init__(self, args):\n self.args = args\n "
},
{
"path": "code/model/config.py",
"chars": 97,
"preview": "base_folder = \"../\"\nspans_separators = [\".\"] #maybe also try ['.', ',', ';']\n\nunk_ent_id = \"0\"\n\n"
},
{
"path": "code/model/ed_model_original.py",
"chars": 36093,
"preview": "import numpy as np\nimport pickle\nimport tensorflow as tf\nimport model.config as config\nfrom .base_model import BaseModel"
},
{
"path": "code/model/ensemble_eval.py",
"chars": 9725,
"preview": "import argparse\nimport pickle\nimport model.config as config\nimport os\nimport tensorflow as tf\nfrom model.model_ablations"
},
{
"path": "code/model/evaluate.py",
"chars": 11656,
"preview": "import argparse\nimport pickle\nimport model.config as config\nimport os\nimport tensorflow as tf\nfrom model.model_ablations"
},
{
"path": "code/model/model.py",
"chars": 26571,
"preview": "\n# b9d87f7 on Mar 21 Nikolaos Kolitsas ffnn dropout and some minor modif in evaluate to accept entity extension\n# ed_mo"
},
{
"path": "code/model/model_ablations.py",
"chars": 32890,
"preview": "import numpy as np\nimport pickle\nimport tensorflow as tf\nimport model.config as config\n\nfrom .base_model import BaseMode"
},
{
"path": "code/model/reader.py",
"chars": 4143,
"preview": "import tensorflow as tf\nimport argparse\nimport model.config as config\nimport pickle\n\ndef parse_sequence_example(serializ"
},
{
"path": "code/model/train.py",
"chars": 26924,
"preview": "import argparse\nimport model.reader as reader\nimport model.config as config\nimport os\nimport tensorflow as tf\nfrom evalu"
},
{
"path": "code/model/util.py",
"chars": 9256,
"preview": "import tensorflow as tf\nimport numpy as np\nimport model.config as config\n\ndef projection(inputs, output_size, initialize"
},
{
"path": "code/preprocessing/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "code/preprocessing/aida_insight.py",
"chars": 1992,
"preview": "import model.config as config\n\n\ndef process_file(filename):\n entities = set()\n mentions = set()\n with open(conf"
},
{
"path": "code/preprocessing/bridge_code_lua/ent_vecs_from_txt_to_npy.py",
"chars": 1463,
"preview": "import numpy as np\nfrom preprocessing.util import load_wikiid2nnid\nimport model.config as config\n\n\ndef keep_only_new_ent"
},
{
"path": "code/preprocessing/bridge_code_lua/ent_vecs_to_txt.lua",
"chars": 1177,
"preview": "require 'torch'\nrequire 'nn'\nrequire 'os'\nel_path = os.getenv(\"EL_PATH\")\ncmd = torch.CmdLine()\ncmd:option('-ent_vecs_fil"
},
{
"path": "code/preprocessing/extra.py",
"chars": 14318,
"preview": "import pickle\nfrom collections import defaultdict\nimport numpy as np\nimport time\nimport sys\nimport os\n\nimport preprocess"
},
{
"path": "code/preprocessing/old/old_code.py",
"chars": 3948,
"preview": "\n\n\ndef p_e_m_disamb_redirect_wikinameid_maps():\n\n wall_start = time.time()\n redirections = dict()\n with open(co"
},
{
"path": "code/preprocessing/old/prepro_datasets1_once.py",
"chars": 3784,
"preview": "import pickle\nimport argparse\nimport time\n#import datetime\nfrom collections import defaultdict\n\n#import sys\n#import os\n#"
},
{
"path": "code/preprocessing/old/preprocess1.py",
"chars": 7054,
"preview": "import pickle\nimport argparse\nfrom collections import defaultdict\nfrom nltk.tokenize import sent_tokenize\nfrom nltk.toke"
},
{
"path": "code/preprocessing/old/preprocessv2.py",
"chars": 9060,
"preview": "import pickle\nimport argparse\nfrom collections import defaultdict\nimport numpy as np\nimport time\nimport sys\nimport os\n\n#"
},
{
"path": "code/preprocessing/p_e_m.py",
"chars": 6736,
"preview": "import pickle\nfrom collections import defaultdict\nfrom nltk.tokenize import word_tokenize\nimport time\nimport sys\nimport "
},
{
"path": "code/preprocessing/prepro_aida.py",
"chars": 4059,
"preview": "import argparse\nimport os\nimport preprocessing.util as util\n\ndef process_aida(in_filepath, out_filepath):\n\n # _, wiki"
},
{
"path": "code/preprocessing/prepro_aida_tokenize.py",
"chars": 5388,
"preview": "import argparse\nimport os\nimport preprocessing.util as util\nfrom subprocess import call\n\ndef process_aida(in_filepath, o"
},
{
"path": "code/preprocessing/prepro_gerbil_datasets.py",
"chars": 8811,
"preview": "import argparse\nimport os\nimport preprocessing.util as util\nfrom nltk.tokenize import word_tokenize\n\n\nclass ProcessDatas"
},
{
"path": "code/preprocessing/prepro_other_datasets.py",
"chars": 4322,
"preview": "import argparse\nimport os\nimport preprocessing.util as util\nimport xml.etree.ElementTree as ET\nfrom subprocess import ca"
},
{
"path": "code/preprocessing/prepro_util.py",
"chars": 40447,
"preview": "import pickle\nfrom collections import defaultdict, namedtuple\nimport numpy as np\nimport argparse\n\nimport os\nimport model"
},
{
"path": "code/preprocessing/prepro_wikidump.py",
"chars": 6667,
"preview": "import argparse\nimport os\nimport sys\nimport preprocessing.util as util\nimport preprocessing.config as config\nimport trac"
},
{
"path": "code/preprocessing/util.py",
"chars": 20099,
"preview": "import pickle\nfrom collections import defaultdict\nimport numpy as np\nimport time\nimport sys\n#print(\"preprocessing/util.p"
},
{
"path": "code/script",
"chars": 2623,
"preview": "time python -m preprocessing.prepro_util --p_e_m_choice=yago --cand_ent_num=5 \\\n --allowed_entities_set_top=100000"
},
{
"path": "create_entity_vectors.md",
"chars": 11587,
"preview": "# Creating Entity Vectors\nAs it is already mentioned, we have created entity vectors for 502661 entities from many\ndiffe"
},
{
"path": "deep-ed/deep-ed-master/LICENSE",
"chars": 11358,
"preview": "\n Apache License\n Version 2.0, January 2004\n "
},
{
"path": "deep-ed/deep-ed-master/README.md",
"chars": 9825,
"preview": "# Source code for \"Deep Joint Entity Disambiguation with Local Neural Attention\"\n\n[O-E. Ganea and T. Hofmann, full paper"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_p_e_m/gen_p_e_m_from_wiki.lua",
"chars": 3535,
"preview": "-- Generate p(e|m) index from Wikipedia\n-- Run: th data_gen/gen_p_e_m/gen_p_e_m_from_wiki.lua -root_data_dir $DATA_PATH\n"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_p_e_m/gen_p_e_m_from_yago.lua",
"chars": 2521,
"preview": "-- Generate p(e|m) index from Wikipedia\n-- Run: th data_gen/gen_p_e_m/gen_p_e_m_from_yago.lua -root_data_dir $DATA_PATH\n"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_p_e_m/merge_crosswikis_wiki.lua",
"chars": 4168,
"preview": "-- Merge Wikipedia and Crosswikis p(e|m) indexes\n-- Run: th data_gen/gen_p_e_m/merge_crosswikis_wiki.lua -root_data_dir "
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_p_e_m/unicode_map.lua",
"chars": 8639,
"preview": "local unicode = {'\\\\u00bb', '\\\\u007d', '\\\\u00a1', '\\\\u0259', '\\\\u0641', '\\\\u0398', '\\\\u00fd', '\\\\u0940', '\\\\u00f9', '\\\\u"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_test_train_data/gen_ace_msnbc_aquaint_csv.lua",
"chars": 6955,
"preview": "-- Generate test data from the ACE/MSNBC/AQUAINT datasets by keeping the context and\n-- entity candidates for each annot"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_test_train_data/gen_aida_test.lua",
"chars": 7595,
"preview": "-- Generate test data from the AIDA dataset by keeping the context and\n-- entity candidates for each annotated mention\n\n"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_test_train_data/gen_aida_train.lua",
"chars": 5981,
"preview": "-- Generate train data from the AIDA dataset by keeping the context and\n-- entity candidates for each annotated mention\n"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_test_train_data/gen_all.lua",
"chars": 461,
"preview": "-- Generates all training and test data for entity disambiguation.\n\nif not ent_p_e_m_index then\n require 'torch'\n dofi"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_wiki_data/gen_ent_wiki_w_repr.lua",
"chars": 2859,
"preview": "if not opt then\n cmd = torch.CmdLine()\n cmd:option('-root_data_dir', '', 'Root path of the data, $DATA_PATH.')\n cmd:t"
},
{
"path": "deep-ed/deep-ed-master/data_gen/gen_wiki_data/gen_wiki_hyp_train_data.lua",
"chars": 5542,
"preview": "-- Generate training data from Wikipedia hyperlinks by keeping the context and\n-- entity candidates for each hyperlink\n\n"
},
{
"path": "deep-ed/deep-ed-master/data_gen/indexes/wiki_disambiguation_pages_index.lua",
"chars": 830,
"preview": "-- Loads the link disambiguation index from Wikipedia\n\nif not opt then\n cmd = torch.CmdLine()\n cmd:option('-root_data_"
},
{
"path": "deep-ed/deep-ed-master/data_gen/indexes/wiki_redirects_index.lua",
"chars": 970,
"preview": "-- Loads the link redirect index from Wikipedia\n\nif not opt then\n cmd = torch.CmdLine()\n cmd:option('-root_data_dir', "
},
{
"path": "deep-ed/deep-ed-master/data_gen/indexes/yago_crosswikis_wiki.lua",
"chars": 5644,
"preview": "-- Loads the merged p(e|m) index.\nif not opt then\n cmd = torch.CmdLine()\n cmd:option('-root_data_dir', '', 'Root path "
},
{
"path": "deep-ed/deep-ed-master/data_gen/parse_wiki_dump/parse_wiki_dump_tools.lua",
"chars": 5575,
"preview": "-- Utility functions to extract the text and hyperlinks from each page in the Wikipedia corpus.\n\nif not table_len then\n "
},
{
"path": "deep-ed/deep-ed-master/ed/args.lua",
"chars": 5119,
"preview": "-- We add params abbreviations to the abbv map if they are important hyperparameters\n-- used to differentiate between di"
},
{
"path": "deep-ed/deep-ed-master/ed/ed.lua",
"chars": 1160,
"preview": "require 'optim'\nrequire 'torch'\nrequire 'gnuplot'\nrequire 'nn'\nrequire 'xlua'\n\ntds = tds or require 'tds'\ndofile 'utils/"
},
{
"path": "deep-ed/deep-ed-master/ed/loss.lua",
"chars": 267,
"preview": "if opt.loss == 'nll' then\n criterion = nn.CrossEntropyCriterion()\nelse\n -- max-margin with margin parameter = 0.01\n c"
},
{
"path": "deep-ed/deep-ed-master/ed/minibatch/build_minibatch.lua",
"chars": 10577,
"preview": "-- Builds and fills a minibatch. In our case, minibatches are variable sized because they\n-- contain all mentions in opt"
},
{
"path": "deep-ed/deep-ed-master/ed/minibatch/data_loader.lua",
"chars": 2965,
"preview": "-- Data loader for training of ED models.\n\ntrain_file = opt.root_data_dir .. 'generated/test_train_data/aida_train.csv'\n"
},
{
"path": "deep-ed/deep-ed-master/ed/models/SetConstantDiag.lua",
"chars": 1783,
"preview": "-- Torch layer that receives as input a squared matrix and sets its diagonal to a constant value.\n\nlocal SetConstantDiag"
},
{
"path": "deep-ed/deep-ed-master/ed/models/linear_layers.lua",
"chars": 2039,
"preview": "-- Define all parametrized layers: linear layers (diagonal matrices) A,B,C + network f\n\nfunction new_linear_layer(out_di"
},
{
"path": "deep-ed/deep-ed-master/ed/models/model.lua",
"chars": 508,
"preview": "dofile 'ed/models/SetConstantDiag.lua'\ndofile 'ed/models/linear_layers.lua'\ndofile 'ed/models/model_local.lua'\ndofile 'e"
},
{
"path": "deep-ed/deep-ed-master/ed/models/model_global.lua",
"chars": 6425,
"preview": "-- Definition of the neural network used for global (joint) ED. Section 5 of our paper. \n-- It unrolls a fixed number of"
},
{
"path": "deep-ed/deep-ed-master/ed/models/model_local.lua",
"chars": 7553,
"preview": "-- Definition of the local neural network with attention used for local (independent per each mention) ED. \n-- Section 4"
},
{
"path": "deep-ed/deep-ed-master/ed/test/check_coref.lua",
"chars": 1104,
"preview": "-- Runs our trivial coreference resolution method and outputs the new set of \n-- entity candidates. Used for debugging t"
},
{
"path": "deep-ed/deep-ed-master/ed/test/coref_persons.lua",
"chars": 7033,
"preview": "-- Given a dataset, try to retrieve better entity candidates\n-- for ambiguous mentions of persons. For example, suppose "
},
{
"path": "deep-ed/deep-ed-master/ed/test/ent_freq_stats_test.lua",
"chars": 1121,
"preview": "-- Statistics of annotated entities based on their frequency in Wikipedia corpus \n-- Table 6 (left) from our paper\nlocal"
},
{
"path": "deep-ed/deep-ed-master/ed/test/ent_p_e_m_stats_test.lua",
"chars": 1183,
"preview": "-- Statistics of annotated entities based on their p(e|m) prio\n-- Table 6 (right) from our paper\n\nlocal function ent_pri"
},
{
"path": "deep-ed/deep-ed-master/ed/test/test.lua",
"chars": 14489,
"preview": "dofile 'ed/test/coref_persons.lua'\ndofile 'ed/test/ent_freq_stats_test.lua'\ndofile 'ed/test/ent_p_e_m_stats_test.lua'\n\nt"
},
{
"path": "deep-ed/deep-ed-master/ed/test/test_one_loaded_model.lua",
"chars": 1374,
"preview": "-- Test one single ED model trained using ed/ed.lua\n\n-- Run: CUDA_VISIBLE_DEVICES=0 th ed/test/test_one_loaded_model.lua"
},
{
"path": "deep-ed/deep-ed-master/ed/train.lua",
"chars": 4335,
"preview": "-- Training of ED models.\n\nif opt.opt == 'SGD' then\n optimState = {\n learningRate = opt.lr,\n momentum = 0.9,"
},
{
"path": "deep-ed/deep-ed-master/ent_vecs_scores.txt",
"chars": 35611,
"preview": "\u001b[0m\u001b[33mmeasure =\u001b[39m\u001b[0m\t\u001b[0mNDCG1\u001b[0m\t\u001b[0mNDCG5\u001b[0m\t\u001b[0mNDCG10\u001b[0m\t\u001b[0mMAP\u001b[0m\t\u001b[0mTOTAL VALIDATION\u001b[0m\t\r\n\u001b[0m\u001b[3"
},
{
"path": "deep-ed/deep-ed-master/entities/ent_name2id_freq/e_freq_gen.lua",
"chars": 1816,
"preview": "-- Creates a file that contains entity frequencies.\n\nif not opt then\n cmd = torch.CmdLine()\n cmd:option('-root_data_di"
},
{
"path": "deep-ed/deep-ed-master/entities/ent_name2id_freq/e_freq_index.lua",
"chars": 1194,
"preview": "-- Loads an index containing entity -> frequency pairs. \n-- TODO: rewrite this file in a simpler way (is complicated bec"
},
{
"path": "deep-ed/deep-ed-master/entities/ent_name2id_freq/ent_name_id.lua",
"chars": 6099,
"preview": "------------------ Load entity name-id mappings ------------------\n-- Each entity has:\n-- a) a Wikipedia URL referred "
},
{
"path": "deep-ed/deep-ed-master/entities/learn_e2v/4EX_wiki_words.lua",
"chars": 304644,
"preview": "-- Small dataset. Used for debugging / unit testing\n\nif not is_stop_word_or_number then\n dofile 'words/stop_words.lua'\n"
},
{
"path": "deep-ed/deep-ed-master/entities/learn_e2v/batch_dataset_a.lua",
"chars": 2676,
"preview": "dofile 'utils/utils.lua'\n\nif opt.entities == 'ALL' then\n wiki_words_train_file = opt.root_data_dir .. 'generated/wiki_c"
},
{
"path": "deep-ed/deep-ed-master/entities/learn_e2v/e2v_a.lua",
"chars": 3484,
"preview": "-- Entity embeddings utilities\n\nassert(opt.num_words_per_ent)\n\n-- Word lookup:\ngeom_w2v_M = w2vutils.M:float()\n\n-- Stats"
},
{
"path": "deep-ed/deep-ed-master/entities/learn_e2v/learn_a.lua",
"chars": 9346,
"preview": "-- Training of entity embeddings. \n\n-- To run:\n-- i) delete all _RLTD files\n-- ii) th entities/relatedness/filter_wiki_"
},
{
"path": "deep-ed/deep-ed-master/entities/learn_e2v/minibatch_a.lua",
"chars": 5733,
"preview": "assert(opt.entities == '4EX' or opt.entities == 'ALL' or opt.entities == 'RLTD', opt.entities)\n\nfunction empty_minibatch"
},
{
"path": "deep-ed/deep-ed-master/entities/learn_e2v/model_a.lua",
"chars": 5522,
"preview": "-- Definition of the neural network used to learn entity embeddings.\n-- To run a simple unit test that checks the forwar"
},
{
"path": "deep-ed/deep-ed-master/entities/pretrained_e2v/check_ents.lua",
"chars": 1098,
"preview": "if not opt then\n cmd = torch.CmdLine()\n cmd:option('-root_data_dir', '', 'Root path of the data, $DATA_PATH.')\n cmd:o"
},
{
"path": "deep-ed/deep-ed-master/entities/pretrained_e2v/e2v.lua",
"chars": 4538,
"preview": "-- Loads pre-trained entity vectors trained using the file entity/learn_e2v/learn_a.lua\n\nassert(opt.ent_vecs_filename)\np"
},
{
"path": "deep-ed/deep-ed-master/entities/pretrained_e2v/e2v_txt_reader.lua",
"chars": 909,
"preview": "print('==> loading e2v')\n\nlocal V = torch.ones(get_total_num_ents(), ent_vecs_size):mul(1e-10) -- not zero because of c"
},
{
"path": "deep-ed/deep-ed-master/entities/relatedness/filter_wiki_canonical_words_RLTD.lua",
"chars": 970,
"preview": "\nif not opt then\n cmd = torch.CmdLine()\n cmd:option('-root_data_dir', '', 'Root path of the data, $DATA_PATH.')\n cmd:"
},
{
"path": "deep-ed/deep-ed-master/entities/relatedness/filter_wiki_hyperlink_contexts_RLTD.lua",
"chars": 2228,
"preview": "-- Filter all training data s.t. only candidate entities and ground truth entities for which\n-- we have a valid entity e"
},
{
"path": "deep-ed/deep-ed-master/entities/relatedness/relatedness.lua",
"chars": 14161,
"preview": "-- The code in this file does two things:\n-- a) extracts and puts the entity relatedness dataset in two maps (reltd_va"
},
{
"path": "deep-ed/deep-ed-master/log_train_entity_vecs",
"chars": 0,
"preview": ""
},
{
"path": "deep-ed/deep-ed-master/our_system_annotations.txt",
"chars": 7184737,
"preview": "=====> TESTING <=== \t\n\n===> AQUAINT; num mentions = 727\t\n\t\n\u001b[34m============================================\u001b[39m\t\n\u001b[34m"
},
{
"path": "deep-ed/deep-ed-master/utils/logger.lua",
"chars": 4797,
"preview": "--[[ Logger: a simple class to log symbols during training,\n and automate plot generation\nExample:\n logger = o"
},
{
"path": "deep-ed/deep-ed-master/utils/optim/adadelta_mem.lua",
"chars": 2051,
"preview": "-- Memory optimized implementation of optim.adadelta\n\n--[[ ADADELTA implementation for SGD http://arxiv.org/abs/1212.570"
},
{
"path": "deep-ed/deep-ed-master/utils/optim/adagrad_mem.lua",
"chars": 1867,
"preview": "-- Memory optimized implementation of optim.adagrad\n\n--[[ ADAGRAD implementation for SGD\nARGS:\n- `opfunc` : a function t"
},
{
"path": "deep-ed/deep-ed-master/utils/optim/rmsprop_mem.lua",
"chars": 1887,
"preview": "-- Memory optimized implementation of optim.rmsprop\n\n--[[ An implementation of RMSprop\nARGS:\n- 'opfunc' : a function tha"
},
{
"path": "deep-ed/deep-ed-master/utils/utils.lua",
"chars": 2523,
"preview": "function topk(one_dim_tensor, k) \n local bestk, indices = torch.topk(one_dim_tensor, k, true)\n local sorted, newindice"
},
{
"path": "deep-ed/deep-ed-master/words/load_w_freq_and_vecs.lua",
"chars": 3567,
"preview": "-- Loads all common words in both Wikipedia and Word2vec/Glove , their unigram frequencies and their pre-trained Word2Ve"
},
{
"path": "deep-ed/deep-ed-master/words/stop_words.lua",
"chars": 5197,
"preview": "all_stop_words = { ['a'] = 1, ['about'] = 1, ['above'] = 1, ['across'] = 1, ['after'] = 1, ['afterwards'] = 1, ['again']"
},
{
"path": "deep-ed/deep-ed-master/words/w2v/glove_reader.lua",
"chars": 335,
"preview": "local M = torch.zeros(total_num_words(), word_vecs_size):float()\n\n--Reading Contents\nfor line in io.lines(w2v_txtfilenam"
},
{
"path": "deep-ed/deep-ed-master/words/w2v/w2v.lua",
"chars": 9659,
"preview": "-- Loads pre-trained word embeddings from either Word2Vec or Glove\n\nassert(get_id_from_word)\nassert(common_w2v_freq_word"
},
{
"path": "deep-ed/deep-ed-master/words/w2v/word2vec_reader.lua",
"chars": 972,
"preview": "-- Adapted from https://github.com/rotmanmi/word2vec.torch\nfunction read_string_w2v(file) \n local str = {}\n while tru"
},
{
"path": "deep-ed/deep-ed-master/words/w_freq/w_freq_gen.lua",
"chars": 1479,
"preview": "-- Computes an unigram frequency of each word in the Wikipedia corpus\n\nif not opt then\n cmd = torch.CmdLine()\n cmd:opt"
},
{
"path": "deep-ed/deep-ed-master/words/w_freq/w_freq_index.lua",
"chars": 4735,
"preview": "-- Loads all words and their frequencies and IDs from a dictionary.\nassert(common_w2v_freq_words)\nif not opt.unig_power "
},
{
"path": "gerbil-SpotWrapNifWS4Test/.gitignore",
"chars": 76,
"preview": "*.class\n*.cache\n.classpath\n.project\ntarget\n.settings\ngerbil_keys.properties\n"
},
{
"path": "gerbil-SpotWrapNifWS4Test/Dockerfile",
"chars": 235,
"preview": "FROM tomcat:8.0.36-jre8\n\n# remove the default tomcat application\nRUN rm -rf /usr/local/tomcat/webapps/ROOT /usr/local/to"
},
{
"path": "gerbil-SpotWrapNifWS4Test/LICENSE",
"chars": 34520,
"preview": " GNU AFFERO GENERAL PUBLIC LICENSE\n Version 3, 19 November 2007\n\n Copyright (C)"
},
{
"path": "gerbil-SpotWrapNifWS4Test/Makefile",
"chars": 220,
"preview": "default: build dockerize\n\nbuild:\n mvn clean package -U\n\ndockerize:\n docker build -t git.project-hobbit.eu:4567/ger"
},
{
"path": "gerbil-SpotWrapNifWS4Test/README.md",
"chars": 306,
"preview": "Gerbil\n========\n<i>General Entity Annotator Benchmark</i>\n\nThis branch is part of the Gerbil project. It contains a very"
},
{
"path": "gerbil-SpotWrapNifWS4Test/curlExample.sh",
"chars": 122,
"preview": "curl -d \"@example.ttl\" -H \"Content-Type: application/x-turtle\" http://localhost:8080/gerbil-spotWrapNifWS4Test/spotlight"
},
{
"path": "gerbil-SpotWrapNifWS4Test/docker-compose.yml",
"chars": 157,
"preview": "version: '2'\nservices:\n # spotwrapnifws4test .\n nifws4test:\n build: .\n restart: always\n container_name: nifws"
},
{
"path": "gerbil-SpotWrapNifWS4Test/example.ttl",
"chars": 1284,
"preview": "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n@pr"
},
{
"path": "gerbil-SpotWrapNifWS4Test/my_notes/messages_format",
"chars": 222028,
"preview": "Request: AnnotatedDocumentImpl [uri=http://www.aksw.org/gerbil/NifWebService/request_117, text=NYMEX natgas ends sharply"
},
{
"path": "gerbil-SpotWrapNifWS4Test/my_notes/python_server_format",
"chars": 139988,
"preview": "\n\n\nfor TEST-A uri=http://www.aksw.org/gerbil/NifWebService/request_215, was the last one\n\nreceived post body: b'{\"sp"
},
{
"path": "gerbil-SpotWrapNifWS4Test/my_notes/python_server_format_ed",
"chars": 166046,
"preview": "\n\n[[Node: IteratorGetNext = IteratorGetNext[output_shapes=[ \n[?], [?,?], [?], [?,?,?], [?,?], "
},
{
"path": "gerbil-SpotWrapNifWS4Test/my_notes/python_server_format_el",
"chars": 139988,
"preview": "\n\n\nfor TEST-A uri=http://www.aksw.org/gerbil/NifWebService/request_215, was the last one\n\nreceived post body: b'{\"sp"
},
{
"path": "gerbil-SpotWrapNifWS4Test/pom.xml",
"chars": 3703,
"preview": "<project xmlns=\"http://maven.apache.org/POM/4.0.0\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n\txsi:schemaLoca"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.md5",
"chars": 32,
"preview": "d8cb94795d15a968a097109b2739f6f5"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-javadoc.jar.sha1",
"chars": 40,
"preview": "0b44e6e4b45bdbaf175e6cef700fef1f971542b9"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.md5",
"chars": 32,
"preview": "a622142dc8912285288b320b5bf51a3b"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT-sources.jar.sha1",
"chars": 40,
"preview": "ea177f3f37dfe9ab6ab032b4689ed3f4e90c69c6"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.md5",
"chars": 32,
"preview": "2ba688c9e81ac41ebf8948bd6c2ec228"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.jar.sha1",
"chars": 40,
"preview": "c9d243a0a93e95804bbeb740b4f756f3eb4b9a71"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom",
"chars": 5377,
"preview": "<!-- The MIT License (MIT) Copyright (C) ${year} Agile Knowledge Engineering \n\tand Semantic Web (AKSW) (usbeck@informati"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.md5",
"chars": 32,
"preview": "bd93e677c0a2041a4e2d205ff00eaf30"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/gerbil.nif.transfer-1.1.0-SNAPSHOT.pom.sha1",
"chars": 40,
"preview": "303d1b06abe9fa32d1c3470e977882590202a475"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml",
"chars": 1123,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<metadata modelVersion=\"1.1.0\">\n <groupId>org.aksw</groupId>\n <artifactId>gerbi"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.md5",
"chars": 32,
"preview": "8d28899dd51b52fc95bce996f12f0fb1"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/1.1.0-SNAPSHOT/maven-metadata-local.xml.sha1",
"chars": 40,
"preview": "8cdcb67c4c86e693884264f9cb92f04ae0b9b788"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml",
"chars": 387,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<metadata>\n <groupId>org.aksw</groupId>\n <artifactId>gerbil.nif.transfer</artif"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.md5",
"chars": 32,
"preview": "0b6e4e09b60735f465f60ce53de21b22"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/aksw/gerbil.nif.transfer/maven-metadata-local.xml.sha1",
"chars": 40,
"preview": "af04aebdca0e9affc3dd1c4f714097b4a8f4b8b0"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.md5",
"chars": 32,
"preview": "f0a69fc7748a4859f185869a803fd31d"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.jar.sha1",
"chars": 40,
"preview": "e5ef2d645af76d7787f4fa5c360d4916864795f3"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom",
"chars": 1626,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!--\n\n Copyright (C) 2014 Michael Röder (michael.roeder@unister.de)\n\n Permi"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.md5",
"chars": 32,
"preview": "fa265a18a02fe6b1afee5d59ef09f9b2"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/2.2.1/org.restlet-2.2.1.pom.sha1",
"chars": 40,
"preview": "bf2f49d20b9b7708c5fdf548d9d5403e95e5824d"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/maven-metadata-local.xml",
"chars": 1460,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!--\n\n Copyright (C) 2014 Michael Röder (michael.roeder@unister.de)\n\n Permi"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/maven-metadata-local.xml.md5",
"chars": 32,
"preview": "08a59985f1be54d59c6a67bc52f71769"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet/maven-metadata-local.xml.sha1",
"chars": 40,
"preview": "b7656b95e6a29dd775f74fdbf58044f9c7122cbd"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.md5",
"chars": 32,
"preview": "83e5a0a3c182e3a0b54bb1c9ea89a4ba"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.jar.sha1",
"chars": 40,
"preview": "4f1f9af4af0c187ffdcbad3f7b20df3158a2b860"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom",
"chars": 1638,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!--\n\n Copyright (C) 2014 Michael Röder (michael.roeder@unister.de)\n\n Permi"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.md5",
"chars": 32,
"preview": "119436ba0b07f90b0ee6bed3bb5a38c9"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/2.2.1/org.restlet.ext.servlet-2.2.1.pom.sha1",
"chars": 40,
"preview": "20160a0a755fcbd86a1a3a0ef6092247a67b0b56"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml",
"chars": 1472,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!--\n\n Copyright (C) 2014 Michael Röder (michael.roeder@unister.de)\n\n Permi"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.md5",
"chars": 32,
"preview": "b25961e20f84c3eb27a617f9698731fd"
},
{
"path": "gerbil-SpotWrapNifWS4Test/repository/org/restlet/org.restlet.ext.servlet/maven-metadata-local.xml.sha1",
"chars": 40,
"preview": "27494e8d89841244a60eab9037402baf55f2eb35"
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/EDResource.java",
"chars": 5761,
"preview": "\npackage org.aksw.gerbil.ws4test;\n\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.Reader;\nimport"
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/LocalIntermediateWebserver.java",
"chars": 5860,
"preview": "package org.aksw.gerbil.ws4test;\n\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.Reader;\nimport "
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/MyResource.java",
"chars": 6631,
"preview": "package org.aksw.gerbil.ws4test;\n\nimport java.io.IOException;\nimport java.io.InputStream;\nimport java.io.Reader;\nimport "
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/SpotlightClient.java",
"chars": 10911,
"preview": "package org.aksw.gerbil.ws4test;\n\nimport java.io.BufferedReader;\nimport java.io.DataOutputStream;\nimport java.io.IOExcep"
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/SpotlightResource.java",
"chars": 1708,
"preview": "package org.aksw.gerbil.ws4test;\n\nimport java.io.IOException;\nimport java.io.Reader;\nimport java.util.ArrayList;\n\nimport"
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/TestApplication.java",
"chars": 1681,
"preview": "/**\n * Copyright (C) 2014 Michael Röder (michael.roeder@unister.de)\n *\n * Permission is hereby granted, free of charge, "
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/java/org/aksw/gerbil/ws4test/data_format",
"chars": 2669,
"preview": "\n\ned\n\nRequest: AnnotatedDocumentImpl [uri=http://www.aksw.org/gerbil/NifWebService/request_228, text=SOCCER - PSV HIT VO"
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/resources/log4j.properties",
"chars": 511,
"preview": "# Direct log messages to stdout\nlog4j.rootLogger=WARN,stdout,CATALINA\n\nlog4j.appender.stdout=org.apache.log4j.ConsoleApp"
},
{
"path": "gerbil-SpotWrapNifWS4Test/src/main/webapp/WEB-INF/web.xml",
"chars": 953,
"preview": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<web-app id=\"WebApp_ID\" version=\"2.4\" xmlns=\"http://java.sun.com/xml/ns/j2ee\" xml"
},
{
"path": "readme.md",
"chars": 12385,
"preview": "# End-to-End Neural Entity Linking (CoNLL 2018, full paper)\n### Python source code\n\n\n1: Setting up the environment.\n```\n"
},
{
"path": "requirements.txt",
"chars": 150,
"preview": "enum34==1.1.6\nrequests==2.18.4\nscipy==1.0.0\ntensorflow-tensorboard==0.4.0\nnumpy==1.14.0\ntermcolor==1.1.0\ntensorflow==2.5"
}
]
// ... and 8 more files
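
The curlExample.sh preview above contains the full call used to exercise the gerbil-SpotWrapNifWS4Test wrapper: it POSTs a NIF/Turtle document to the locally deployed servlet, which forwards it to the Python annotation server. For reference, a minimal Python sketch of the same request is given below, using the requests package pinned in requirements.txt. The endpoint URL and the example.ttl payload are taken from that preview; the helper name annotate_ttl, the timeout, and the error handling are illustrative assumptions, not part of the repository.

# Sketch of the call in gerbil-SpotWrapNifWS4Test/curlExample.sh, rewritten
# with the `requests` library. Endpoint and payload mirror the preview;
# timeout and error handling are assumptions, not the project's API.
import requests

ENDPOINT = "http://localhost:8080/gerbil-spotWrapNifWS4Test/spotlight"  # from curlExample.sh

def annotate_ttl(path="example.ttl"):
    """POST a NIF/Turtle document to the wrapper and return the response body."""
    with open(path, "rb") as f:
        resp = requests.post(
            ENDPOINT,
            data=f.read(),
            headers={"Content-Type": "application/x-turtle"},
            timeout=30,
        )
    resp.raise_for_status()  # fail loudly if the servlet is not reachable
    return resp.text

if __name__ == "__main__":
    print(annotate_ttl())

Run it next to example.ttl while the Tomcat service from docker-compose.yml is up; the response should be the annotated NIF document returned by the wrapper.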