Repository: hugochan/BAMnet Branch: master Commit: b693616a9241 Files: 34 Total size: 169.2 KB Directory structure: gitextract_pbgz21hk/ ├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt └── src/ ├── build_all_data.py ├── build_pretrained_w2v.py ├── config/ │ ├── bamnet_webq.yml │ └── entnet_webq.yml ├── core/ │ ├── __init__.py │ ├── bamnet/ │ │ ├── __init__.py │ │ ├── bamnet.py │ │ ├── ent_modules.py │ │ ├── entnet.py │ │ ├── modules.py │ │ └── utils.py │ ├── build_data/ │ │ ├── __init__.py │ │ ├── build_all.py │ │ ├── build_data.py │ │ ├── freebase.py │ │ ├── utils.py │ │ └── webquestions.py │ ├── config.py │ └── utils/ │ ├── __init__.py │ ├── freebase_utils.py │ ├── generic_utils.py │ ├── metrics.py │ └── utils.py ├── joint_test.py ├── run_freebase.py ├── run_webquestions.py ├── test.py ├── test_entnet.py ├── train.py └── train_entnet.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ data/ runs/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # BAMnet Code & data accompanying the NAACL2019 paper ["Bidirectional Attentive Memory Networks for Question Answering over Knowledge Bases"](https://arxiv.org/abs/1903.02188) ## Get started ### Prerequisites This code is written in python 3. You will need to install a few python packages in order to run the code. We recommend you to use `virtualenv` to manage your python packages and environments. Please take the following steps to create a python virtual environment. * If you have not installed `virtualenv`, install it with ```pip install virtualenv```. * Create a virtual environment with ```virtualenv venv```. * Activate the virtual environment with `source venv/bin/activate`. * Install the package requirements with `pip install -r requirements.txt`. ### Run the KBQA system * Download the preprocessed data from [here](https://1drv.ms/u/s!AjiSpuwVTt09gSE2niFGjdIVsqA7?e=PEf6sT) and put the data folder under the root directory. * Create a folder (e.g., `runs/WebQ/`) to save model checkpoint. You can download the pretrained models from [here](https://1drv.ms/u/s!AjiSpuwVTt09gSLcnrp0GyKtpWBg?e=DtqYt8). (Note: if you cannot access the above data and pretrained models, please download from [here](http://academic.hugochan.net/download/BAMnet-WebQ.zip).) 
* Please modify the config files in the `src/config/` folder to suit your needs. Note that you can start with modifying only the data folder (e.g., `data_dir`, `model_file`, `pre_word2vec`) and vocab size (e.g., `vocab_size`, `num_ent_types`, `num_relations`), and leave other hyperparameters as they are. * Go to the `BAMnet/src` folder, train the BAMnet model ``` python train.py -config config/bamnet_webq.yml ``` * Test the BAMnet model (with ground-truth topic entity) ``` python test.py -config config/bamnet_webq.yml ``` * Train the topic entity predictor ``` python train_entnet.py -config config/entnet_webq.yml ``` * Test the topic entity predictor ``` python test_entnet.py -config config/entnet_webq.yml ``` * Test the whole system (BAMnet + topic entity predictor) ``` python joint_test.py -bamnet_config config/bamnet_webq.yml -entnet_config config/entnet_webq.yml -raw_data ../data/WebQ ``` ### Preprocess the dataset on your own * Go to the `BAMnet/src` folder, to prepare data for the BAMnet model, run the following cmd: ``` python build_all_data.py -data_dir ../data/WebQ -fb_dir ../data/WebQ -out_dir ../data/WebQ ``` * To prepare data for the topic entity predictor model, run the following cmd: ``` python build_all_data.py -dtype ent -data_dir ../data/WebQ -fb_dir ../data/WebQ -out_dir ../data/WebQ ``` Note that in the message printed out, you will see some data statistics such as `vocab_size`, `num_ent_types`, `num_relations`. These numbers will be used later when modifying the config files. * Download the pretrained Glove word embeddings [glove.840B.300d.zip](http://nlp.stanford.edu/data/wordvecs/glove.840B.300d.zip). * Unzip the file and convert glove format to word2vec format using the following cmd: ``` python -m gensim.scripts.glove2word2vec --input glove.840B.300d.txt --output glove.840B.300d.w2v ``` * Fetch the pretrained Glove vectors for our vocabulary.
``` python build_pretrained_w2v.py -emb glove.840B.300d.w2v -data_dir ../data/WebQ -out ../data/WebQ/glove_pretrained_300d_w2v.npy -emb_size 300 ``` ## Architecture
## Experiment results on WebQuestions ### Results on WebQuestions test set. Bold: best in-category performance.
'''
Created on Oct, 2017

@author: hugo
'''
import argparse
# FIX: `os` was previously only available through the star import from
# core.utils.utils; import it explicitly so this script does not depend on
# that module's namespace leaking `os`.
import os

from core.build_data.build_data import build_vocab, build_data, build_seed_ent_data
from core.utils.utils import *
from core.build_data import utils as build_utils


# Builds the vectorized datasets used by the BAMnet models: either the QA
# answer-ranking data (default) or the topic-entity predictor data
# (-dtype ent). Vocabularies are built once and reused on later runs.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-data_dir', '--data_dir', required=True, type=str, help='path to the data dir')
    parser.add_argument('-fb_dir', '--fb_dir', required=True, type=str, help='path to the freebase dir')
    parser.add_argument('-out_dir', '--out_dir', required=True, type=str, help='path to the output dir')
    parser.add_argument('-dtype', '--data_type', default='qa', type=str, help='data type')
    parser.add_argument('-min_freq', '--min_freq', default=1, type=int, help='min word vocab freq')
    parser.add_argument('-topn', '--topn', default=15, type=int, help='top n candidates')
    args = parser.parse_args()

    # Raw QA pairs and the Freebase subgraph dump.
    train_data = load_ndjson(os.path.join(args.data_dir, 'raw_train.json'))
    valid_data = load_ndjson(os.path.join(args.data_dir, 'raw_valid.json'))
    test_data = load_ndjson(os.path.join(args.data_dir, 'raw_test.json'))
    freebase = load_ndjson(os.path.join(args.fb_dir, 'freebase_full.json'), return_type='dict')

    if not (os.path.exists(os.path.join(args.out_dir, 'entity2id.json')) and \
        os.path.exists(os.path.join(args.out_dir, 'entityType2id.json')) and \
        os.path.exists(os.path.join(args.out_dir, 'relation2id.json')) and \
        os.path.exists(os.path.join(args.out_dir, 'vocab2id.json'))):
        # Vocabs are built from train + valid only, restricted to the top-n
        # candidate Freebase keys per example.
        used_fbkeys = set()
        for each in train_data + valid_data:
            used_fbkeys.update(each['freebaseKeyCands'][:args.topn])
        print('# of used_fbkeys: {}'.format(len(used_fbkeys)))

        entity2id, entityType2id, relation2id, vocab2id = build_vocab(train_data + valid_data, freebase, used_fbkeys, min_freq=args.min_freq)
        dump_json(entity2id, os.path.join(args.out_dir, 'entity2id.json'))
        dump_json(entityType2id, os.path.join(args.out_dir, 'entityType2id.json'))
        dump_json(relation2id, os.path.join(args.out_dir, 'relation2id.json'))
        dump_json(vocab2id, os.path.join(args.out_dir, 'vocab2id.json'))
    else:
        # Reuse previously built vocabularies.
        entity2id = load_json(os.path.join(args.out_dir, 'entity2id.json'))
        entityType2id = load_json(os.path.join(args.out_dir, 'entityType2id.json'))
        relation2id = load_json(os.path.join(args.out_dir, 'relation2id.json'))
        vocab2id = load_json(os.path.join(args.out_dir, 'vocab2id.json'))
        print('Using pre-built vocabs stored in %s' % args.out_dir)

    if args.data_type == 'qa':
        # Vectorize QA data for the BAMnet answer-ranking model.
        train_vec = build_data(train_data, freebase, entity2id, entityType2id, relation2id, vocab2id)
        valid_vec = build_data(valid_data, freebase, entity2id, entityType2id, relation2id, vocab2id)
        test_vec = build_data(test_data, freebase, entity2id, entityType2id, relation2id, vocab2id)
        dump_json(train_vec, os.path.join(args.out_dir, 'train_vec.json'))
        dump_json(valid_vec, os.path.join(args.out_dir, 'valid_vec.json'))
        dump_json(test_vec, os.path.join(args.out_dir, 'test_vec.json'))
        print('Saved data to {}'.format(os.path.join(args.out_dir, 'train(valid, or test)_vec.json')))
    else:
        # Vectorize seed-entity data for the topic entity predictor.
        train_vec = build_seed_ent_data(train_data, freebase, entity2id, entityType2id, relation2id, vocab2id, args.topn, dtype='train')
        valid_vec = build_seed_ent_data(valid_data, freebase, entity2id, entityType2id, relation2id, vocab2id, args.topn, dtype='valid')
        test_vec = build_seed_ent_data(test_data, freebase, entity2id, entityType2id, relation2id, vocab2id, args.topn, dtype='test')
        dump_json(train_vec, os.path.join(args.out_dir, 'train_ent_vec.json'))
        dump_json(valid_vec, os.path.join(args.out_dir, 'valid_ent_vec.json'))
        dump_json(test_vec, os.path.join(args.out_dir, 'test_ent_vec.json'))
        print('Saved data to {}'.format(os.path.join(args.out_dir, 'train(valid, or test)_ent_vec.json')))

    # Mark the data as built.
    build_utils.mark_done(args.out_dir)
binary=True if args.binary else False) ================================================ FILE: src/config/bamnet_webq.yml ================================================ # Seed 15 Data name: 'WebQuestions' data_dir: '../data/WebQ/' train_data: 'train_vec.json' valid_data: 'valid_vec.json' test_data: 'test_vec.json' pre_word2vec: '../data/WebQ/glove_pretrained_300d_w2v.npy' # Full vocab vocab_size: 100797 num_ent_types: 1712 num_relations: 4996 num_query_words: 10 # Output model_file: '../runs/WebQ/bamnet.md' # Model query_size: 32 query_markup_size: 1 # Not used ans_bow_size: 1 # Not used ans_path_bow_size: null ans_ctx_entity_bow_size: 6 vocab_embed_size: 300 hidden_size: 128 o_embed_size: 128 mem_size: 96 word_emb_dropout: 0.3 que_enc_dropout: 0.3 ans_enc_dropout: 0.2 attention: 'add' num_hops: 1 # Training learning_rate: 0.001 batch_size: 32 num_epochs: 100 valid_patience: 10 margin: 1 # Testing test_batch_size: 1 test_margin: - 0.7 # Device no_cuda: False gpu: 0 ================================================ FILE: src/config/entnet_webq.yml ================================================ # WebQuestions Data name: 'WebQuestions' data_dir: '../data/WebQ/' train_data: 'train_ent_vec.json' valid_data: 'valid_ent_vec.json' test_data: 'test_ent_vec.json' # Full vocab vocab_size: 100797 num_ent_types: 1712 num_relations: 4996 pre_word2vec: '../data/WebQ/glove_pretrained_300d_w2v.npy' # Output model_file: '../runs/WebQ/entnet.md' # Model query_size: 32 max_seed_ent_name_size: null max_seed_type_name_size: null max_seed_rel_name_size: null max_seed_rel_size: null vocab_embed_size: 300 hidden_size: 128 o_embed_size: 128 word_emb_dropout: 0.3 que_enc_dropout: 0.3 ent_enc_dropout: 0.2 attention: 'simple' seq_enc_type: 'cnn' num_ent_hops: 1 # Training learning_rate: 0.001 batch_size: 32 num_epochs: 100 valid_patience: 10 # Testing test_batch_size: 1 # Device no_cuda: False gpu: 0 ================================================ FILE: src/core/__init__.py 
def get_text_overlap(raw_query, query_mentions, ctx_ent_names, vocab2id, ctx_stops, query):
    """Map the longest word overlap between the question and a context
    entity name to vocabulary ids.

    Returns [] when the overlap is empty or consists of stop words only;
    a single mention-type placeholder id when the overlap is covered by a
    constraint mention; otherwise the vocab ids of the overlapping words.
    (`query` is accepted for interface compatibility; it is not used here.)
    """
    def _lcs(left, right):
        # Longest common *contiguous* subsequence of two token lists (DP table).
        table = [[0] * (len(right) + 1) for _ in range(len(left) + 1)]
        best_len = 0
        best_end = 0
        for i in range(1, len(left) + 1):
            for j in range(1, len(right) + 1):
                if left[i - 1] == right[j - 1]:
                    table[i][j] = table[i - 1][j - 1] + 1
                    if table[i][j] > best_len:
                        best_len = table[i][j]
                        best_end = i
        return left[best_end - best_len: best_end]

    overlap = _lcs(raw_query, ctx_ent_names)
    # Nothing useful overlaps (or only stop words do).
    if not set(overlap) - ctx_stops:
        return []

    # If a constraint-type mention covers the overlap, emit its type
    # placeholder token (e.g. "__date__") instead of the literal words.
    for mention, mention_type in query_mentions:
        lowered = mention_type.lower()
        if lowered in config.constraint_mention_types and '_'.join(overlap) in '_'.join(mention):
            placeholder = '__{}__'.format(lowered)
            return [vocab2id[placeholder] if placeholder in vocab2id else config.RESERVED_TOKENS['UNK']]

    return [vocab2id[w] if w in vocab2id else config.RESERVED_TOKENS['UNK'] for w in overlap]
""" def __init__(self, opt, ctx_stops, vocab2id): self.ctx_stops = ctx_stops self.vocab2id = vocab2id opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available() if opt['cuda']: print('[ Using CUDA ]') torch.cuda.set_device(opt['gpu']) # It enables benchmark mode in cudnn, which # leads to faster runtime when the input sizes do not vary. cudnn.benchmark = True self.opt = opt if self.opt['pre_word2vec']: pre_w2v = load_ndarray(self.opt['pre_word2vec']) else: pre_w2v = None self.model = BAMnet(opt['vocab_size'], opt['vocab_embed_size'], \ opt['o_embed_size'], opt['hidden_size'], \ opt['num_ent_types'], opt['num_relations'], \ opt['num_query_words'], \ word_emb_dropout=opt['word_emb_dropout'], \ que_enc_dropout=opt['que_enc_dropout'], \ ans_enc_dropout=opt['ans_enc_dropout'], \ pre_w2v=pre_w2v, \ num_hops=opt['num_hops'], \ att=opt['attention'], \ use_cuda=opt['cuda']) if opt['cuda']: self.model.cuda() # MultiLabelMarginLoss # For each sample in the mini-batch: # loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0) self.loss_fn = MultiLabelMarginLoss() optim_params = [p for p in self.model.parameters() if p.requires_grad] self.optimizers = {'bamnet': optim.Adam(optim_params, lr=opt['learning_rate'])} self.scheduler = ReduceLROnPlateau(self.optimizers['bamnet'], mode='min', \ patience=self.opt['valid_patience'] // 3, verbose=True) if opt.get('model_file') and os.path.isfile(opt['model_file']): print('Loading existing model parameters from ' + opt['model_file']) self.load(opt['model_file']) super(BAMnetAgent, self).__init__() def train(self, train_X, train_y, valid_X, valid_y, valid_cand_labels, valid_gold_ans_labels, seed=1234): print('Training size: {}, Validation size: {}'.format(len(train_y), len(valid_y))) random1 = np.random.RandomState(seed) random2 = np.random.RandomState(seed) random3 = np.random.RandomState(seed) random4 = np.random.RandomState(seed) random5 = np.random.RandomState(seed) random6 = np.random.RandomState(seed) random7 = 
np.random.RandomState(seed) memories, queries, query_words, raw_queries, query_mentions, query_lengths = train_X gold_ans_inds = train_y valid_memories, valid_queries, valid_query_words, valid_raw_queries, valid_query_mentions, valid_query_lengths = valid_X valid_gold_ans_inds = valid_y n_incr_error = 0 # nb. of consecutive increase in error best_loss = float("inf") num_batches = len(queries) // self.opt['batch_size'] + (len(queries) % self.opt['batch_size'] != 0) num_valid_batches = len(valid_queries) // self.opt['batch_size'] + (len(valid_queries) % self.opt['batch_size'] != 0) for epoch in range(1, self.opt['num_epochs'] + 1): start = timeit.default_timer() n_incr_error += 1 random1.shuffle(memories) random2.shuffle(queries) random3.shuffle(query_words) random4.shuffle(raw_queries) random5.shuffle(query_mentions) random6.shuffle(query_lengths) random7.shuffle(gold_ans_inds) train_gen = next_batch(memories, queries, query_words, raw_queries, query_mentions, query_lengths, gold_ans_inds, self.opt['batch_size']) train_loss = 0 for batch_xs, batch_ys in train_gen: train_loss += self.train_step(batch_xs, batch_ys) / num_batches valid_gen = next_batch(valid_memories, valid_queries, valid_query_words, valid_raw_queries, valid_query_mentions, valid_query_lengths, valid_gold_ans_inds, self.opt['batch_size']) valid_loss = 0 for batch_valid_xs, batch_valid_ys in valid_gen: valid_loss += self.train_step(batch_valid_xs, batch_valid_ys, is_training=False) / num_valid_batches self.scheduler.step(valid_loss) # if False: if epoch > 0: pred = self.predict(valid_X, valid_cand_labels, batch_size=1, margin=self.opt['margin'], silence=True) predictions = [unique([x[0] for x in each]) for each in pred] valid_f1 = calc_avg_f1(valid_gold_ans_labels, predictions, verbose=False)[-1] else: valid_f1 = 0. 
print('Epoch {}/{}: Runtime: {}s, Train loss: {:.4}, valid loss: {:.4}, valid F1: {:.4}'.format(epoch, self.opt['num_epochs'], \ int(timeit.default_timer() - start), train_loss, valid_loss, valid_f1)) if valid_loss < best_loss: best_loss = valid_loss n_incr_error = 0 self.save() if n_incr_error >= self.opt['valid_patience']: print('Early stopping occured. Optimization Finished!') self.save(self.opt['model_file'] + '.final') break def predict(self, xs, cand_labels, batch_size=32, margin=1, ys=None, verbose=False, silence=False): '''Prediction scores are returned in the verbose mode. ''' if not silence: print('Testing size: {}'.format(len(cand_labels))) memories, queries, query_words, raw_queries, query_mentions, query_lengths = xs gen = next_batch(memories, queries, query_words, raw_queries, query_mentions, query_lengths, cand_labels, batch_size) predictions = [] for batch_xs, batch_cands in gen: batch_pred = self.predict_step(batch_xs, batch_cands, margin, verbose=verbose) predictions.extend(batch_pred) return predictions def train_step(self, xs, ys, is_training=True): # Sets the module in training mode. # This has any effect only on modules such as Dropout or BatchNorm. 
    def predict_step(self, xs, cand_labels, margin, verbose=False):
        """Score the candidate answers for one batch and return ranked predictions.

        xs: (memories, queries, query_words, raw_queries, query_mentions,
        query_lengths) — the same tuple layout unpacked in predict().
        `verbose` is accepted but not used in this method's visible body.
        """
        # Evaluation mode (disables dropout etc.); gradients are off below.
        self.model.train(mode=False)
        with torch.set_grad_enabled(False):
            # Organize inputs for network
            memories, ctx_mask = self.pad_ctx_memory(xs[0], self.opt['ans_ctx_entity_bow_size'], xs[3], xs[4], xs[1])
            memories = [to_cuda(torch.LongTensor(np.array(x)), self.opt['cuda']) for x in zip(*memories)]
            ctx_mask = to_cuda(ctx_mask, self.opt['cuda'])
            queries = to_cuda(torch.LongTensor(xs[1]), self.opt['cuda'])
            query_words = to_cuda(torch.LongTensor(xs[2]), self.opt['cuda'])
            query_lengths = to_cuda(torch.LongTensor(xs[5]), self.opt['cuda'])
            # NOTE(review): ctx_mask is computed above but the model is invoked
            # with ctx_mask=None (same pattern as train_step) — confirm intended.
            mem_hop_scores = self.model(memories, queries, query_lengths, query_words, ctx_mask=None)
            # Rank candidates by the final-hop scores, thresholded by `margin`.
            predictions = self.ranked_predictions(cand_labels, mem_hop_scores[-1].data, margin)
        return predictions
ctx_bow_size, raw_queries, query_mentions, queries): # Randomly select negative samples from the candidiate answer set ctx_bow_size = max(min(max(map(len, (a for x in list(zip(*memories))[CTX_BOW_INDEX] for y in x for a in y)), default=0), ctx_bow_size), 1) selected_memories = [] new_ys = [] ctx_mask = [] for i in range(len(ys)): n = len(memories[i][0]) - 1 # The last element is a dummy candidate num_gold = len(ys[i]) if mem_size > len(ys[i]) else \ (mem_size - min(mem_size // 2, n - len(ys[i]))) # Max possible (pos, neg) pairs selected_gold_inds = np.random.choice(ys[i], num_gold, replace=False).tolist() if len(ys[i]) > 0 else [] if n > len(ys[i]): p = np.ones(n) p[ys[i]] = 0 p = p / np.sum(p) selected_inds = np.random.choice(n, min(mem_size, n) - num_gold, replace=False, p=p).tolist() else: selected_inds = [] augmented_selected_inds = selected_gold_inds + selected_inds + [-1] * max(mem_size - n, 0) xx = [min(mem_size, n)] + [np.array(x)[augmented_selected_inds] for x in memories[i][:CTX_BOW_INDEX]] ctx_bow = [] ctx_bow_len = [] ctx_num = [] tmp_ctx_mask = np.zeros(mem_size) for _, idx in enumerate(augmented_selected_inds): tmp_ctx = [] tmp_ctx_len = [] for ctx_ent_names in memories[i][CTX_BOW_INDEX][idx]: sub_seq = get_text_overlap(raw_queries[i], query_mentions[i], ctx_ent_names, self.vocab2id, self.ctx_stops, queries[i]) if len(sub_seq) > 0: tmp_ctx_mask[_] = 1 tmp_ctx.append(sub_seq[:ctx_bow_size] + [config.RESERVED_TOKENS['PAD']] * max(0, ctx_bow_size - len(sub_seq))) tmp_ctx_len.append(max(min(ctx_bow_size, len(sub_seq)), 1)) ctx_bow.append(tmp_ctx) ctx_bow_len.append(tmp_ctx_len) ctx_num.append(len(tmp_ctx)) xx += [ctx_bow, ctx_bow_len, ctx_num] xx += [np.array(x)[augmented_selected_inds] for x in memories[i][CTX_BOW_INDEX+1:]] selected_memories.append(xx) new_ys.append(list(range(num_gold))) ctx_mask.append(tmp_ctx_mask) max_ctx_num = max(max([y for x in selected_memories for y in x[CTX_BOW_INDEX]]), 1) for i in range(len(selected_memories)): # Example for 
j in range(len(selected_memories[i][-1])): # Cand count = selected_memories[i][CTX_BOW_INDEX][j] if count < max_ctx_num: selected_memories[i][CTX_BOW_INDEX - 2][j] += [[config.RESERVED_TOKENS['PAD']] * ctx_bow_size] * (max_ctx_num - count) selected_memories[i][CTX_BOW_INDEX - 1][j] += [1] * (max_ctx_num - count) return selected_memories, new_ys, torch.Tensor(np.array(ctx_mask)) def pad_ctx_memory(self, memories, ctx_bow_size, raw_queries, query_mentions, queries): cand_ans_size = max(max(map(len, list(zip(*memories))[0]), default=0) - 1, 1) # The last element is a dummy candidate ctx_bow_size = max(min(max(map(len, (a for x in list(zip(*memories))[CTX_BOW_INDEX] for y in x for a in y)), default=0), ctx_bow_size), 1) pad_memories = [] ctx_mask = [] for i in range(len(memories)): n = len(memories[i][0]) - 1 # The last element is a dummy candidate augmented_inds = list(range(n)) + [-1] * (cand_ans_size - n) xx = [n] + [np.array(x)[augmented_inds] for x in memories[i][:CTX_BOW_INDEX]] ctx_bow = [] ctx_bow_len = [] ctx_num = [] tmp_ctx_mask = np.zeros(cand_ans_size) for _, idx in enumerate(augmented_inds): tmp_ctx = [] tmp_ctx_len = [] for ctx_ent_names in memories[i][CTX_BOW_INDEX][idx]: sub_seq = get_text_overlap(raw_queries[i], query_mentions[i], ctx_ent_names, self.vocab2id, self.ctx_stops, queries[i]) if len(sub_seq) > 0: tmp_ctx_mask[_] = 1 tmp_ctx.append(sub_seq[:ctx_bow_size] + [config.RESERVED_TOKENS['PAD']] * max(0, ctx_bow_size - len(sub_seq))) tmp_ctx_len.append(max(min(ctx_bow_size, len(sub_seq)), 1)) ctx_bow.append(tmp_ctx) ctx_bow_len.append(tmp_ctx_len) ctx_num.append(len(tmp_ctx)) xx += [ctx_bow, ctx_bow_len, ctx_num] xx += [np.array(x)[augmented_inds] for x in memories[i][CTX_BOW_INDEX+1:]] pad_memories.append(xx) ctx_mask.append(tmp_ctx_mask) max_ctx_num = max(max([y for x in pad_memories for y in x[CTX_BOW_INDEX]]), 1) for i in range(len(pad_memories)): # Example for j in range(len(pad_memories[i][-1])): # Cand count = 
pad_memories[i][CTX_BOW_INDEX][j] if count < max_ctx_num: pad_memories[i][CTX_BOW_INDEX - 2][j] += [[config.RESERVED_TOKENS['PAD']] * ctx_bow_size] * (max_ctx_num - count) pad_memories[i][CTX_BOW_INDEX - 1][j] += [1] * (max_ctx_num - count) return pad_memories, torch.Tensor(np.array(ctx_mask)) def pack_gold_ans(self, x, N, placeholder=-1): y = np.ones((len(x), N), dtype='int64') * placeholder mask = np.zeros((len(x), N)) for i in range(len(x)): y[i, :len(x[i])] = x[i] mask[i, :len(x[i])] = 1 return to_cuda(torch.LongTensor(y), self.opt['cuda']), to_cuda(torch.Tensor(mask), self.opt['cuda']) def set_loss_margin(self, scores, gold_mask, margin): """Since the pytorch built-in MultiLabelMarginLoss fixes the margin as 1. We simply work around this annoying feature by *modifying* the golden scores. E.g., if we want margin as 3, we decrease each golden score by 3 - 1 before feeding it to the built-in loss. """ new_scores = scores - (margin - 1) * gold_mask return new_scores def ranked_predictions(self, cand_labels, scores, margin): _, sorted_inds = scores.sort(descending=True, dim=1) return [[(cand_labels[i][j], scores[i][j]) for j in r if scores[i][j] + margin >= scores[i][r[0]] \ and cand_labels[i][j] != 'UNK'] \ if len(cand_labels[i]) > 0 and scores[i][r[0]] > -1e4 else [] \ for i, r in enumerate(sorted_inds)] # Very large negative ones are dummy candidates def save(self, path=None): path = self.opt.get('model_file', None) if path is None else path if path: checkpoint = {} checkpoint['bamnet'] = self.model.state_dict() checkpoint['bamnet_optim'] = self.optimizers['bamnet'].state_dict() with open(path, 'wb') as write: torch.save(checkpoint, write) print('Saved model to {}'.format(path)) def load(self, path): with open(path, 'rb') as read: checkpoint = torch.load(read, map_location=lambda storage, loc: storage) self.model.load_state_dict(checkpoint['bamnet']) self.optimizers['bamnet'].load_state_dict(checkpoint['bamnet_optim']) 
================================================
FILE: src/core/bamnet/ent_modules.py
================================================
'''
Created on Sep, 2018

@author: hugo

'''
import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F

from .modules import SeqEncoder, SelfAttention_CoAtt, Attention
from .utils import to_cuda

INF = 1e20
VERY_SMALL_NUMBER = 1e-10


class Entnet(nn.Module):
    '''Topic-entity scoring network: encodes the question and each candidate
    entity's KB features (name / type / relations) into key-value memories,
    then scores entities via multi-hop attention over that memory.
    '''
    def __init__(self, vocab_size, vocab_embed_size, o_embed_size, \
        hidden_size, num_ent_types, num_relations, \
        seq_enc_type='cnn', \
        word_emb_dropout=None, \
        que_enc_dropout=None,\
        ent_enc_dropout=None, \
        pre_w2v=None, \
        num_hops=1, \
        att='add', \
        use_cuda=True):
        super(Entnet, self).__init__()
        self.use_cuda = use_cuda
        self.seq_enc_type = seq_enc_type
        self.que_enc_dropout = que_enc_dropout
        self.ent_enc_dropout = ent_enc_dropout
        self.num_hops = num_hops
        self.hidden_size = hidden_size
        # Question encoder; its word embedding table is shared with the
        # entity encoder below.
        self.que_enc = SeqEncoder(vocab_size, vocab_embed_size, hidden_size, \
                        seq_enc_type=seq_enc_type, \
                        word_emb_dropout=word_emb_dropout, \
                        bidirectional=True, \
                        cnn_kernel_size=[2, 3], \
                        init_word_embed=pre_w2v, \
                        use_cuda=use_cuda).que_enc
        self.ent_enc = EntEncoder(o_embed_size, hidden_size, \
                        num_ent_types, num_relations, \
                        vocab_size=vocab_size, \
                        vocab_embed_size=vocab_embed_size, \
                        shared_embed=self.que_enc.embed, \
                        seq_enc_type=seq_enc_type, \
                        word_emb_dropout=word_emb_dropout, \
                        ent_enc_dropout=ent_enc_dropout, \
                        use_cuda=use_cuda)
        self.batchnorm = nn.BatchNorm1d(hidden_size)
        # Self-attention pooling is only defined for recurrent encoders.
        if seq_enc_type in ('lstm', 'gru'):
            self.self_atten = SelfAttention_CoAtt(hidden_size)
            print('[ Using self-attention on question encoder ]')
        self.ent_memory_hop = EntRomHop(hidden_size, hidden_size, hidden_size, atten_type=att)
        print('[ Using {}-hop entity memory update ]'.format(num_hops))

    def forward(self, memories, queries, query_lengths):
        '''Return a list of per-hop candidate-entity score tensors
        (one initial score plus one per memory hop).
        '''
        x_ent_names, x_ent_name_len, x_type_names, x_types, x_type_name_len, x_rel_names, x_rels, x_rel_name_len, x_rel_mask = memories
        x_rel_mask = self.create_mask_3D(x_rel_mask, x_rels.size(-1), use_cuda=self.use_cuda)

        # Question encoder
        if self.seq_enc_type in ('lstm', 'gru'):
            Q_r = self.que_enc(queries, query_lengths)[0]
            if self.que_enc_dropout:
                Q_r = F.dropout(Q_r, p=self.que_enc_dropout, training=self.training)
            query_mask = self.create_mask(query_lengths, Q_r.size(1), self.use_cuda)
            q_r = self.self_atten(Q_r, query_lengths, query_mask)
        else:
            # CNN encoder: take the pooled (second) output directly.
            q_r = self.que_enc(queries, query_lengths)[1]
            if self.que_enc_dropout:
                q_r = F.dropout(q_r, p=self.que_enc_dropout, training=self.training)

        # Entity encoder: sum the component-wise keys/values (name, type,
        # relations) into a single key and value per entity.
        ent_val, ent_key = self.ent_enc(x_ent_names, x_ent_name_len, x_type_names, x_types, x_type_name_len, x_rel_names, x_rels, x_rel_name_len, x_rel_mask)
        ent_val = torch.cat([each.unsqueeze(2) for each in ent_val], 2)
        ent_key = torch.cat([each.unsqueeze(2) for each in ent_key], 2)
        ent_val = torch.sum(ent_val, 2)
        ent_key = torch.sum(ent_key, 2)

        mem_hop_scores = []
        mid_score = self.clf_score(q_r, ent_key)
        mem_hop_scores.append(mid_score)
        # Each hop refines the question vector with attended memory (residual
        # update + batchnorm) and re-scores all entities.
        for _ in range(self.num_hops):
            q_r = q_r + self.ent_memory_hop(q_r, ent_key, ent_val)
            q_r = self.batchnorm(q_r)
            mid_score = self.clf_score(q_r, ent_key)
            mem_hop_scores.append(mid_score)
        return mem_hop_scores

    def clf_score(self, q_r, ent_key):
        # Dot product between question vector and every entity key.
        return torch.matmul(ent_key, q_r.unsqueeze(-1)).squeeze(-1)

    def create_mask(self, x, N, use_cuda=True):
        '''Binary (batch, N) mask with ones over the first x[i] positions.'''
        x = x.data
        mask = np.zeros((x.size(0), N))
        for i in range(x.size(0)):
            mask[i, :x[i]] = 1
        return to_cuda(torch.Tensor(mask), use_cuda)

    def create_mask_3D(self, x, N, use_cuda=True):
        '''Binary (batch, M, N) mask with ones over the first x[i, j] positions.'''
        x = x.data
        mask = np.zeros((x.size(0), x.size(1), N))
        for i in range(x.size(0)):
            for j in range(x.size(1)):
                mask[i, j, :x[i, j]] = 1
        return to_cuda(torch.Tensor(mask), use_cuda)


class EntEncoder(nn.Module):
    """Entity Encoder: encodes entity names, type names and relation names
    into per-component key and value vectors for the key-value memory."""
    def __init__(self, o_embed_size, hidden_size, num_ent_types, num_relations, vocab_size=None, \
            vocab_embed_size=None, shared_embed=None, seq_enc_type='lstm', word_emb_dropout=None, \
            ent_enc_dropout=None, use_cuda=True):
        super(EntEncoder, self).__init__()
        # Cannot have embed and vocab_size set as None at the same time.
        self.ent_enc_dropout = ent_enc_dropout
        self.hidden_size = hidden_size
        self.relation_embed = nn.Embedding(num_relations, o_embed_size, padding_idx=0)
        self.embed = shared_embed if shared_embed is not None else nn.Embedding(vocab_size, vocab_embed_size, padding_idx=0)
        self.vocab_embed_size = self.embed.weight.data.size(1)

        # Separate key/value projections per memory component.
        self.linear_node_name_key = nn.Linear(hidden_size, hidden_size, bias=False)
        self.linear_node_type_key = nn.Linear(hidden_size, hidden_size, bias=False)
        self.linear_rels_key = nn.Linear(hidden_size + o_embed_size, hidden_size, bias=False)
        self.linear_node_name_val = nn.Linear(hidden_size, hidden_size, bias=False)
        self.linear_node_type_val = nn.Linear(hidden_size, hidden_size, bias=False)
        self.linear_rels_val = nn.Linear(hidden_size + o_embed_size, hidden_size, bias=False)

        self.kg_enc_ent = SeqEncoder(vocab_size, \
                        self.vocab_embed_size, \
                        hidden_size, \
                        seq_enc_type=seq_enc_type, \
                        word_emb_dropout=word_emb_dropout, \
                        bidirectional=True, \
                        cnn_kernel_size=[3], \
                        shared_embed=shared_embed, \
                        use_cuda=use_cuda).que_enc # entity name
        self.kg_enc_type = SeqEncoder(vocab_size, \
                        self.vocab_embed_size, \
                        hidden_size, \
                        seq_enc_type=seq_enc_type, \
                        word_emb_dropout=word_emb_dropout, \
                        bidirectional=True, \
                        cnn_kernel_size=[3], \
                        shared_embed=shared_embed, \
                        use_cuda=use_cuda).que_enc # entity type name
        self.kg_enc_rel = SeqEncoder(vocab_size, \
                        self.vocab_embed_size, \
                        hidden_size, \
                        seq_enc_type=seq_enc_type, \
                        word_emb_dropout=word_emb_dropout, \
                        bidirectional=True, \
                        cnn_kernel_size=[3], \
                        shared_embed=shared_embed, \
                        use_cuda=use_cuda).que_enc # relation name

    def forward(self, x_ent_names, x_ent_name_len, x_type_names, x_types, x_type_name_len, x_rel_names, x_rels, x_rel_name_len, x_rel_mask):
        '''Return ([name_val, type_val, rel_val], [name_key, type_key, rel_key]).'''
        node_ent_names, node_type_names, node_types, edge_rel_names, edge_rels = self.enc_kg_features(x_ent_names, x_ent_name_len, x_type_names, x_types, x_type_name_len, x_rel_names, x_rels, x_rel_name_len, x_rel_mask)

        node_name_key = self.linear_node_name_key(node_ent_names)
        node_type_key = self.linear_node_type_key(node_type_names)
        rel_key = self.linear_rels_key(torch.cat([edge_rel_names, edge_rels], -1))
        node_name_val = self.linear_node_name_val(node_ent_names)
        node_type_val = self.linear_node_type_val(node_type_names)
        rel_val = self.linear_rels_val(torch.cat([edge_rel_names, edge_rels], -1))

        ent_comp_val = [node_name_val, node_type_val, rel_val]
        ent_comp_key = [node_name_key, node_type_key, rel_key]
        return ent_comp_val, ent_comp_key

    def enc_kg_features(self, x_ent_names, x_ent_name_len, x_type_names, x_types, x_type_name_len, x_rel_names, x_rel_name_len, x_rels, x_rel_mask):
        '''Encode raw KB feature tensors; relation names/ids are mean-pooled
        over each entity's relation set. `node_types` is currently unused
        (returned as None; its embedding path is commented out below).
        NOTE(review): parameter order here differs from the call in forward
        (x_rels vs x_rel_name_len swapped) — confirm against the original file.
        '''
        node_ent_names = (self.kg_enc_ent(x_ent_names.view(-1, x_ent_names.size(-1)), x_ent_name_len.view(-1))[1]).view(x_ent_names.size(0), x_ent_names.size(1), -1)
        node_type_names = (self.kg_enc_type(x_type_names.view(-1, x_type_names.size(-1)), x_type_name_len.view(-1))[1]).view(x_type_names.size(0), x_type_names.size(1), -1)
        node_types = None
        edge_rel_names = torch.mean((self.kg_enc_rel(x_rel_names.view(-1, x_rel_names.size(-1)), x_rel_name_len.view(-1))[1]).view(x_rel_names.size(0), x_rel_names.size(1), x_rel_names.size(2), -1), 2)
        edge_rels = torch.mean(self.relation_embed(x_rels.view(-1, x_rels.size(-1))), 1).view(x_rels.size(0), x_rels.size(1), -1)

        if self.ent_enc_dropout:
            node_ent_names = F.dropout(node_ent_names, p=self.ent_enc_dropout, training=self.training)
            node_type_names = F.dropout(node_type_names, p=self.ent_enc_dropout, training=self.training)
            # node_types = F.dropout(node_types, p=self.ent_enc_dropout, training=self.training)
            edge_rel_names = F.dropout(edge_rel_names, p=self.ent_enc_dropout, training=self.training)
            edge_rels = F.dropout(edge_rels, p=self.ent_enc_dropout, training=self.training)
        return node_ent_names, node_type_names, node_types, edge_rel_names, edge_rels


class EntRomHop(nn.Module):
    '''One attention-read + GRU-update hop over the entity key-value memory.'''
    def __init__(self, query_embed_size, in_memory_embed_size, hidden_size, atten_type='add'):
        super(EntRomHop, self).__init__()
        self.atten = Attention(hidden_size, query_embed_size, in_memory_embed_size, atten_type=atten_type)
        self.gru_step = GRUStep(hidden_size, in_memory_embed_size)

    def forward(self, h_state, key_memory_embed, val_memory_embed, atten_mask=None):
        # Attend over keys, read from values, then gate into the hidden state.
        attention = self.atten(h_state, key_memory_embed, atten_mask=atten_mask)
        probs = torch.softmax(attention, dim=-1)
        memory_output = torch.bmm(probs.unsqueeze(1), val_memory_embed).squeeze(1)
        h_state = self.gru_step(h_state, memory_output)
        return h_state


class GRUStep(nn.Module):
    def __init__(self, hidden_size, input_size):
        super(GRUStep, self).__init__()
        '''GRU module: a single gated update of h_state given input_.'''
        self.linear_z = nn.Linear(hidden_size + input_size, hidden_size, bias=False)
        self.linear_r = nn.Linear(hidden_size + input_size, hidden_size, bias=False)
        self.linear_t = nn.Linear(hidden_size + input_size, hidden_size, bias=False)

    def forward(self, h_state, input_):
        # Standard GRU gating: z = update gate, r = reset gate, t = candidate.
        z = torch.sigmoid(self.linear_z(torch.cat([h_state, input_], -1)))
        r = torch.sigmoid(self.linear_r(torch.cat([h_state, input_], -1)))
        t = torch.tanh(self.linear_t(torch.cat([r * h_state, input_], -1)))
        h_state = (1 - z) * h_state + z * t
        return h_state


================================================
FILE: src/core/bamnet/entnet.py
================================================
'''
Created on Sep, 2018

@author: hugo

'''
import os
import timeit
import numpy as np

import torch
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn import CrossEntropyLoss, MultiLabelMarginLoss
import torch.backends.cudnn as cudnn

from .ent_modules import Entnet
from .utils import to_cuda, next_ent_batch
from ..utils.utils import load_ndarray
from ..utils.generic_utils import unique
from ..utils.metrics import *


class EntnetAgent(object):
    '''Training/inference wrapper around the Entnet topic-entity model.'''
    def __init__(self, opt):
        opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available()
        if opt['cuda']:
            print('[ Using CUDA ]')
            torch.cuda.set_device(opt['gpu'])
            # It enables benchmark mode in cudnn, which
            # leads to faster runtime when the input sizes do not vary.
            cudnn.benchmark = True

        self.opt = opt
        if self.opt['pre_word2vec']:
            pre_w2v = load_ndarray(self.opt['pre_word2vec'])
        else:
            pre_w2v = None

        self.ent_model = Entnet(opt['vocab_size'], opt['vocab_embed_size'], \
                opt['o_embed_size'], opt['hidden_size'], \
                opt['num_ent_types'], opt['num_relations'], \
                seq_enc_type=opt['seq_enc_type'], \
                word_emb_dropout=opt['word_emb_dropout'], \
                que_enc_dropout=opt['que_enc_dropout'], \
                ent_enc_dropout=opt['ent_enc_dropout'], \
                pre_w2v=pre_w2v, \
                num_hops=opt['num_ent_hops'], \
                att=opt['attention'], \
                use_cuda=opt['cuda'])
        if opt['cuda']:
            self.ent_model.cuda()

        self.loss_fn = MultiLabelMarginLoss()
        optim_params = [p for p in self.ent_model.parameters() if p.requires_grad]
        self.optimizers = {'entnet': optim.Adam(optim_params, lr=opt['learning_rate'])}
        self.scheduler = ReduceLROnPlateau(self.optimizers['entnet'], mode='min', \
                    patience=self.opt['valid_patience'] // 3, verbose=True)

        if opt.get('model_file') and os.path.isfile(opt['model_file']):
            print('Loading existing ent_model parameters from ' + opt['model_file'])
            self.load(opt['model_file'])
        else:
            # Fresh model: immediately save then reload so optimizer state and
            # checkpoint file exist from the start.
            self.save()
            self.load(opt['model_file'])
        super(EntnetAgent, self).__init__()

    def train(self, train_X, train_y, valid_X, valid_y, seed=1234):
        '''Train with early stopping on validation loss; checkpoints the best
        model to opt['model_file'] and the final one to '<model_file>.final'.
        '''
        print('Training size: {}, Validation size: {}'.format(len(train_y), len(valid_y)))
        # Four RNGs seeded identically so that the four parallel lists are
        # shuffled in-place with the SAME permutation each epoch.
        random1 = np.random.RandomState(seed)
        random2 = np.random.RandomState(seed)
        random3 = np.random.RandomState(seed)
        random4 = np.random.RandomState(seed)
        memories, queries, query_lengths = train_X
        ent_inds = train_y
        valid_memories, valid_queries, valid_query_lengths = valid_X
        valid_ent_inds = valid_y

        n_incr_error = 0 # nb. of consecutive increase in error
        best_loss = float("inf")
        best_acc = 0
        num_batches = len(queries) // self.opt['batch_size'] + (len(queries) % self.opt['batch_size'] != 0)
        num_valid_batches = len(valid_queries) // self.opt['batch_size'] + (len(valid_queries) % self.opt['batch_size'] != 0)
        for epoch in range(1, self.opt['num_epochs'] + 1):
            start = timeit.default_timer()
            n_incr_error += 1
            random1.shuffle(memories)
            random2.shuffle(queries)
            random3.shuffle(query_lengths)
            random4.shuffle(ent_inds)
            train_gen = next_ent_batch(memories, queries, query_lengths, ent_inds, self.opt['batch_size'])
            train_loss = 0
            for batch_xs, batch_ys in train_gen:
                train_loss += self.train_step(batch_xs, batch_ys) / num_batches

            # Validation loss reuses train_step with is_training=False (no grad).
            valid_gen = next_ent_batch(valid_memories, valid_queries, valid_query_lengths, valid_ent_inds, self.opt['batch_size'])
            valid_loss = 0
            for batch_valid_xs, batch_valid_ys in valid_gen:
                valid_loss += self.train_step(batch_valid_xs, batch_valid_ys, is_training=False) / num_valid_batches
            self.scheduler.step(valid_loss)

            if epoch > 0:
                valid_acc = self.evaluate(valid_X, valid_ent_inds, batch_size=1, silence=True)
                # valid_acc = 0.
            print('Epoch {}/{}: Runtime: {}s, Training loss: {:.4}, validation loss: {:.4}, validation ACC: {:.4}'.format(epoch, self.opt['num_epochs'], \
                int(timeit.default_timer() - start), train_loss, valid_loss, valid_acc))

            # self.scheduler.step(valid_acc)
            # if valid_acc > best_acc:
            #     best_acc = valid_acc
            #     n_incr_error = 0
            #     self.save()

            if valid_loss < best_loss:
                best_loss = valid_loss
                n_incr_error = 0
                self.save()

            if n_incr_error >= self.opt['valid_patience']:
                print('Early stopping occured. Optimization Finished!')
                self.save(self.opt['model_file'] + '.final')
                break

    def evaluate(self, xs, ys, batch_size=1, silence=False):
        '''Return top-1 accuracy (percentage) over the dataset.
        '''
        if not silence:
            print('Data size: {}'.format(len(xs[0])))
        memories, queries, query_lengths = xs
        gen = next_ent_batch(memories, queries, query_lengths, ys, batch_size)
        correct = 0
        num_samples = 0
        for batch_xs, batch_ys in gen:
            correct += self.evaluate_step(batch_xs, batch_ys)
            num_samples += len(batch_ys)
        acc = 100 * correct / num_samples
        return acc

    def predict(self, xs, cand_labels, batch_size=1, silence=False):
        '''Return the top-ranked candidate label for each example.'''
        if not silence:
            print('Data size: {}'.format(len(xs[0])))
        memories, queries, query_lengths = xs
        gen = next_ent_batch(memories, queries, query_lengths, cand_labels, batch_size)
        predictions = []
        for batch_xs, batch_cands in gen:
            batch_pred = self.predict_step(batch_xs, batch_cands)
            predictions.extend(batch_pred)
        return predictions

    def train_step(self, xs, ys, is_training=True):
        '''One (optionally gradient-free) pass; returns the averaged hop loss.'''
        # Sets the module in training mode.
        # This has any effect only on modules such as Dropout or BatchNorm.
        self.ent_model.train(mode=is_training)
        with torch.set_grad_enabled(is_training):
            # Organize inputs for network
            memories = [to_cuda(torch.LongTensor(np.array(x)), self.opt['cuda']) for x in zip(*xs[0])]
            queries = to_cuda(torch.LongTensor(xs[1]), self.opt['cuda'])
            query_lengths = to_cuda(torch.LongTensor(xs[2]), self.opt['cuda'])
            mem_hop_scores = self.ent_model(memories, queries, query_lengths)
            # ys = to_cuda(torch.LongTensor(ys), self.opt['cuda']).squeeze(-1)
            # Set margin
            ys, mask_ys = self.pack_gold_ans(ys, mem_hop_scores[-1].size(1), placeholder=-1)
            loss = 0
            # Average the margin loss over every hop's score head.
            for _, s in enumerate(mem_hop_scores):
                loss += self.loss_fn(s, ys)
            loss /= len(mem_hop_scores)

            if is_training:
                for o in self.optimizers.values():
                    o.zero_grad()
                loss.backward()
                for o in self.optimizers.values():
                    o.step()
            return loss.item()

    def evaluate_step(self, xs, ys):
        '''Return the number of correct top-1 predictions in this batch.'''
        self.ent_model.train(mode=False)
        with torch.set_grad_enabled(False):
            # Organize inputs for network
            memories = [to_cuda(torch.LongTensor(np.array(x)), self.opt['cuda']) for x in zip(*xs[0])]
            queries = to_cuda(torch.LongTensor(xs[1]), self.opt['cuda'])
            query_lengths = to_cuda(torch.LongTensor(xs[2]), self.opt['cuda'])
            # Only the final hop's scores are used for evaluation.
            scores = self.ent_model(memories, queries, query_lengths)[-1]
            ys = to_cuda(torch.LongTensor(ys), self.opt['cuda']).squeeze(1)
            predictions = scores.max(1)[1].type_as(ys)
            correct = predictions.eq(ys).sum()
            return correct.item()

    def predict_step(self, xs, cand_labels):
        '''Return the best candidate label per example in this batch.'''
        self.ent_model.train(mode=False)
        with torch.set_grad_enabled(False):
            # Organize inputs for network
            memories = [to_cuda(torch.LongTensor(np.array(x)), self.opt['cuda']) for x in zip(*xs[0])]
            queries = to_cuda(torch.LongTensor(xs[1]), self.opt['cuda'])
            query_lengths = to_cuda(torch.LongTensor(xs[2]), self.opt['cuda'])
            scores = self.ent_model(memories, queries, query_lengths)[-1]
            predictions = self.ranked_predictions(cand_labels, scores)
            return predictions

    def pack_gold_ans(self, x, N, placeholder=-1):
        '''Pack ragged gold-index lists into a padded (len(x), N) LongTensor
        plus a 0/1 mask over the real entries.
        '''
        y = np.ones((len(x), N), dtype='int64') * placeholder
        mask = np.zeros((len(x), N))
        for i in range(len(x)):
            y[i, :len(x[i])] = x[i]
            mask[i, :len(x[i])] = 1
        return to_cuda(torch.LongTensor(y), self.opt['cuda']), to_cuda(torch.Tensor(mask), self.opt['cuda'])

    def ranked_predictions(self, cand_labels, scores):
        '''Top-1 label per example; empty string when there are no candidates.'''
        _, sorted_inds = scores.sort(descending=True, dim=1)
        return [cand_labels[i][r[0]] if len(cand_labels[i]) > 0 else '' \
                    for i, r in enumerate(sorted_inds)]

    def save(self, path=None):
        '''Persist model and optimizer state; defaults to opt['model_file'].'''
        path = self.opt.get('model_file', None) if path is None else path
        if path:
            checkpoint = {}
            checkpoint['entnet'] = self.ent_model.state_dict()
            checkpoint['entnet_optim'] = self.optimizers['entnet'].state_dict()
            with open(path, 'wb') as write:
                torch.save(checkpoint, write)
            print('Saved ent_model to {}'.format(path))

    def load(self, path):
        '''Restore model and optimizer state from `path` (CPU-mapped).'''
        with open(path, 'rb') as read:
            checkpoint = torch.load(read, map_location=lambda storage, loc: storage)
        self.ent_model.load_state_dict(checkpoint['entnet'])
        self.optimizers['entnet'].load_state_dict(checkpoint['entnet_optim'])


================================================
FILE: src/core/bamnet/modules.py
================================================ ''' Created on Sep, 2017 @author: hugo ''' import numpy as np import torch import torch.nn as nn from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence import torch.nn.functional as F from .utils import to_cuda INF = 1e20 VERY_SMALL_NUMBER = 1e-10 class BAMnet(nn.Module): def __init__(self, vocab_size, vocab_embed_size, o_embed_size, \ hidden_size, num_ent_types, num_relations, num_query_words, \ word_emb_dropout=None,\ que_enc_dropout=None,\ ans_enc_dropout=None, \ pre_w2v=None, \ num_hops=1, \ att='add', \ use_cuda=True): super(BAMnet, self).__init__() self.use_cuda = use_cuda self.word_emb_dropout = word_emb_dropout self.que_enc_dropout = que_enc_dropout self.ans_enc_dropout = ans_enc_dropout self.num_hops = num_hops self.hidden_size = hidden_size self.que_enc = SeqEncoder(vocab_size, vocab_embed_size, hidden_size, \ seq_enc_type='lstm', \ word_emb_dropout=word_emb_dropout, bidirectional=True, \ init_word_embed=pre_w2v, use_cuda=use_cuda).que_enc self.ans_enc = AnsEncoder(o_embed_size, hidden_size, \ num_ent_types, num_relations, \ vocab_size=vocab_size, \ vocab_embed_size=vocab_embed_size, \ shared_embed=self.que_enc.embed, \ word_emb_dropout=word_emb_dropout, \ ans_enc_dropout=ans_enc_dropout, \ use_cuda=use_cuda) self.qw_embed = nn.Embedding(num_query_words, o_embed_size // 8, padding_idx=0) self.batchnorm = nn.BatchNorm1d(hidden_size) self.init_atten = Attention(hidden_size, hidden_size, hidden_size, atten_type=att) self.self_atten = SelfAttention_CoAtt(hidden_size) print('[ Using self-attention on question encoder ]') self.memory_hop = RomHop(hidden_size, hidden_size, hidden_size, atten_type=att) print('[ Using {}-hop memory update ]'.format(self.num_hops)) def kb_aware_query_enc(self, memories, queries, query_lengths, ans_mask, ctx_mask=None): # Question encoder Q_r = self.que_enc(queries, query_lengths)[0] if self.que_enc_dropout: Q_r = F.dropout(Q_r, p=self.que_enc_dropout, 
training=self.training) query_mask = create_mask(query_lengths, Q_r.size(1), self.use_cuda) q_r_init = self.self_atten(Q_r, query_lengths, query_mask) # Answer encoder _, _, _, x_type_bow, x_types, x_type_bow_len, x_path_bow, x_paths, x_path_bow_len, x_ctx_ent, x_ctx_ent_len, x_ctx_ent_num, _, _, _, _ = memories ans_comp_val, ans_comp_key = self.ans_enc(x_type_bow, x_types, x_type_bow_len, x_path_bow, x_paths, x_path_bow_len, x_ctx_ent, x_ctx_ent_len, x_ctx_ent_num) if self.ans_enc_dropout: for _ in range(len(ans_comp_key)): ans_comp_key[_] = F.dropout(ans_comp_key[_], p=self.ans_enc_dropout, training=self.training) # KB memory summary ans_comp_atts = [self.init_atten(q_r_init, each, atten_mask=ans_mask) for each in ans_comp_key] if ctx_mask is not None: ans_comp_atts[-1] = ctx_mask * ans_comp_atts[-1] - (1 - ctx_mask) * INF ans_comp_probs = [torch.softmax(each, dim=-1) for each in ans_comp_atts] memory_summary = [] for i, probs in enumerate(ans_comp_probs): memory_summary.append(torch.bmm(probs.unsqueeze(1), ans_comp_val[i])) memory_summary = torch.cat(memory_summary, 1) # Co-attention CoAtt = torch.bmm(Q_r, memory_summary.transpose(1, 2)) # co-attention matrix CoAtt = query_mask.unsqueeze(-1) * CoAtt - (1 - query_mask).unsqueeze(-1) * INF if ctx_mask is not None: # mask over empty ctx elements ctx_mask_global = (ctx_mask.sum(-1, keepdim=True) > 0).float() CoAtt[:, :, -1] = ctx_mask_global * CoAtt[:, :, -1].clone() - (1 - ctx_mask_global) * INF q_att = F.max_pool1d(CoAtt, kernel_size=CoAtt.size(-1)).squeeze(-1) q_att = torch.softmax(q_att, dim=-1) return (ans_comp_val, ans_comp_key), (q_att, Q_r), query_mask def forward(self, memories, queries, query_lengths, query_words, ctx_mask=None): ctx_mask = None mem_hop_scores = [] ans_mask = create_mask(memories[0], memories[2].size(1), self.use_cuda) # Multi-task learning on answer type matching # question word vec self.qw_vec = torch.mean(self.qw_embed(query_words), 1) # answer type vec x_types = memories[4] ans_types = 
torch.mean(self.ans_enc.ent_type_embed(x_types.view(-1, x_types.size(-1))), 1).view(x_types.size(0), x_types.size(1), -1) qw_anstype_loss = torch.bmm(ans_types, self.qw_vec.unsqueeze(2)).squeeze(2) if ans_mask is not None: qw_anstype_loss = ans_mask * qw_anstype_loss - (1 - ans_mask) * INF # Make dummy candidates have large negative scores mem_hop_scores.append(qw_anstype_loss) # Kb-aware question attention module (ans_val, ans_key), (q_att, Q_r), query_mask = self.kb_aware_query_enc(memories, queries, query_lengths, ans_mask, ctx_mask=ctx_mask) ans_val = torch.cat([each.unsqueeze(2) for each in ans_val], 2) ans_key = torch.cat([each.unsqueeze(2) for each in ans_key], 2) q_r = torch.bmm(q_att.unsqueeze(1), Q_r).squeeze(1) mid_score = self.scoring(ans_key.sum(2), q_r, mask=ans_mask) mem_hop_scores.append(mid_score) Q_r, ans_key, ans_val = self.memory_hop(Q_r, ans_key, ans_val, q_att, atten_mask=ans_mask, ctx_mask=ctx_mask, query_mask=query_mask) q_r = torch.bmm(q_att.unsqueeze(1), Q_r).squeeze(1) mid_score = self.scoring(ans_key, q_r, mask=ans_mask) mem_hop_scores.append(mid_score) # Generalization module for _ in range(self.num_hops): q_r_tmp = self.memory_hop.gru_step(q_r, ans_key, ans_val, atten_mask=ans_mask) q_r = self.batchnorm(q_r + q_r_tmp) mid_score = self.scoring(ans_key, q_r, mask=ans_mask) mem_hop_scores.append(mid_score) return mem_hop_scores def premature_score(self, memories, queries, query_lengths, ctx_mask=None): ctx_mask = None ans_mask = create_mask(memories[0], memories[2].size(1), self.use_cuda) # Kb-aware question attention module (ans_val, ans_key), (q_att, Q_r), query_mask = self.kb_aware_query_enc(memories, queries, query_lengths, ans_mask, ctx_mask=ctx_mask) ans_key = torch.cat([each.unsqueeze(2) for each in ans_key], 2) mem_hop_scores = [] q_r = torch.bmm(q_att.unsqueeze(1), Q_r).squeeze(1) score = self.scoring(ans_key.sum(2), q_r, mask=ans_mask) return score def scoring(self, ans_r, q_r, mask=None): score = torch.bmm(ans_r, 
q_r.unsqueeze(2)).squeeze(2) if mask is not None: score = mask * score - (1 - mask) * INF # Make dummy candidates have large negative scores return score class RomHop(nn.Module): def __init__(self, query_embed_size, in_memory_embed_size, hidden_size, atten_type='add'): super(RomHop, self).__init__() self.hidden_size = hidden_size self.gru_linear_z = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.gru_linear_r = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.gru_linear_t = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.gru_atten = Attention(hidden_size, query_embed_size, in_memory_embed_size, atten_type=atten_type) def forward(self, query_embed, in_memory_embed, out_memory_embed, query_att, \ atten_mask=None, ctx_mask=None, query_mask=None): output = self.update_coatt_cat_maxpool(query_embed, in_memory_embed, out_memory_embed, query_att, \ atten_mask=atten_mask, ctx_mask=ctx_mask, query_mask=query_mask) return output def gru_step(self, h_state, in_memory_embed, out_memory_embed, atten_mask=None): attention = self.gru_atten(h_state, in_memory_embed, atten_mask=atten_mask) probs = torch.softmax(attention, dim=-1) memory_output = torch.bmm(probs.unsqueeze(1), out_memory_embed).squeeze(1) # GRU-like memory update z = torch.sigmoid(self.gru_linear_z(torch.cat([h_state, memory_output], -1))) r = torch.sigmoid(self.gru_linear_r(torch.cat([h_state, memory_output], -1))) t = torch.tanh(self.gru_linear_t(torch.cat([r * h_state, memory_output], -1))) output = (1 - z) * h_state + z * t return output def update_coatt_cat_maxpool(self, query_embed, in_memory_embed, out_memory_embed, query_att, atten_mask=None, ctx_mask=None, query_mask=None): attention = torch.bmm(query_embed, in_memory_embed.view(in_memory_embed.size(0), -1, in_memory_embed.size(-1))\ .transpose(1, 2)).view(query_embed.size(0), query_embed.size(1), in_memory_embed.size(1), -1) # bs * N * M * k if ctx_mask is not None: attention[:, :, :, -1] = ctx_mask.unsqueeze(1) * attention[:, :, 
:, -1].clone() - (1 - ctx_mask).unsqueeze(1) * INF if atten_mask is not None: attention = atten_mask.unsqueeze(1).unsqueeze(-1) * attention - (1 - atten_mask).unsqueeze(1).unsqueeze(-1) * INF if query_mask is not None: attention = query_mask.unsqueeze(2).unsqueeze(-1) * attention - (1 - query_mask).unsqueeze(2).unsqueeze(-1) * INF # Importance module kb_feature_att = F.max_pool1d(attention.view(attention.size(0), attention.size(1), -1).transpose(1, 2), kernel_size=attention.size(1)).squeeze(-1).view(attention.size(0), -1, attention.size(-1)) kb_feature_att = torch.softmax(kb_feature_att, dim=-1).view(-1, kb_feature_att.size(-1)).unsqueeze(1) in_memory_embed = torch.bmm(kb_feature_att, in_memory_embed.view(-1, in_memory_embed.size(2), in_memory_embed.size(-1))).squeeze(1).view(in_memory_embed.size(0), in_memory_embed.size(1), -1) out_memory_embed = out_memory_embed.sum(2) # Enhanced module attention = F.max_pool1d(attention.view(attention.size(0), -1, attention.size(-1)), kernel_size=attention.size(-1)).squeeze(-1).view(attention.size(0), attention.size(1), attention.size(2)) probs = torch.softmax(attention, dim=-1) new_query_embed = query_embed + query_att.unsqueeze(2) * torch.bmm(probs, out_memory_embed) probs2 = torch.softmax(attention, dim=1) kb_att = torch.bmm(query_att.unsqueeze(1), probs).squeeze(1) in_memory_embed = in_memory_embed + kb_att.unsqueeze(2) * torch.bmm(probs2.transpose(1, 2), new_query_embed) return new_query_embed, in_memory_embed, out_memory_embed class AnsEncoder(nn.Module): """Answer Encoder""" def __init__(self, o_embed_size, hidden_size, num_ent_types, num_relations, vocab_size=None, \ vocab_embed_size=None, shared_embed=None, word_emb_dropout=None, \ ans_enc_dropout=None, use_cuda=True): super(AnsEncoder, self).__init__() # Cannot have embed and vocab_size set as None at the same time. 
        self.use_cuda = use_cuda
        self.ans_enc_dropout = ans_enc_dropout
        self.hidden_size = hidden_size
        # Entity-type embedding is deliberately small (o_embed_size // 8).
        self.ent_type_embed = nn.Embedding(num_ent_types, o_embed_size // 8, padding_idx=0)
        self.relation_embed = nn.Embedding(num_relations, o_embed_size, padding_idx=0)
        # Share the word embedding with the question encoder when provided.
        self.embed = shared_embed if shared_embed is not None else nn.Embedding(vocab_size, vocab_embed_size, padding_idx=0)
        self.vocab_embed_size = self.embed.weight.data.size(1)
        # Separate key/value projections per answer aspect (type / path / context),
        # used by enc_comp_kv for key-value memory addressing.
        self.linear_type_bow_key = nn.Linear(hidden_size, hidden_size, bias=False)
        self.linear_paths_key = nn.Linear(hidden_size + o_embed_size, hidden_size, bias=False)
        self.linear_ctx_key = nn.Linear(hidden_size, hidden_size, bias=False)
        self.linear_type_bow_val = nn.Linear(hidden_size, hidden_size, bias=False)
        self.linear_paths_val = nn.Linear(hidden_size + o_embed_size, hidden_size, bias=False)
        self.linear_ctx_val = nn.Linear(hidden_size, hidden_size, bias=False)
        # lstm for ans encoder
        self.lstm_enc_type = EncoderRNN(vocab_size, self.vocab_embed_size, hidden_size, \
                dropout=word_emb_dropout, \
                bidirectional=True, \
                shared_embed=shared_embed, \
                rnn_type='lstm', \
                use_cuda=use_cuda)
        self.lstm_enc_path = EncoderRNN(vocab_size, self.vocab_embed_size, hidden_size, \
                dropout=word_emb_dropout, \
                bidirectional=True, \
                shared_embed=shared_embed, \
                rnn_type='lstm', \
                use_cuda=use_cuda)
        self.lstm_enc_ctx = EncoderRNN(vocab_size, self.vocab_embed_size, hidden_size, \
                dropout=word_emb_dropout, \
                bidirectional=True, \
                shared_embed=shared_embed, \
                rnn_type='lstm', \
                use_cuda=use_cuda)

    def forward(self, x_type_bow, x_types, x_type_bow_len, x_path_bow, x_paths, x_path_bow_len, x_ctx_ents, x_ctx_ent_len, x_ctx_ent_num):
        """Encode candidate-answer features and project them into key/value memories."""
        ans_type_bow, ans_types, ans_path_bow, ans_paths, ans_ctx_ent = self.enc_ans_features(x_type_bow, x_types, x_type_bow_len, x_path_bow, x_paths, x_path_bow_len, x_ctx_ents, x_ctx_ent_len, x_ctx_ent_num)
        ans_val, ans_key = self.enc_comp_kv(ans_type_bow, ans_types, ans_path_bow, ans_paths, ans_ctx_ent)
        return ans_val, ans_key

    def enc_comp_kv(self, ans_type_bow, ans_types, ans_path_bow, ans_paths, ans_ctx_ent):
        """Project each answer aspect into a (value, key) pair.

        Returns two parallel 3-element lists: [type, path, context] values and keys.
        Note: ans_types is accepted but unused (enc_ans_features returns None for it).
        """
        ans_type_bow_val = self.linear_type_bow_val(ans_type_bow)
        ans_paths_val = self.linear_paths_val(torch.cat([ans_path_bow, ans_paths], -1))
        ans_ctx_val = self.linear_ctx_val(ans_ctx_ent)
        ans_type_bow_key = self.linear_type_bow_key(ans_type_bow)
        ans_paths_key = self.linear_paths_key(torch.cat([ans_path_bow, ans_paths], -1))
        ans_ctx_key = self.linear_ctx_key(ans_ctx_ent)
        ans_comp_val = [ans_type_bow_val, ans_paths_val, ans_ctx_val]
        ans_comp_key = [ans_type_bow_key, ans_paths_key, ans_ctx_key]
        return ans_comp_val, ans_comp_key

    def enc_ans_features(self, x_type_bow, x_types, x_type_bow_len, x_path_bow, x_paths, x_path_bow_len, x_ctx_ents, x_ctx_ent_len, x_ctx_ent_num):
        '''
        x_types: answer type
        x_paths: answer path, i.e., bow of relation
        x_ctx_ents: answer context, i.e., bow of entity words, (batch_size, num_cands, num_ctx, L)
        '''
        # ans_types = torch.mean(self.ent_type_embed(x_types.view(-1, x_types.size(-1))), 1).view(x_types.size(0), x_types.size(1), -1)
        # Each LSTM encoder is run over the flattened (batch * num_cands) axis,
        # then reshaped back; [1] takes the final hidden state, not the sequence.
        ans_type_bow = (self.lstm_enc_type(x_type_bow.view(-1, x_type_bow.size(-1)), x_type_bow_len.view(-1))[1]).view(x_type_bow.size(0), x_type_bow.size(1), -1)
        ans_path_bow = (self.lstm_enc_path(x_path_bow.view(-1, x_path_bow.size(-1)), x_path_bow_len.view(-1))[1]).view(x_path_bow.size(0), x_path_bow.size(1), -1)
        ans_paths = torch.mean(self.relation_embed(x_paths.view(-1, x_paths.size(-1))), 1).view(x_paths.size(0), x_paths.size(1), -1)
        # Avg over ctx
        ctx_num_mask = create_mask(x_ctx_ent_num.view(-1), x_ctx_ents.size(2), self.use_cuda).view(x_ctx_ent_num.shape + (-1,))
        ans_ctx_ent = (self.lstm_enc_ctx(x_ctx_ents.view(-1, x_ctx_ents.size(-1)), x_ctx_ent_len.view(-1))[1]).view(x_ctx_ents.size(0), x_ctx_ents.size(1), x_ctx_ents.size(2), -1)
        ans_ctx_ent = ctx_num_mask.unsqueeze(-1) * ans_ctx_ent
        # Clamp the divisor so candidates with zero context entities don't divide by 0.
        ans_ctx_ent = torch.sum(ans_ctx_ent, dim=2) / torch.clamp(x_ctx_ent_num.float().unsqueeze(-1), min=VERY_SMALL_NUMBER)
        if self.ans_enc_dropout:
            # ans_types = F.dropout(ans_types, p=self.ans_enc_dropout, training=self.training)
            ans_type_bow = F.dropout(ans_type_bow, p=self.ans_enc_dropout, training=self.training)
            ans_path_bow = F.dropout(ans_path_bow, p=self.ans_enc_dropout, training=self.training)
            ans_paths = F.dropout(ans_paths, p=self.ans_enc_dropout, training=self.training)
            ans_ctx_ent = F.dropout(ans_ctx_ent, p=self.ans_enc_dropout, training=self.training)
        # Second element (ans_types) is intentionally None — the entity-type-id
        # feature is disabled (see commented-out lines above).
        return ans_type_bow, None, ans_path_bow, ans_paths, ans_ctx_ent

class SeqEncoder(object):
    """Question Encoder"""
    # Thin factory: picks an RNN or CNN question encoder; the encoder module
    # itself is exposed as self.que_enc (this class is not an nn.Module).
    def __init__(self, vocab_size, embed_size, hidden_size, \
        seq_enc_type='lstm', word_emb_dropout=None, cnn_kernel_size=[3], bidirectional=False, \
        shared_embed=None, init_word_embed=None, use_cuda=True):
        if seq_enc_type in ('lstm', 'gru'):
            self.que_enc = EncoderRNN(vocab_size, embed_size, hidden_size, \
                    dropout=word_emb_dropout, \
                    bidirectional=bidirectional, \
                    shared_embed=shared_embed, \
                    init_word_embed=init_word_embed, \
                    rnn_type=seq_enc_type, \
                    use_cuda=use_cuda)
        elif seq_enc_type == 'cnn':
            self.que_enc = EncoderCNN(vocab_size, embed_size, hidden_size, \
                    kernel_size=cnn_kernel_size, dropout=word_emb_dropout, \
                    shared_embed=shared_embed, \
                    init_word_embed=init_word_embed, \
                    use_cuda=use_cuda)
        else:
            raise RuntimeError('Unknown SeqEncoder type: {}'.format(seq_enc_type))

class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, dropout=None, \
        bidirectional=False, shared_embed=None, init_word_embed=None, rnn_type='lstm', use_cuda=True):
        super(EncoderRNN, self).__init__()
        if not rnn_type in ('lstm', 'gru'):
            raise RuntimeError('rnn_type is expected to be lstm or gru, got {}'.format(rnn_type))
        if bidirectional:
            print('[ Using bidirectional {} encoder ]'.format(rnn_type))
        else:
            print('[ Using {} encoder ]'.format(rnn_type))
        # hidden_size is split across the two directions, so it must be even.
        if bidirectional and hidden_size % 2 != 0:
            raise RuntimeError('hidden_size is expected to be even in the bidirectional mode!')
        self.dropout = dropout
        self.rnn_type = rnn_type
        self.use_cuda = use_cuda
self.hidden_size = hidden_size // 2 if bidirectional else hidden_size self.num_directions = 2 if bidirectional else 1 self.embed = shared_embed if shared_embed is not None else nn.Embedding(vocab_size, embed_size, padding_idx=0) model = nn.LSTM if rnn_type == 'lstm' else nn.GRU self.model = model(embed_size, self.hidden_size, 1, batch_first=True, bidirectional=bidirectional) if shared_embed is None: self.init_weights(init_word_embed) def init_weights(self, init_word_embed): if init_word_embed is not None: print('[ Using pretrained word embeddings ]') self.embed.weight.data.copy_(torch.from_numpy(init_word_embed)) else: self.embed.weight.data.uniform_(-0.08, 0.08) def forward(self, x, x_len): """x: [batch_size * max_length] x_len: [batch_size] """ x = self.embed(x) if self.dropout: x = F.dropout(x, p=self.dropout, training=self.training) sorted_x_len, indx = torch.sort(x_len, 0, descending=True) x = pack_padded_sequence(x[indx], sorted_x_len.data.tolist(), batch_first=True) h0 = to_cuda(torch.zeros(self.num_directions, x_len.size(0), self.hidden_size), self.use_cuda) if self.rnn_type == 'lstm': c0 = to_cuda(torch.zeros(self.num_directions, x_len.size(0), self.hidden_size), self.use_cuda) packed_h, (packed_h_t, _) = self.model(x, (h0, c0)) if self.num_directions == 2: packed_h_t = torch.cat([packed_h_t[i] for i in range(packed_h_t.size(0))], -1) else: packed_h, packed_h_t = self.model(x, h0) if self.num_directions == 2: packed_h_t = packed_h_t.transpose(0, 1).contiguous().view(query_lengths.size(0), -1) hh, _ = pad_packed_sequence(packed_h, batch_first=True) # restore the sorting _, inverse_indx = torch.sort(indx, 0) restore_hh = hh[inverse_indx] restore_packed_h_t = packed_h_t[inverse_indx] return restore_hh, restore_packed_h_t class EncoderCNN(nn.Module): def __init__(self, vocab_size, embed_size, hidden_size, kernel_size=[2, 3], \ dropout=None, shared_embed=None, init_word_embed=None, use_cuda=True): super(EncoderCNN, self).__init__() print('[ Using CNN encoder 
with kernel size: {} ]'.format(kernel_size)) self.use_cuda = use_cuda self.dropout = dropout self.embed = shared_embed if shared_embed is not None else nn.Embedding(vocab_size, embed_size, padding_idx=0) self.cnns = nn.ModuleList([nn.Conv1d(embed_size, hidden_size, kernel_size=k, padding=k-1) for k in kernel_size]) if len(kernel_size) > 1: self.fc = nn.Linear(len(kernel_size) * hidden_size, hidden_size) if shared_embed is None: self.init_weights(init_word_embed) def init_weights(self, init_word_embed): if init_word_embed is not None: print('[ Using pretrained word embeddings ]') self.embed.weight.data.copy_(torch.from_numpy(init_word_embed)) else: self.embed.weight.data.uniform_(-0.08, 0.08) def forward(self, x, x_len=None): """x: [batch_size * max_length] x_len: reserved """ x = self.embed(x) if self.dropout: x = F.dropout(x, p=self.dropout, training=self.training) # Trun(batch_size, seq_len, embed_size) to (batch_size, embed_size, seq_len) for cnn1d x = x.transpose(1, 2) z = [conv(x) for conv in self.cnns] output = [F.max_pool1d(i, kernel_size=i.size(-1)).squeeze(-1) for i in z] if len(output) > 1: output = self.fc(torch.cat(output, -1)) else: output = output[0] return None, output class Attention(nn.Module): def __init__(self, hidden_size, h_state_embed_size=None, in_memory_embed_size=None, atten_type='simple'): super(Attention, self).__init__() self.atten_type = atten_type if not h_state_embed_size: h_state_embed_size = hidden_size if not in_memory_embed_size: in_memory_embed_size = hidden_size if atten_type in ('mul', 'add'): self.W = torch.Tensor(h_state_embed_size, hidden_size) self.W = nn.Parameter(nn.init.xavier_uniform_(self.W)) if atten_type == 'add': self.W2 = torch.Tensor(in_memory_embed_size, hidden_size) self.W2 = nn.Parameter(nn.init.xavier_uniform_(self.W2)) self.W3 = torch.Tensor(hidden_size, 1) self.W3 = nn.Parameter(nn.init.xavier_uniform_(self.W3)) elif atten_type == 'simple': pass else: raise RuntimeError('Unknown atten_type: 
{}'.format(self.atten_type)) def forward(self, query_embed, in_memory_embed, atten_mask=None): if self.atten_type == 'simple': # simple attention attention = torch.bmm(in_memory_embed, query_embed.unsqueeze(2)).squeeze(2) elif self.atten_type == 'mul': # multiplicative attention attention = torch.bmm(in_memory_embed, torch.mm(query_embed, self.W).unsqueeze(2)).squeeze(2) elif self.atten_type == 'add': # additive attention attention = torch.tanh(torch.mm(in_memory_embed.view(-1, in_memory_embed.size(-1)), self.W2)\ .view(in_memory_embed.size(0), -1, self.W2.size(-1)) \ + torch.mm(query_embed, self.W).unsqueeze(1)) attention = torch.mm(attention.view(-1, attention.size(-1)), self.W3).view(attention.size(0), -1) else: raise RuntimeError('Unknown atten_type: {}'.format(self.atten_type)) if atten_mask is not None: # Exclude masked elements from the softmax attention = atten_mask * attention - (1 - atten_mask) * INF return attention class SelfAttention_CoAtt(nn.Module): def __init__(self, hidden_size, use_cuda=True): super(SelfAttention_CoAtt, self).__init__() self.use_cuda = use_cuda self.hidden_size = hidden_size self.model = nn.LSTM(2 * hidden_size, hidden_size // 2, batch_first=True, bidirectional=True) def forward(self, x, x_len, atten_mask): CoAtt = torch.bmm(x, x.transpose(1, 2)) CoAtt = atten_mask.unsqueeze(1) * CoAtt - (1 - atten_mask).unsqueeze(1) * INF CoAtt = torch.softmax(CoAtt, dim=-1) new_x = torch.cat([torch.bmm(CoAtt, x), x], -1) sorted_x_len, indx = torch.sort(x_len, 0, descending=True) new_x = pack_padded_sequence(new_x[indx], sorted_x_len.data.tolist(), batch_first=True) h0 = to_cuda(torch.zeros(2, x_len.size(0), self.hidden_size // 2), self.use_cuda) c0 = to_cuda(torch.zeros(2, x_len.size(0), self.hidden_size // 2), self.use_cuda) packed_h, (packed_h_t, _) = self.model(new_x, (h0, c0)) # restore the sorting _, inverse_indx = torch.sort(indx, 0) packed_h_t = torch.cat([packed_h_t[i] for i in range(packed_h_t.size(0))], -1) restore_packed_h_t = 
packed_h_t[inverse_indx] output = restore_packed_h_t return output def create_mask(x, N, use_cuda=True): x = x.data mask = np.zeros((x.size(0), N)) for i in range(x.size(0)): mask[i, :x[i]] = 1 return to_cuda(torch.Tensor(mask), use_cuda) ================================================ FILE: src/core/bamnet/utils.py ================================================ ''' Created on Oct, 2017 @author: hugo ''' import torch from torch.autograd import Variable import numpy as np def to_cuda(x, use_cuda=True): if use_cuda and torch.cuda.is_available(): x = x.cuda() return x # One pass over the dataset def next_batch(memories, queries, query_words, raw_queries, query_mentions, query_lengths, gold_ans_inds, batch_size): for i in range(0, len(memories), batch_size): yield (memories[i: i + batch_size], queries[i: i + batch_size], query_words[i: i + batch_size], raw_queries[i: i + batch_size], query_mentions[i: i + batch_size], query_lengths[i: i + batch_size]), gold_ans_inds[i: i + batch_size] # One pass over the dataset def next_ent_batch(memories, queries, query_lengths, gold_inds, batch_size): for i in range(0, len(memories), batch_size): yield (memories[i: i + batch_size], queries[i: i + batch_size], query_lengths[i: i + batch_size]), gold_inds[i: i + batch_size] ================================================ FILE: src/core/build_data/__init__.py ================================================ ''' Created on Oct, 2017 @author: hugo ''' ================================================ FILE: src/core/build_data/build_all.py ================================================ ''' Created on Oct, 2017 @author: hugo ''' import os from . 
import utils as build_utils from ..utils.utils import * from .build_data import build_vocab, build_data def build(dpath, version=None, out_dir=None): if not build_utils.built(dpath, version_string=version): raise RuntimeError("Please build/preprocess the data by running the build_all_data.py script!") ================================================ FILE: src/core/build_data/build_data.py ================================================ ''' Created on Sep, 2017 @author: hugo ''' import os import math import argparse from itertools import count from rapidfuzz import fuzz, process from collections import defaultdict from ..utils.utils import * from ..utils.generic_utils import normalize_answer, unique from ..utils.freebase_utils import if_filterout from .. import config IGNORE_DUMMY = True ENT_TYPE_HOP = 1 # Entity mention types: 'NP', 'ORGANIZATION', 'DATE', 'NUMBER', 'MISC', 'ORDINAL', 'DURATION', 'PERSON', 'TIME', 'LOCATION' def build_kb_data(kb, used_fbkeys=None): entities = defaultdict(int) entity_types = defaultdict(int) relations = defaultdict(int) vocabs = defaultdict(int) if not used_fbkeys: used_fbkeys = kb.keys() for k in used_fbkeys: if not k in kb: continue v = kb[k] entities[v['id']] += 1 # We prefer notable_types than type since they are more representative. # If notable_types are not available, we use only the first available type. # We found the type field contains much noise. 
selected_types = (v['notable_types'] + v['type'])[:ENT_TYPE_HOP] for ent_type in selected_types: entity_types[ent_type] += 1 for token in [y for x in selected_types for y in x.lower().split('/')[-1].split('_')]: vocabs[token] += 1 # Add entity vocabs selected_names = v['name'][:1] + v['alias'] # We need all topic entity alias for token in [y for x in selected_names for y in tokenize(x.lower())]: vocabs[token] += 1 if not 'neighbors' in v: continue for kk, vv in v['neighbors'].items(): # 1st hop if if_filterout(kk): continue relations[kk] += 1 # Add relation vocabs for token in [x for x in kk.lower().split('/')[-1].split('_')]: vocabs[token] += 1 for nbr in vv: if isinstance(nbr, str): for token in [y for y in tokenize(nbr.lower())]: vocabs[token] += 1 continue elif isinstance(nbr, bool): continue elif isinstance(nbr, float): continue # vocabs.update([y for y in tokenize(str(nbr).lower())]) elif isinstance(nbr, dict): nbr_k = list(nbr.keys())[0] nbr_v = nbr[nbr_k] entities[nbr_k] += 1 selected_types = (nbr_v['notable_types'] + nbr_v['type'])[:ENT_TYPE_HOP] for ent_type in selected_types: entity_types[ent_type] += 1 selected_names = (nbr_v['name'] + nbr_v['alias'])[:1] for token in [y for x in selected_names for y in tokenize(x.lower())] + \ [y for x in selected_types for y in x.lower().split('/')[-1].split('_')]: vocabs[token] += 1 if not 'neighbors' in nbr_v: continue for kkk, vvv in nbr_v['neighbors'].items(): # 2nd hop if if_filterout(kkk): continue relations[kkk] += 1 # Add relation vocabs for token in [x for x in kkk.lower().split('/')[-1].split('_')]: vocabs[token] += 1 for nbr_nbr in vvv: if isinstance(nbr_nbr, str): for token in [y for y in tokenize(nbr_nbr.lower())]: vocabs[token] += 1 continue elif isinstance(nbr_nbr, bool): continue elif isinstance(nbr_nbr, float): # vocabs.update([y for y in tokenize(str(nbr_nbr).lower())]) continue elif isinstance(nbr_nbr, dict): nbr_nbr_k = list(nbr_nbr.keys())[0] nbr_nbr_v = nbr_nbr[nbr_nbr_k] entities[nbr_nbr_k] += 1 
selected_types = (nbr_nbr_v['notable_types'] + nbr_nbr_v['type'])[:ENT_TYPE_HOP] for ent_type in selected_types: entity_types[ent_type] += 1 selected_names = (nbr_nbr_v['name'] + nbr_nbr_v['alias'])[:1] for token in [y for x in selected_names for y in tokenize(x.lower())] + \ [y for x in selected_types for y in x.lower().split('/')[-1].split('_')]: vocabs[token] += 1 else: raise RuntimeError('Unknown type: %s' % type(nbr_nbr)) else: raise RuntimeError('Unknown type: %s' % type(nbr)) return (entities, entity_types, relations, vocabs) def build_qa_vocab(qa): vocabs = defaultdict(int) for each in qa: for token in tokenize(each['qText'].lower()): vocabs[token] += 1 return vocabs def delex_query_topic_ent(query, topic_ent, ent_types): query = tokenize(query.lower()) if topic_ent == '': return query, None ent_type_dict = {} for ent, type_ in ent_types: if ent not in ent_type_dict: ent_type_dict[ent] = type_ else: if ent_type_dict[ent] == 'NP': ent_type_dict[ent] = type_ ret = process.extract(topic_ent.replace('_', ' '), set(list(zip(*ent_types))[0]), scorer=fuzz.token_sort_ratio) if len(ret) == 0: return query, None # We prefer Non-NP entity mentions # e.g., we prefer `uk` than `people in the uk` when matching `united_kingdom` topic_men = None topic_score = None for token, score in ret: if ent_type_dict[token].lower() in config.topic_mention_types: topic_men = token topic_score = score break if topic_men is None: return query, None topic_ent_type = ent_type_dict[topic_men].lower() topic_tokens = tokenize(topic_men.lower()) indices = [i for i, x in enumerate(query) if x == topic_tokens[0]] for i in indices: if query[i: i + len(topic_tokens)] == topic_tokens: start_idx = i end_idx = i + len(topic_tokens) break query_template = query[:start_idx] + [topic_ent_type] + query[end_idx:] return query_template, topic_men def delex_query(query, ent_mens, mention_types): for men, type_ in ent_mens: type_ = type_.lower() if type_ in mention_types: men = tokenize(men.lower()) indices 
= [i for i, x in enumerate(query) if x == men[0]] start_idx = None for i in indices: if query[i: i + len(men)] == men: start_idx = i end_idx = i + len(men) break if start_idx is not None: query = query[:start_idx] + ['__{}__'.format(type_)] + query[end_idx:] return query def build_data(qa, kb, entity2id, entityType2id, relation2id, vocab2id, pred_seed_ents=None): queries = [] raw_queries = [] query_mentions = [] memories = [] cand_labels = [] # Candidate answer labels (i.e., names) gold_ans_labels = [] # True gold answer labels gold_ans_inds = [] # The "gold" answer indices corresponding to the cand list for qid, each in enumerate(qa): freebase_key = each['freebaseKey'] if not pred_seed_ents else pred_seed_ents[qid] if isinstance(freebase_key, list): freebase_key = freebase_key[0] if len(freebase_key) > 0 else '' # Convert query to query template query, topic_men = delex_query_topic_ent(each['qText'], freebase_key, each['entities']) query2 = delex_query(query, each['entities'], config.delex_mention_types) q = [vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for x in query2] queries.append(q) raw_queries.append(query) query_mentions.append([(tokenize(x[0].lower()), x[1].lower()) for x in each['entities'] if topic_men != x[0]]) gold_ans_labels.append(each['answers']) if not freebase_key in kb: gold_ans_inds.append([]) memories.append([[]] * 8) cand_labels.append([]) continue ans_cands = build_ans_cands(kb[freebase_key], entity2id, entityType2id, relation2id, vocab2id) memories.append(ans_cands[:-1]) cand_labels.append(ans_cands[-1]) if len(ans_cands[0]) == 0: gold_ans_inds.append([]) continue norm_cand_labels = [normalize_answer(x) for x in ans_cands[-1]] tmp_cand_inds = [] for a in each['answers']: a = normalize_answer(a) # Find all the candidiate answers which match the gold answer. 
            inds = [i for i, j in zip(count(), norm_cand_labels) if j == a]
            tmp_cand_inds.extend(inds)
        # Note that tmp_cand_inds can be empty in which case
        # the question can *NOT* be answered by this KB entity.
        gold_ans_inds.append(tmp_cand_inds)
    return (queries, raw_queries, query_mentions, memories, cand_labels, gold_ans_inds, gold_ans_labels)

def build_vocab(data, freebase, used_fbkeys=None, min_freq=1):
    """Build id maps (entity2id, entityType2id, relation2id, vocab2id) from QA data and the KB.

    Items occurring fewer than min_freq times are dropped; reserved ids from
    config always occupy the lowest indices.
    """
    entities, entity_types, relations, kb_vocabs = build_kb_data(freebase, used_fbkeys)
    # Entity
    all_entities = set({ent for ent in entities if entities[ent] >= min_freq})
    entity2id = dict(zip(all_entities, range(len(config.RESERVED_ENTS), len(all_entities) + len(config.RESERVED_ENTS))))
    for ent, idx in config.RESERVED_ENTS.items():
        entity2id.update({ent: idx})
    # Entity type
    all_ent_types = set({ent_type for ent_type in entity_types if entity_types[ent_type] >= min_freq})
    all_ent_types.update(config.extra_ent_types)
    entityType2id = dict(zip(all_ent_types, range(len(config.RESERVED_ENT_TYPES), len(all_ent_types) + len(config.RESERVED_ENT_TYPES))))
    for ent_type, idx in config.RESERVED_ENT_TYPES.items():
        entityType2id.update({ent_type: idx})
    # Relation
    all_relations = set({rel for rel in relations if relations[rel] >= min_freq})
    all_relations.update(config.extra_rels)
    relation2id = dict(zip(all_relations, range(len(config.RESERVED_RELS), len(all_relations) + len(config.RESERVED_RELS))))
    for rel, idx in config.RESERVED_RELS.items():
        relation2id.update({rel: idx})
    # Vocab
    vocabs = build_qa_vocab(data)
    # NOTE(review): the loop variable `count` below shadows itertools.count
    # within this function only; build_data above still sees the import.
    for token, count in kb_vocabs.items():
        vocabs[token] += count
    # sorted_vocabs = sorted(vocabs.items(), key=lambda d:d[1], reverse=True)
    all_tokens = set({token for token in vocabs if vocabs[token] >= min_freq})
    all_tokens.update(config.extra_vocab_tokens)
    vocab2id = dict(zip(all_tokens, range(len(config.RESERVED_TOKENS), len(all_tokens) + len(config.RESERVED_TOKENS))))
    for token, idx in config.RESERVED_TOKENS.items():
        vocab2id.update({token: idx})
    print('Num of entities: %s' % len(entity2id))
    print('Num of entity_types: %s' % len(entityType2id))
    print('Num of relations: %s' % len(relation2id))
    print('Num of vocabs: %s' % len(vocab2id))
    return entity2id, entityType2id, relation2id, vocab2id

def build_ans_cands(graph, entity2id, entityType2id, relation2id, vocab2id):
    """Enumerate candidate answers (aliases plus 1- and 2-hop neighbors) of a topic entity.

    Returns nine parallel lists; the last one (cand_labels) carries the
    human-readable candidate names and the rest are id/bow feature lists.
    """
    cand_ans_bows = [] # bow of answer entity
    cand_ans_entities = [] # answer entity
    cand_ans_types = [] # type of answer entity
    cand_ans_type_bows = [] # bow of answer entity type
    cand_ans_paths = [] # relation path from topic entity to answer entity
    cand_ans_path_bows = []
    cand_ans_ctx = [] # context (i.e., 1-hop entity bows and relation bows) connects to the answer path
    cand_ans_topic_key_type = [] # topic key entity type
    cand_labels = [] # candidate answers
    selected_types = (graph['notable_types'] + graph['type'])[:ENT_TYPE_HOP]
    topic_key_ent_type_bows = [vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for y in selected_types for x in y.lower().split('/')[-1].split('_')]
    topic_key_ent_type = [entityType2id[x] if x in entityType2id else config.RESERVED_ENT_TYPES['UNK'] for x in selected_types]
    # We only consider the alias relations of topic entities
    for each in graph['alias']:
        cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
        ent_bow = [vocab2id[y] if y in vocab2id else config.RESERVED_TOKENS['UNK'] for y in tokenize(each.lower())]
        cand_ans_bows.append(ent_bow)
        cand_ans_entities.append(config.RESERVED_ENTS['PAD'])
        cand_ans_types.append([])
        cand_ans_type_bows.append([])
        cand_ans_paths.append([relation2id['alias'] if 'alias' in relation2id else config.RESERVED_RELS['UNK']])
        cand_ans_path_bows.append([vocab2id['alias']])
        # We do not count the topic_entity as context since it is trivial
        cand_ans_ctx.append([[], []])
        cand_labels.append(each)
    if len(cand_labels) == 0 and (not 'neighbors' in graph or len(graph['neighbors']) == 0):
        # No aliases and no neighbors: nothing to rank against.
        return ([], [], [], [], [], [], [], [], [])
    for k, v in graph['neighbors'].items():
        if if_filterout(k):
            continue
        k_bow = [vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for x in k.lower().split('/')[-1].split('_')]
        for nbr in v:
            if isinstance(nbr, str):
                # 1-hop literal (string) answer candidate.
                cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                ent_bow = [vocab2id[y] if y in vocab2id else config.RESERVED_TOKENS['UNK'] for y in tokenize(nbr.lower())]
                cand_ans_bows.append(ent_bow)
                cand_ans_entities.append(config.RESERVED_ENTS['PAD'])
                cand_ans_types.append([])
                cand_ans_type_bows.append([])
                cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK']])
                cand_ans_path_bows.append(k_bow)
                cand_ans_ctx.append([[], []])
                cand_labels.append(nbr)
                continue
            elif isinstance(nbr, bool):
                # 1-hop boolean answer candidate.
                cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                cand_ans_bows.append([vocab2id['true' if nbr else 'false']])
                cand_ans_entities.append(config.RESERVED_ENTS['PAD'])
                cand_ans_types.append([entityType2id['bool']])
                cand_ans_type_bows.append([vocab2id['bool']])
                cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK']])
                cand_ans_path_bows.append(k_bow)
                cand_ans_ctx.append([[], []])
                cand_labels.append('true' if nbr else 'false')
                continue
            elif isinstance(nbr, float):
                # 1-hop numeric answer candidate.
                cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                cand_ans_bows.append([vocab2id[str(nbr)] if str(nbr) in vocab2id else config.RESERVED_TOKENS['UNK']])
                cand_ans_entities.append(config.RESERVED_ENTS['PAD'])
                cand_ans_types.append([entityType2id['num']])
                cand_ans_type_bows.append([vocab2id['num']])
                cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK']])
                cand_ans_path_bows.append(k_bow)
                cand_ans_ctx.append([[], []])
                cand_labels.append(str(nbr))
                continue
            elif isinstance(nbr, dict):
                # 1-hop entity node; may itself be a candidate and/or expand to 2-hop candidates.
                nbr_k = list(nbr.keys())[0]
                nbr_v = nbr[nbr_k]
                selected_names = (nbr_v['name'] + nbr_v['alias'])[:1]
                is_dummy = True
                if not IGNORE_DUMMY or len(selected_names) > 0:
                    # Otherwise, it is an intermediate (dummy) node
                    # Named 1-hop entity: emit it as a candidate answer.
                    cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                    nbr_k_bow = [vocab2id[y] if y in vocab2id else config.RESERVED_TOKENS['UNK'] for x in selected_names for y in tokenize(x.lower())]
                    cand_ans_bows.append(nbr_k_bow)
                    cand_ans_entities.append(entity2id[nbr_k] if nbr_k in entity2id else config.RESERVED_ENTS['UNK'])
                    selected_types = (nbr_v['notable_types'] + nbr_v['type'])[:ENT_TYPE_HOP]
                    cand_ans_types.append([entityType2id[x] if x in entityType2id else config.RESERVED_ENT_TYPES['UNK'] for x in selected_types])
                    cand_ans_type_bows.append([vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for y in selected_types for x in y.lower().split('/')[-1].split('_')])
                    cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK']])
                    cand_ans_path_bows.append(k_bow)
                    cand_labels.append(selected_names[0] if len(selected_names) > 0 else 'UNK')
                    is_dummy = False
                if not 'neighbors' in nbr_v:
                    if not is_dummy:
                        cand_ans_ctx.append([[], []])
                    continue
                # Expand the 2nd hop; collect labels/relations to build per-candidate context.
                rels = []
                labels = []
                all_ctx = [set(), set()]  # [entity-name context, relation context]
                for kk, vv in nbr_v['neighbors'].items(): # 2nd hop
                    if if_filterout(kk):
                        continue
                    kk_bow = [vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for x in kk.lower().split('/')[-1].split('_')]
                    all_ctx[1].add(kk)
                    for nbr_nbr in vv:
                        if isinstance(nbr_nbr, str):
                            # 2-hop literal candidate; path is [k, kk], path bow concatenated.
                            cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                            ent_bow = [vocab2id[y] if y in vocab2id else config.RESERVED_TOKENS['UNK'] for y in tokenize(nbr_nbr.lower())]
                            cand_ans_bows.append(ent_bow)
                            cand_ans_entities.append(config.RESERVED_ENTS['PAD'])
                            cand_ans_types.append([])
                            cand_ans_type_bows.append([])
                            cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK'], relation2id[kk] if kk in relation2id else config.RESERVED_RELS['UNK']])
                            cand_ans_path_bows.append(kk_bow + k_bow)
                            labels.append(nbr_nbr)
                            all_ctx[0].add(nbr_nbr)
                            rels.append(kk)
                            continue
                        elif isinstance(nbr_nbr, bool):
                            cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                            cand_ans_bows.append([vocab2id['true' if nbr_nbr else 'false']])
                            cand_ans_entities.append(config.RESERVED_ENTS['PAD'])
                            cand_ans_types.append([entityType2id['bool']])
                            cand_ans_type_bows.append([vocab2id['bool']])
                            cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK'], relation2id[kk] if kk in relation2id else config.RESERVED_RELS['UNK']])
                            cand_ans_path_bows.append(kk_bow + k_bow)
                            labels.append('true' if nbr_nbr else 'false')
                            all_ctx[0].add('true' if nbr_nbr else 'false')
                            rels.append(kk)
                            continue
                        elif isinstance(nbr_nbr, float):
                            cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                            cand_ans_bows.append([vocab2id[str(nbr_nbr)] if str(nbr_nbr) in vocab2id else config.RESERVED_TOKENS['UNK']])
                            cand_ans_entities.append(config.RESERVED_ENTS['PAD'])
                            cand_ans_types.append([entityType2id['num']])
                            cand_ans_type_bows.append([vocab2id['num']])
                            cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK'], relation2id[kk] if kk in relation2id else config.RESERVED_RELS['UNK']])
                            cand_ans_path_bows.append(kk_bow + k_bow)
                            labels.append(str(nbr_nbr))
                            all_ctx[0].add(str(nbr_nbr))
                            rels.append(kk)
                            continue
                        elif isinstance(nbr_nbr, dict):
                            nbr_nbr_k = list(nbr_nbr.keys())[0]
                            nbr_nbr_v = nbr_nbr[nbr_nbr_k]
                            selected_names = (nbr_nbr_v['name'] + nbr_nbr_v['alias'])[:1]
                            if not IGNORE_DUMMY or len(selected_names) > 0:
                                # Named 2-hop entity candidate.
                                cand_ans_topic_key_type.append([topic_key_ent_type_bows, topic_key_ent_type])
                                ent_bow = [vocab2id[y] if y in vocab2id else config.RESERVED_TOKENS['UNK'] for x in selected_names for y in tokenize(x.lower())]
                                cand_ans_bows.append(ent_bow)
                                cand_ans_entities.append(entity2id[nbr_nbr_k] if nbr_nbr_k in entity2id else config.RESERVED_ENTS['UNK'])
                                selected_types = (nbr_nbr_v['notable_types'] + nbr_nbr_v['type'])[:ENT_TYPE_HOP]
                                cand_ans_types.append([entityType2id[x] if x in entityType2id else config.RESERVED_ENT_TYPES['UNK'] for x in selected_types])
                                cand_ans_type_bows.append([vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for y in selected_types for x in y.lower().split('/')[-1].split('_')])
                                cand_ans_paths.append([relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK'], relation2id[kk] if kk in relation2id else config.RESERVED_RELS['UNK']])
                                cand_ans_path_bows.append(kk_bow + k_bow)
                                labels.append(selected_names[0] if len(selected_names) > 0 else 'UNK')
                                if len(selected_names) > 0:
                                    all_ctx[0].add(selected_names[0])
                                rels.append(kk)
                        else:
                            raise RuntimeError('Unknown type: %s' % type(nbr_nbr))
                assert len(labels) == len(rels)
                if not is_dummy:
                    # Context for the 1-hop candidate itself: all 2-hop entity names.
                    ctx_ent_bow = [tokenize(x.lower()) for x in all_ctx[0]]
                    # ctx_rel_bow = list(set([vocab2id[y] for x in all_ctx[1] for y in x.lower().split('/')[-1].split('_') if y in vocab2id]))
                    ctx_rel_bow = []
                    cand_ans_ctx.append([ctx_ent_bow, ctx_rel_bow])
                for i in range(len(labels)):
                    # Context for each 2-hop candidate: sibling entity names (itself excluded).
                    tmp_ent_names = all_ctx[0] - set([labels[i]])
                    # tmp_rel_names = all_ctx[1] - set([rels[i]])
                    ctx_ent_bow = [tokenize(x.lower()) for x in tmp_ent_names]
                    # ctx_rel_bow = list(set([vocab2id[y] for x in tmp_rel_names for y in x.lower().split('/')[-1].split('_') if y in vocab2id]))
                    ctx_rel_bow = []
                    cand_ans_ctx.append([ctx_ent_bow, ctx_rel_bow])
                cand_labels.extend(labels)
            else:
                raise RuntimeError('Unknown type: %s' % type(nbr))
    # All nine feature lists must stay aligned, one entry per candidate.
    assert len(cand_ans_bows) == len(cand_ans_entities) == len(cand_ans_types) == len(cand_ans_type_bows) == len(cand_ans_paths) \
        == len(cand_ans_ctx) == len(cand_labels) == len(cand_ans_topic_key_type) == len(cand_ans_path_bows)
    return (cand_ans_bows, cand_ans_entities, cand_ans_type_bows, cand_ans_types, cand_ans_path_bows, cand_ans_paths, cand_ans_ctx, cand_ans_topic_key_type, cand_labels)

# Build seed entity candidates for topic entity classification
def build_seed_ent_data(qa, kb, entity2id, entityType2id, relation2id, vocab2id, topn, dtype):
    """Build features/labels for ranking the top-n seed (topic) entity candidates per question."""
    queries = []
    seed_ent_features = []
    seed_ent_labels = []
seed_ent_inds = [] for each in qa: query = tokenize(each['qText'].lower()) q = [vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for x in query] queries.append(q) tmp_features = [] tmp_labels = [] tmp_inds = [] for i, freebase_key in enumerate(each['freebaseKeyCands'][:topn]): tmp_labels.append(freebase_key) if freebase_key == each['freebaseKey']: tmp_inds.append(i) if freebase_key in kb: features = build_seed_entity_feature(freebase_key, kb[freebase_key], entity2id, entityType2id, relation2id, vocab2id) tmp_features.append(features) else: tmp_features.append([[]] * 5) if dtype == 'test': if len(tmp_inds) == 0: # No answer tmp_inds.append(-1) else: assert len(tmp_labels) == topn assert len(tmp_inds) == 1 seed_ent_features.append(list(zip(*tmp_features))) seed_ent_labels.append(tmp_labels) seed_ent_inds.append(tmp_inds) return (queries, seed_ent_features, seed_ent_labels, seed_ent_inds) def build_seed_entity_feature(seed_ent, graph, entity2id, entityType2id, relation2id, vocab2id): # candidate seed entity features: # entity name # entity type # entity neighboring relations selected_names = (graph['name'] + graph['alias'])[:1] seed_ent_name = [vocab2id[y] if y in vocab2id else config.RESERVED_TOKENS['UNK'] for x in selected_names for y in tokenize(x.lower())] selected_types = (graph['notable_types'] + graph['type'])[:ENT_TYPE_HOP] seed_ent_type_name = [vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for y in selected_types for x in y.lower().split('/')[-1].split('_')] seed_ent_type = [entityType2id[x] if x in entityType2id else config.RESERVED_ENT_TYPES['UNK'] for x in selected_types] seed_rel_names = [] seed_rels = [] for k in graph['neighbors']: if if_filterout(k): continue k_bow = [vocab2id[x] if x in vocab2id else config.RESERVED_TOKENS['UNK'] for x in k.lower().split('/')[-1].split('_')] seed_rel_names.append(k_bow) seed_rels.append(relation2id[k] if k in relation2id else config.RESERVED_RELS['UNK']) return (seed_ent_name, 
def _extract_meta(properties):
    """Pull the name/alias/notable_types/type lists out of a Freebase
    'property' map.

    Shared by ``fetch_meta`` and ``fetch`` (previously duplicated verbatim in
    both). Missing properties yield empty lists.
    """
    def _values(key, field):
        if key in properties:
            return [x[field] for x in properties[key]['values']]
        return []
    return {
        'name': _values('/type/object/name', 'value'),
        'alias': _values('/common/topic/alias', 'value'),
        'notable_types': _values('/common/topic/notable_types', 'id'),
        'type': _values('/type/object/type', 'id'),
    }

def fetch_meta(path):
    """Load cached topic metadata from a gzipped Freebase JSON dump.

    Returns ``{}`` when the file is missing or unreadable; raises KeyError if
    the file loads but has no 'property' map (same as the original behavior).
    """
    try:
        data = load_gzip_json(path)
    except Exception:  # was a bare except: unreadable/missing cache -> no meta
        return {}
    return _extract_meta(data['property'])

def fetch(data, data_dir):
    """Recursively convert one Freebase API value into a subgraph dict.

    Leaf values (no 'id') are returned as-is via ``data['value']``. For
    entities, metadata is read from the per-mid cache file in ``data_dir``
    when available, otherwise extracted from the inline 'property' map.
    Returns ``{mid: meta}`` where ``meta`` may carry a 'neighbors' map of
    relation -> list of recursively fetched subgraphs.
    """
    if not 'id' in data:
        return data['value']
    mid = data['id']
    # meta data might not be in the subgraph, get it from target files
    meta = fetch_meta(os.path.join(data_dir, '{}.json.gz'.format(mid.strip('/').replace('/', '.'))))
    if meta == {}:
        if not 'property' in data:
            if 'text' in data:
                return data['text']
            # Was: import pdb; pdb.set_trace() -- leftover debug hook. Fail
            # loudly instead of dropping into a debugger in library code.
            raise RuntimeError('No metadata available for mid: %s' % mid)
        meta = _extract_meta(data['property'])
    graph = {mid: meta}
    if not 'property' in data: # we stop at the 2nd hop
        return graph
    properties = data['property']
    neighbors = {}
    for k, v in properties.items():
        # Skip schema/bookkeeping predicates.
        if k.startswith(('/common', '/type', '/freebase', '/user', '/imdb')):
            continue
        if len(v['values']) > 0:
            neighbors[k] = [fetch(nbr, data_dir) for nbr in v['values']]
    graph[mid]['neighbors'] = neighbors
    return graph

RESERVED_TOKENS = {'PAD': 0, 'UNK': 1}

def built(path, version_string=None):
    """Checks if 'built.log' flag has been set for that task.
    If a version_string is provided, this has to match, or the version
    is regarded as not built.
    """
    if version_string:
        fname = os.path.join(path, 'built.log')
        if not os.path.isfile(fname):
            return False
        else:
            with open(fname, 'r') as read:
                text = read.read().split('\n')
            return (len(text) > 1 and text[1] == version_string)
    else:
        return os.path.isfile(os.path.join(path, 'built.log'))

def mark_done(path, version_string=None):
    """Marks the path as done by adding a 'built.log' file with the current
    timestamp plus a version description string if specified.
    """
    with open(os.path.join(path, 'built.log'), 'w') as write:
        write.write(str(datetime.datetime.today()))
        if version_string:
            write.write('\n' + version_string)
""" with open(os.path.join(path, 'built.log'), 'w') as write: write.write(str(datetime.datetime.today())) if version_string: write.write('\n' + version_string) def make_dir(path): """Makes the directory and any nonexistent parent directories.""" os.makedirs(path, exist_ok=True) def remove_dir(path): """Removes the given directory, if it exists.""" shutil.rmtree(path, ignore_errors=True) def vectorize_data(queries, query_mentions, memories, max_query_size=None, max_query_markup_size=None, max_mem_size=None, \ max_ans_bow_size=None, max_ans_type_bow_size=None, max_ans_path_bow_size=None, max_ans_path_size=None, \ max_ans_ctx_entity_bows_size=None, max_ans_ctx_relation_bows_size=1, \ verbose=True, fixed_size=False, vocab2id=None): cand_ans_bows, cand_ans_entities, cand_ans_type_bows, cand_ans_types, cand_ans_path_bows, cand_ans_paths, cand_ans_ctx, cand_ans_topic_key = zip(*memories) cand_ans_size = min(max(map(len, (x for x in cand_ans_entities)), default=0), max_mem_size if max_mem_size else float('inf')) if fixed_size: query_size = max_query_size # query_markup_size = max_query_markup_size cand_ans_bows_size = max_ans_bow_size cand_ans_type_bows_size = max_ans_type_bow_size cand_ans_path_bows_size = max_ans_path_bow_size cand_ans_paths_size = max_ans_path_size else: query_size = max(min(max(map(len, queries), default=0), max_query_size if max_query_size else float('inf')), 1) # query_markup_size = max(min(max(map(len, query_mentions), default=0), max_query_markup_size if max_query_markup_size else float('inf')), 1) cand_ans_bows_size = max(min(max(map(len, (y for x in cand_ans_bows for y in x)), default=0), max_ans_bow_size if max_ans_bow_size else float('inf')), 1) cand_ans_type_bows_size = max(min(max(map(len, (y for x in cand_ans_type_bows for y in x)), default=0), max_ans_type_bow_size if max_ans_type_bow_size else float('inf')), 1) cand_ans_path_bows_size = max(min(max(map(len, (y for x in cand_ans_path_bows for y in x)), default=0), max_ans_path_bow_size if 
max_ans_path_bow_size else float('inf')), 1) cand_ans_paths_size = max(min(max(map(len, (y for x in cand_ans_paths for y in x)), default=0), max_ans_path_size if max_ans_path_size else float('inf')), 1) cand_ans_types_size = max(max(map(len, (y for x in cand_ans_types for y in x)), default=0), 1) cand_ans_ctx_entity_bows_size = max(min(max(map(len, (z for x in cand_ans_ctx for y in x for z in y[0])), default=0), max_ans_ctx_entity_bows_size if max_ans_ctx_entity_bows_size else float('inf')), 1) cand_ans_ctx_relation_bows_size = max(min(max(map(len, (y[1] for x in cand_ans_ctx for y in x)), default=0), max_ans_ctx_relation_bows_size if max_ans_ctx_relation_bows_size else float('inf')), 1) cand_ans_topic_key_ent_type_bows_size = max(max(map(len, (y[0] for x in cand_ans_topic_key for y in x)), default=0), 1) cand_ans_topic_key_ent_types_size = max(max(map(len, (y[1] for x in cand_ans_topic_key for y in x)), default=0), 1) if verbose: print('\nquery_size: {}, cand_ans_size: {}, cand_ans_bows_size: {}, ' 'cand_ans_type_bows_size: {}, cand_ans_types_size: {}, cand_ans_path_bows_size: {}, cand_ans_paths_size: {}, ' 'cand_ans_ctx_entity_bows_size: {}, cand_ans_topic_key_ent_types_size: {}'\ .format(query_size, cand_ans_size, cand_ans_bows_size, cand_ans_type_bows_size, \ cand_ans_types_size, cand_ans_path_bows_size, cand_ans_paths_size, cand_ans_ctx_entity_bows_size, \ cand_ans_topic_key_ent_types_size)) # Question word qw_tokens = ["which", "what", "who", "whose", "whom", "where", "when", "how", "why", "whether"] qw_vids = [vocab2id[each] for each in qw_tokens if each in vocab2id] qw_vid2id = dict(zip(qw_vids, range(len(qw_vids)))) Q = [] QW = [] Q_len = [] for i, q in enumerate(queries): Q_len.append(min(query_size, len(q))) lq = max(0, query_size - len(q)) q_vec = q[-query_size:] + [0] * lq Q.append(q_vec) tmp = [qw_vid2id[each] for each in q if each in qw_vid2id] tmp = tmp[-query_size:] + [0] * max(0, query_size - len(tmp)) QW.append(tmp) cand_ans_bows_vec = [] for x 
in cand_ans_bows: tmp = [] for y in x: l = max(0, cand_ans_bows_size - len(y)) tmp1 = y[:cand_ans_bows_size] + [0] * l tmp.append(tmp1) tmp += [[0] * cand_ans_bows_size] # Add a dummy candidate after the true sequence cand_ans_bows_vec.append(tmp) cand_ans_entities_vec = [] for x in cand_ans_entities: cand_ans_entities_vec.append(x + [0]) # Add a dummy candidate after the true sequence cand_ans_types_vec = [] for x in cand_ans_types: tmp = [] for y in x: l = max(0, cand_ans_types_size - len(y)) tmp1 = y[:cand_ans_types_size] + [0] * l tmp.append(tmp1) tmp += [[0] * cand_ans_types_size] # Add a dummy candidate after the true sequence cand_ans_types_vec.append(tmp) cand_ans_type_bows_vec = [] cand_ans_type_bows_len = [] for x in cand_ans_type_bows: tmp = [] tmp_len = [] for y in x: l = max(0, cand_ans_type_bows_size - len(y)) tmp1 = y[:cand_ans_type_bows_size] + [0] * l tmp.append(tmp1) tmp_len.append(max(min(cand_ans_type_bows_size, len(y)), 1)) tmp += [[0] * cand_ans_type_bows_size] # Add a dummy candidate after the true sequence tmp_len += [1] cand_ans_type_bows_vec.append(tmp) cand_ans_type_bows_len.append(tmp_len) cand_ans_paths_vec = [] for x in cand_ans_paths: tmp = [] for y in x: l = max(0, cand_ans_paths_size - len(y)) tmp1 = y[:cand_ans_paths_size] + [0] * l tmp.append(tmp1) tmp += [[0] * cand_ans_paths_size] # Add a dummy candidate after the true sequence cand_ans_paths_vec.append(tmp) cand_ans_path_bows_vec = [] cand_ans_path_bows_len = [] for x in cand_ans_path_bows: tmp = [] tmp_len = [] for y in x: l = max(0, cand_ans_path_bows_size - len(y)) tmp1 = y[:cand_ans_path_bows_size] + [0] * l tmp.append(tmp1) tmp_len.append(max(min(cand_ans_path_bows_size, len(y)), 1)) tmp += [[0] * cand_ans_path_bows_size] # Add a dummy candidate after the true sequence tmp_len += [1] cand_ans_path_bows_vec.append(tmp) cand_ans_path_bows_len.append(tmp_len) cand_ans_ctx_entity_vec = [] cand_ans_ctx_relation_vec = [] for x in cand_ans_ctx: tmp_ent = [] tmp_rel = [] for y in 
x: tmp_ent.append(y[0]) # y[0] is a list of lists l_rel = max(0, cand_ans_ctx_relation_bows_size - len(y[1])) tmp_rel.append(y[1][:cand_ans_ctx_relation_bows_size] + [0] * l_rel) tmp_ent += [[]] # Add a dummy candidate after the true sequence tmp_rel += [[0] * cand_ans_ctx_relation_bows_size] cand_ans_ctx_entity_vec.append(tmp_ent) cand_ans_ctx_relation_vec.append(tmp_rel) cand_ans_topic_key_ent_type_bows_vec = [] cand_ans_topic_key_ent_type_vec = [] cand_ans_topic_key_ent_type_bows_len = [] for x in cand_ans_topic_key: tmp_ent_type_bows = [] tmp_ent_type = [] tmp_ent_type_bow_len = [] for y in x: tmp_ent_type_bows.append(y[0][:cand_ans_topic_key_ent_type_bows_size] + [0] * max(0, cand_ans_topic_key_ent_type_bows_size - len(y[0]))) tmp_ent_type.append(y[1][:cand_ans_topic_key_ent_types_size] + [0] * max(0, cand_ans_topic_key_ent_types_size - len(y[1]))) tmp_ent_type_bow_len.append(max(min(cand_ans_topic_key_ent_type_bows_size, len(y[0])), 1)) tmp_ent_type_bows += [[0] * cand_ans_topic_key_ent_type_bows_size] # Add a dummy candidate after the true sequence tmp_ent_type += [[0] * cand_ans_topic_key_ent_types_size] tmp_ent_type_bow_len += [1] cand_ans_topic_key_ent_type_bows_vec.append(tmp_ent_type_bows) cand_ans_topic_key_ent_type_vec.append(tmp_ent_type) cand_ans_topic_key_ent_type_bows_len.append(tmp_ent_type_bow_len) return Q, QW, Q_len, list(zip(cand_ans_bows_vec, cand_ans_entities_vec, cand_ans_type_bows_vec, cand_ans_types_vec, cand_ans_type_bows_len, cand_ans_path_bows_vec, cand_ans_paths_vec, cand_ans_path_bows_len, cand_ans_ctx_entity_vec, cand_ans_ctx_relation_vec, cand_ans_topic_key_ent_type_bows_vec, cand_ans_topic_key_ent_type_vec, cand_ans_topic_key_ent_type_bows_len)) def vectorize_ent_data(queries, ent_memories, max_query_size=None, \ max_seed_ent_name_size=None, max_seed_type_name_size=None, \ max_seed_rel_name_size=None, max_seed_rel_size=None, verbose=True): seed_ent_name, seed_ent_type_name, seed_ent_type, seed_rel_names, seed_rels = 
def vectorize_ent_data(queries, ent_memories, max_query_size=None, \
        max_seed_ent_name_size=None, max_seed_type_name_size=None, \
        max_seed_rel_name_size=None, max_seed_rel_size=None, verbose=True):
    """Pad/truncate seed-entity candidate features (from
    ``build_seed_ent_data``) into rectangular id tensors for the EntNet model.

    ``ent_memories`` entries are 5-tuples (name bows, type-name bows, type
    ids, relation-name bows, relation ids), each a list over candidates.
    All widths are derived from the data, clipped by the corresponding
    ``max_*`` argument and floored at 1. Returns ``(Q, Q_len, zipped
    per-example candidate tensors)``.
    """
    seed_ent_name, seed_ent_type_name, seed_ent_type, seed_rel_names, seed_rels = zip(*ent_memories)
    max_query_size = max(min(max(map(len, queries), default=0), max_query_size if max_query_size else float('inf')), 1)
    cand_seed_ent_name_size = max(min(max(map(len, (y for x in seed_ent_name for y in x)), default=0), max_seed_ent_name_size if max_seed_ent_name_size else float('inf')), 1)
    cand_seed_type_name_size = max(min(max(map(len, (y for x in seed_ent_type_name for y in x)), default=0), max_seed_type_name_size if max_seed_type_name_size else float('inf')), 1)
    cand_seed_types_size = max(max(map(len, (y for x in seed_ent_type for y in x)), default=0), 1)
    cand_seed_rel_name_size = max(min(max(map(len, (z for x in seed_rel_names for y in x for z in y)), default=0), max_seed_rel_name_size if max_seed_rel_name_size else float('inf')), 1)
    cand_seed_rel_size = max(min(max(map(len, (y for x in seed_rels for y in x)), default=0), max_seed_rel_size if max_seed_rel_size else float('inf')), 1)
    if verbose:
        print('\nmax_query_size: {}, cand_seed_ent_name_size: {}, cand_seed_type_name_size: {}, '
            'cand_seed_types_size: {}, cand_seed_rel_name_size: {}, cand_seed_rel_size: {}'.format(max_query_size, \
            cand_seed_ent_name_size, cand_seed_type_name_size, cand_seed_types_size, \
            cand_seed_rel_name_size, cand_seed_rel_size))

    # Query vectorization
    Q = []
    Q_len = []
    for q in queries:
        Q_len.append(min(max_query_size, len(q)))
        lq = max(0, max_query_size - len(q))
        # Keep the LAST max_query_size tokens, right-pad with 0 (PAD).
        q_vec = q[-max_query_size:] + [0] * lq
        Q.append(q_vec)
    # Entity vectorization
    cand_seed_ent_name_vec = []
    cand_seed_ent_name_len = []
    for x in seed_ent_name:
        tmp = []
        tmp_len = []
        for y in x:
            l = max(0, cand_seed_ent_name_size - len(y))
            tmp1 = y[:cand_seed_ent_name_size] + [0] * l
            tmp.append(tmp1)
            # Effective (unpadded) length, floored at 1 for empty bows.
            tmp_len.append(max(min(cand_seed_ent_name_size, len(y)), 1))
        cand_seed_ent_name_vec.append(tmp)
        cand_seed_ent_name_len.append(tmp_len)
    cand_seed_type_vec = []
    for x in seed_ent_type:
        tmp = []
        for y in x:
            l = max(0, cand_seed_types_size - len(y))
            tmp1 = y[:cand_seed_types_size] + [0] * l
            tmp.append(tmp1)
        cand_seed_type_vec.append(tmp)
    cand_seed_type_name_vec = []
    cand_seed_type_name_len = []
    for x in seed_ent_type_name:
        tmp = []
        tmp_len = []
        for y in x:
            l = max(0, cand_seed_type_name_size - len(y))
            tmp1 = y[:cand_seed_type_name_size] + [0] * l
            tmp.append(tmp1)
            tmp_len.append(max(min(cand_seed_type_name_size, len(y)), 1))
        cand_seed_type_name_vec.append(tmp)
        cand_seed_type_name_len.append(tmp_len)
    cand_seed_rel_vec = []
    cand_seed_rel_mask = []
    for x in seed_rels: # example
        x_tmp = []
        x_mask = []
        for y in x: # seed entity
            l = max(0, cand_seed_rel_size - len(y))
            y_tmp = y[:cand_seed_rel_size] + [0] * l
            x_tmp.append(y_tmp)
            # Mask carries the number of real (non-padded) relations.
            x_mask.append(min(len(y), cand_seed_rel_size))
        cand_seed_rel_vec.append(x_tmp)
        cand_seed_rel_mask.append(x_mask)
    cand_seed_rel_name_vec = []
    cand_seed_rel_name_len = []
    for x in seed_rel_names: # example
        x_tmp = []
        x_tmp_len = []
        for y in x: # seed entity
            y_tmp = []
            y_tmp_len = []
            for z in y: # relation
                z_l = max(0, cand_seed_rel_name_size - len(z))
                z_tmp = z[:cand_seed_rel_name_size] + [0] * z_l
                y_tmp.append(z_tmp)
                y_tmp_len.append(max(min(cand_seed_rel_name_size, len(z)), 1))
            # Pad the relation axis up to cand_seed_rel_size with all-zero rows.
            y_l = max(0, cand_seed_rel_size - len(y))
            y_tmp += [[0] * cand_seed_rel_name_size] * y_l
            y_tmp_len += [1] * y_l
            x_tmp.append(y_tmp)
            x_tmp_len.append(y_tmp_len)
        cand_seed_rel_name_vec.append(x_tmp)
        cand_seed_rel_name_len.append(x_tmp_len)
    return Q, Q_len, list(zip(cand_seed_ent_name_vec, cand_seed_ent_name_len,
        cand_seed_type_name_vec, cand_seed_type_vec, cand_seed_type_name_len,
        cand_seed_rel_name_vec, cand_seed_rel_vec, cand_seed_rel_name_len,
        cand_seed_rel_mask))
def get_used_fbkeys(data_dir, out_dir):
    # Fetch freebase keys used in training and validation sets.
    fbkeys = set()
    split = ['factoid_webqa/train.json', 'factoid_webqa/valid.json']
    files = [os.path.join(data_dir, x) for x in split]
    for f in files:
        data = load_json(f)
        for qa in data:
            fbkeys.add(qa['freebaseKey'])
    dump_json(list(fbkeys), os.path.join(out_dir, 'fbkeys_train_valid.json'), indent=1)

def get_all_fbkeys(data_dir, out_dir):
    # Fetch all freebase keys possibily useful to answer questions.
    fbkeys = set()
    split = ['factoid_webqa/train.json', 'factoid_webqa/valid.json', 'factoid_webqa/test.json']
    files = [os.path.join(data_dir, x) for x in split]
    for f in files:
        data = load_json(f)
        for qa in data:
            fbkeys.add(qa['freebaseKey'])
    # Also include keys retrieved for test questions, when available.
    retrieved_test_path = os.path.join(data_dir, 'factoid_webqa/webquestions.examples.test.retrieved.json')
    if os.path.exists(retrieved_test_path):
        data = load_json(retrieved_test_path)
        for qa in data:
            if not 'retrievedList' in qa:
                continue
            # retrievedList entries look like 'key:score' pairs.
            for x in qa['retrievedList'].split():
                fbkeys.add(x.split(':')[0])
    dump_json(list(fbkeys), os.path.join(out_dir, 'fbkeys_train_valid_test_retrieved.json'), indent=1)

def main(fb_path, mid2key_path, data_dir, out_dir):
    """Build train/valid/test question files, counting questions whose gold
    answer cannot be reached from the topic entity's KB subgraph.

    NOTE(review): dependency parsing is disabled (HAS_DEP = False); the
    StanfordDependencyParser branch is kept for reproducing the paper's
    dep-path feature.
    """
    HAS_DEP = False
    if HAS_DEP:
        dep_parser = StanfordDependencyParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") # Set CLASSPATH and STANFORD_MODELS environment variables beforehand
    kb = load_ndjson(fb_path, return_type='dict')
    mid2key = load_json(mid2key_path)
    all_split_questions = []
    split = ['factoid_webqa/train.json', 'factoid_webqa/valid.json', 'factoid_webqa/test.json']
    files = [os.path.join(data_dir, x) for x in split]
    missing_mid2key = []
    for f in files:
        data_type = os.path.basename(f).split('.')[0]
        num_unanswerable = 0
        all_questions = []
        data = load_json(f)
        for q in data:
            questions = {}
            questions['answers'] = q['answers']
            questions['entities'] = q['entities']
            questions['qText'] = q['qText']
            questions['qId'] = q['qId']
            questions['freebaseKey'] = q['freebaseKey']
            questions['freebaseKeyCands'] = [q['freebaseKey']]
            # Extend candidates with keys mapped from the question's mids.
            for x in q['freebaseMids']:
                if x['mid'] in mid2key:
                    fbkey = mid2key[x['mid']]
                    if fbkey != q['freebaseKey']:
                        questions['freebaseKeyCands'].append(fbkey)
                else:
                    missing_mid2key.append(x['mid'])
            qtext = tokenize(q['qText'])
            if HAS_DEP:
                qw = list(set(qtext).intersection(question_word_list))
                question_word = qw[0] if len(qw) > 0 else ''
                topic_ent = q['freebaseKey']
                dep_path = extract_dep_feature(dep_parser, ' '.join(qtext), topic_ent, question_word)
            else:
                dep_path = []
            questions['dep_path'] = dep_path
            all_questions.append(questions)
            if not q['freebaseKey'] in kb:
                num_unanswerable += 1
                continue
            cand_ans = fetch_ans_cands(kb[q['freebaseKey']])
            norm_cand_ans = set([normalize_answer(x) for x in cand_ans])
            norm_gold_ans = [normalize_answer(x) for x in q['answers']]
            # Check if we can find the gold answer from the candidiate answers.
            if len(norm_cand_ans.intersection(norm_gold_ans)) == 0:
                num_unanswerable += 1
                continue
        all_split_questions.append(all_questions)
        print('{} set: Num of unanswerable questions: {}'.format(data_type, num_unanswerable))
    for i, each in enumerate(all_split_questions):
        dump_ndjson(each, os.path.join(out_dir, split[i].split('/')[-1]))

def fetch_ans_cands(graph):
    """Collect all reachable answer strings within two hops of a topic-entity
    subgraph (leaf values, booleans, floats, and 1st/2nd-hop entity names).
    """
    cand_ans = set() # candidiate answers
    # We only consider the alias relations of topic entityies
    cand_ans.update(graph['alias'])
    for k, v in graph['neighbors'].items():
        if if_filterout(k):
            continue
        for nbr in v:
            if isinstance(nbr, str):
                cand_ans.add(nbr)
                continue
            elif isinstance(nbr, bool):
                cand_ans.add('true' if nbr else 'false')
                continue
            elif isinstance(nbr, float):
                cand_ans.add(str(nbr))
                continue
            elif isinstance(nbr, dict):
                nbr_k = list(nbr.keys())[0]
                nbr_v = nbr[nbr_k]
                # Prefer the entity's names; fall back to the first alias.
                selected_names = nbr_v['name'] if 'name' in nbr_v and len(nbr_v['name']) > 0 else (nbr_v['alias'][:1] if 'alias' in nbr_v else [])
                cand_ans.add(selected_names[0] if len(selected_names) > 0 else 'UNK')
                if not 'neighbors' in nbr_v:
                    continue
                for kk, vv in nbr_v['neighbors'].items(): # 2nd hop
                    if if_filterout(kk):
                        continue
                    for nbr_nbr in vv:
                        if isinstance(nbr_nbr, str):
                            cand_ans.add(nbr_nbr)
                            continue
                        elif isinstance(nbr_nbr, bool):
                            cand_ans.add('true' if nbr_nbr else 'false')
                            continue
                        elif isinstance(nbr_nbr, float):
                            cand_ans.add(str(nbr_nbr))
                            continue
                        elif isinstance(nbr_nbr, dict):
                            nbr_nbr_k = list(nbr_nbr.keys())[0]
                            nbr_nbr_v = nbr_nbr[nbr_nbr_k]
                            selected_names = nbr_nbr_v['name'] if 'name' in nbr_nbr_v and len(nbr_nbr_v['name']) > 0 else (nbr_nbr_v['alias'][:1] if 'alias' in nbr_nbr_v else [])
                            cand_ans.add(selected_names[0] if len(selected_names) > 0 else 'UNK')
                        else:
                            raise RuntimeError('Unknown type: %s' % type(nbr_nbr))
            else:
                raise RuntimeError('Unknown type: %s' % type(nbr))
    return list(cand_ans)

# Vocabulary: reserved id 0 is PAD, id 1 is UNK across all symbol tables.
RESERVED_TOKENS = {'PAD': 0, 'UNK': 1}
RESERVED_ENTS = {'PAD': 0, 'UNK': 1}
RESERVED_ENT_TYPES = {'PAD': 0, 'UNK': 1}
RESERVED_RELS = {'PAD': 0, 'UNK': 1}
# Extra surface tokens injected into the vocabulary (boolean/numeric answers
# and delexicalized mention-type placeholders).
extra_vocab_tokens = ['alias', 'true', 'false', 'num', 'bool'] + \
        ['np', 'organization', 'date', 'number', 'misc', 'ordinal', 'duration', 'person', 'time', 'location'] + \
        ['__np__', '__organization__', '__date__', '__number__', '__misc__', '__ordinal__', '__duration__', '__person__', '__time__', '__location__']
extra_rels = ['alias']
extra_ent_types = ['num', 'bool']

# BAMnet entity mention types
topic_mention_types = {'person', 'organization', 'location', 'misc'}
# delex_mention_types = {'date', 'time', 'ordinal', 'number'}
delex_mention_types = {'date', 'ordinal', 'number'}
constraint_mention_types = delex_mention_types
def if_filterout(s):
    """Return True for noisy Freebase relations excluded from candidates."""
    if s.endswith('has_sentences') or \
        s.endswith('exceptions') or s.endswith('sww_base/source') or \
        s.endswith('kwtopic/assessment'):
        return True
    else:
        return False

def query_kb(kb, ent_name, fuzz_threshold=90):
    """Fuzzy-match ``ent_name`` against every KB entry's names/aliases.

    Returns the matching KB keys sorted by descending match score
    (token-sort ratio must be strictly above ``fuzz_threshold``), or [].
    """
    results = []
    for k, v in kb.items():
        ret = process.extractOne(ent_name, v['name'] + v['alias'], scorer=fuzz.token_sort_ratio)
        if ret[1] > fuzz_threshold:
            results.append((k, ret[0], ret[1]))
    results = sorted(results, key=lambda d:d[-1], reverse=True)
    return list(zip(*results))[0] if len(results) > 0 else []

question_word_list = 'who, when, what, where, how, which, why, whom, whose'.split(', ')
stop_words = set(stopwords.words("english"))

def find_parent(x, tree, conn='<-'):
    """Walk up a dependency-triple tree from token ``x`` to the root.

    Returns the path as a flat list of connector/relation/parent tokens;
    empty list when ``x`` does not appear as a child in ``tree``.
    """
    root = tree[0][0]
    path = []
    for parent, indicator, child in tree:
        if x == child[0]:
            path.extend([conn, '__{}__'.format(indicator), '-', parent[0]])
            if not parent == root:
                p = find_parent(parent[0], tree, conn)
                path.extend(p)
            return path
    return path

def extract_dep_feature(dep_parser, text, topic_ent, question_word):
    """Build the question-word-to-topic-entity dependency path feature.

    Parses ``text``, finds the shortest root path from any topic-entity token
    (fuzzy-matched with score >= 85), and concatenates it (reversed) onto the
    question word's root path.
    """
    dep = dep_parser.raw_parse(text).__next__()
    tree = list(dep.triples())
    topic_ent = list(set(tokenize(topic_ent)) - stop_words)
    text = text.split()
    path_len = 1e5
    topic_ent_to_root = []
    for each in topic_ent:
        ret = process.extractOne(each, text, scorer=fuzz.token_sort_ratio)
        if ret[1] < 85:
            continue
        tmp = find_parent(ret[0], tree, '->')
        if len(tmp) > 0 and len(tmp) < path_len:
            topic_ent_to_root = tmp
            path_len = len(tmp)
    question_word_to_root = find_parent(question_word, tree)
    # if len(question_word_to_root) == 0 or len(topic_ent_to_root) == 0:
    #     import pdb;pdb.set_trace()
    return question_word_to_root + list(reversed(topic_ent_to_root[:-1]))

def unique(seq):
    """Order-preserving de-duplication."""
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

re_art = re.compile(r'\b(a|an|the)\b')
re_punc = re.compile(r'[%s]' % re.escape(string.punctuation))
def normalize_answer(s):
    """Lower text and remove extra whitespace."""
    def remove_articles(text):
        return re_art.sub(' ', text)

    def remove_punc(text):
        return re_punc.sub(' ', text)  # convert punctuation to spaces

    def white_space_fix(text):
        return ' '.join(text.split())

    def lower(text):
        return text.lower()
    return white_space_fix(remove_articles(remove_punc(lower(s))))

def dump_embeddings(vocab_dict, emb_file, out_path, emb_size=300, binary=False, seed=123):
    """Write an embedding matrix for ``vocab_dict`` to ``out_path``.

    Rows are seeded random uniform in [-0.08, 0.08]; rows for words found in
    ``emb_file`` are overwritten with the pretrained vectors; row 0 (PAD) is
    zeroed. Returns the matrix.
    """
    vocab_emb = get_embeddings(emb_file, vocab_dict, binary)
    vocab_size = len(vocab_dict)
    np.random.seed(seed)
    embeddings = np.random.uniform(-0.08, 0.08, (vocab_size, emb_size))
    for w, idx in vocab_dict.items():
        if w in vocab_emb:
            embeddings[int(idx)] = vocab_emb[w]
    embeddings[0] = 0
    dump_ndarray(embeddings, out_path)
    return embeddings

def get_embeddings(emb_file, vocab, binary=False):
    """Look up each vocab word in a pretrained word2vec file; report hit rate."""
    pt = PreTrainEmbedding(emb_file, binary)
    vocab_embs = {}
    i = 0.
    for each in vocab:
        emb = pt.get_embeddings(each)
        if not emb is None:
            vocab_embs[each] = emb
            i += 1
    print('get_wordemb hit ratio: %s' % (i / len(vocab)))
    return vocab_embs

class PreTrainEmbedding():
    """Thin wrapper over a gensim KeyedVectors model with casing fallbacks."""
    def __init__(self, file, binary=False):
        import gensim
        self.model = gensim.models.KeyedVectors.load_word2vec_format(file, binary=binary)

    def get_embeddings(self, word):
        # Try several casings before giving up.
        word_list = [word, word.upper(), word.lower(), word.title(), string.capwords(word, '_')]
        for w in word_list:
            try:
                return self.model[w]
            except KeyError:
                # print('Can not get embedding for ', w)
                continue
        return None
def calc_f1(gold_list, pred_list):
    """Return a tuple with recall, precision, and f1 for one example"""
    # Assume all questions have at least one answer
    if len(gold_list) == 0:
        raise RuntimeError('Gold list may not be empty')
    # If we return an empty list recall is zero and precision is one
    if len(pred_list) == 0:
        return (0, 1, 0)
    # It is guaranteed now that both lists are not empty

    # Normalize answers
    gold_list = [normalize_answer(s) for s in gold_list]
    pred_list = [normalize_answer(s) for s in pred_list]

    precision = 0
    for entity in pred_list:
        if entity in gold_list:
            precision += 1
    precision = float(precision) / len(pred_list)

    recall = 0
    for entity in gold_list:
        if entity in pred_list:
            recall += 1
    recall = float(recall) / len(gold_list)

    f1 = 0
    if precision + recall > 0:
        f1 = 2 * recall * precision / (precision + recall)
    return (recall, precision, f1)

def calc_avg_f1(gold_list, pred_list, verbose=True):
    """Go over all examples and compute recall, precision and F1.

    Also dumps per-example scores to 'error_analysis.txt'. The file is now
    written via a context manager so the handle is closed even if calc_f1
    raises (previously it leaked on error).
    """
    avg_recall = 0
    avg_precision = 0
    avg_f1 = 0
    count = 0
    assert len(gold_list) == len(pred_list)
    with open('error_analysis.txt', 'w') as out_f:
        for i, gold in enumerate(gold_list):
            recall, precision, f1 = calc_f1(gold, pred_list[i])
            avg_recall += recall
            avg_precision += precision
            avg_f1 += f1
            count += 1
            # if f1 < 0.6:  # original threshold idea, kept for reference
            out_f.write('{}\t{}\t{}\t{}\n'.format(i, gold, pred_list[i], f1))
    avg_recall = float(avg_recall) / count
    avg_precision = float(avg_precision) / count
    avg_f1 = float(avg_f1) / count
    avg_new_f1 = 0
    if avg_precision + avg_recall > 0:
        avg_new_f1 = 2 * avg_recall * avg_precision / (avg_precision + avg_recall)
    if verbose:
        print("Number of questions: " + str(count))
        print("Average recall over questions: " + str(avg_recall))
        print("Average precision over questions: " + str(avg_precision))
        print("Average f1 over questions: " + str(avg_f1))
        # print("F1 of average recall and average precision: " + str(avg_new_f1))
    return count, avg_recall, avg_precision, avg_f1

# The I/O helpers below previously wrapped their bodies in
# `try: ... except Exception as e: raise e`, which is a no-op that only
# resets the traceback origin; the useless handlers have been removed.

def read_lines(path_to_file):
    """Read whitespace-separated floats from a text file, one row per line."""
    with open(path_to_file, 'r') as f:
        return [[float(x) for x in line.strip().split()] for line in f]

def dump_ndarray(data, path_to_file):
    """Save an array to ``path_to_file`` in NumPy .npy format."""
    with open(path_to_file, 'wb') as f:
        np.save(f, data)

def load_ndarray(path_to_file):
    """Load an array previously saved with ``dump_ndarray``."""
    with open(path_to_file, 'rb') as f:
        return np.load(f)

def dump_ndjson(data, file):
    """Write an iterable of JSON-serializable items, one JSON doc per line."""
    with open(file, 'w') as f:
        for each in data:
            f.write(json.dumps(each) + '\n')

def load_ndjson(file, return_type='array'):
    """Load newline-delimited JSON as a list ('array') or merged dict ('dict')."""
    if return_type == 'array':
        return load_ndjson_to_array(file)
    elif return_type == 'dict':
        return load_ndjson_to_dict(file)
    else:
        raise RuntimeError('Unknown return_type: %s' % return_type)

def load_ndjson_to_array(file):
    """Load newline-delimited JSON into a list, one element per line."""
    with open(file, 'r') as f:
        return [json.loads(line.strip()) for line in f]

def load_ndjson_to_dict(file):
    """Merge newline-delimited JSON objects into a single dict."""
    data = {}
    with open(file, 'r') as f:
        for line in f:
            data.update(json.loads(line.strip()))
    return data

def dump_json(data, file, indent=None):
    """Serialize ``data`` to ``file`` as JSON."""
    with open(file, 'w') as f:
        json.dump(data, f, indent=indent)

def load_json(file):
    """Load and return the JSON document stored in ``file``."""
    with open(file, 'r') as f:
        return json.load(f)
def dump_dict_ndjson(data, file):
    """Write a dict as newline-delimited JSON, one ``[key, value]`` pair per
    line. (Previously wrapped in a useless `except Exception as e: raise e`.)
    """
    with open(file, 'w') as f:
        for k, v in data.items():
            f.write(json.dumps([k, v]) + '\n')

def load_gzip_json(file):
    """Load a JSON document from a gzip-compressed file."""
    with gzip.open(file, 'r') as f:
        return json.load(f)

def get_all_files(dir, recursive=False):
    """List non-hidden file paths under ``dir``, optionally recursively."""
    if recursive:
        return [os.path.join(root, file) for root, dirnames, filenames in os.walk(dir)
                for file in filenames
                if os.path.isfile(os.path.join(root, file)) and not file.startswith('.')]
    else:
        return [os.path.join(dir, filename) for filename in os.listdir(dir)
                if os.path.isfile(os.path.join(dir, filename)) and not filename.startswith('.')]

# Print iterations progress
def printProgressBar(iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█'):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    # '\r' keeps rewriting the same terminal line.
    print('\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix), end = '\r')
max_seed_rel_size=entnet_opt['max_seed_rel_size']) ent_model = EntnetAgent(entnet_opt) acc = ent_model.evaluate([memories, queries, query_lengths], ent_inds, batch_size=entnet_opt['test_batch_size']) print('acc: {}'.format(acc)) pred_seed_ents = ent_model.predict([memories, queries, query_lengths], ent_labels, batch_size=entnet_opt['test_batch_size']) # BAMnet # Ensure data is built build(bamnet_opt['data_dir']) entity2id = load_json(os.path.join(bamnet_opt['data_dir'], 'entity2id.json')) entityType2id = load_json(os.path.join(bamnet_opt['data_dir'], 'entityType2id.json')) relation2id = load_json(os.path.join(bamnet_opt['data_dir'], 'relation2id.json')) vocab2id = load_json(os.path.join(bamnet_opt['data_dir'], 'vocab2id.json')) ctx_stopwords = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', 
"hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"} # Build data in real time freebase = load_ndjson(os.path.join(cfg['raw_data_dir'], 'freebase_full.json'), return_type='dict') test_data = load_ndjson(os.path.join(cfg['raw_data_dir'], 'raw_test.json')) data_vec = build_data(test_data, freebase, entity2id, entityType2id, relation2id, vocab2id, pred_seed_ents=pred_seed_ents) queries, raw_queries, query_mentions, memories, cand_labels, _, gold_ans_labels = data_vec queries, query_words, query_lengths, memories_vec = vectorize_data(queries, query_mentions, memories, \ max_query_size=bamnet_opt['query_size'], \ max_query_markup_size=bamnet_opt['query_markup_size'], \ max_ans_bow_size=bamnet_opt['ans_bow_size'], \ vocab2id=vocab2id) model = BAMnetAgent(bamnet_opt, ctx_stopwords, vocab2id) pred = model.predict([memories_vec, queries, query_words, raw_queries, query_mentions, query_lengths], cand_labels, batch_size=bamnet_opt['test_batch_size'], margin=2) print('\nPredictions') for margin in bamnet_opt['test_margin']: print('\nMargin: {}'.format(margin)) predictions = dynamic_pred(pred, margin) calc_avg_f1(gold_ans_labels, predictions) print('Runtime: %ss' % (timeit.default_timer() - start)) ================================================ FILE: src/run_freebase.py ================================================ ''' Created on Oct, 2017 @author: hugo ''' import argparse import os import json from core.build_data.freebase import * from core.utils.utils import * parser = argparse.ArgumentParser() parser.add_argument('-data_dir', '--data_dir', required=True, type=str, help='path to the data dir') parser.add_argument('-fbkeys', '--freebase_keys', required=True, type=str, help='path to the freebase key file') parser.add_argument('-out_dir', '--out_dir', type=str, required=True, help='path to the 
# ===== src/run_freebase.py =====
'''
Created on Oct, 2017

@author: hugo
'''
# Collects each key's cached 2-hop Freebase neighborhood (gzip-JSON files in
# data_dir) into a single newline-delimited freebase.json; missing or
# unreadable keys are recorded in missing_fbids.json.
import argparse
import os
import json

from core.build_data.freebase import *
from core.utils.utils import *

parser = argparse.ArgumentParser()
parser.add_argument('-data_dir', '--data_dir', required=True, type=str, help='path to the data dir')
parser.add_argument('-fbkeys', '--freebase_keys', required=True, type=str, help='path to the freebase key file')
parser.add_argument('-out_dir', '--out_dir', type=str, required=True, help='path to the output dir')
args = parser.parse_args()

ids = load_json(args.freebase_keys)
total = len(ids)
print('Fetching {} entities and their 2-hop neighbors.'.format(total))
print_bar_len = 50
# BUGFIX: the original computed `int(total / print_bar_len)` inside the loop
# and crashed with a modulo-by-zero whenever total < print_bar_len; hoist the
# loop-invariant step and clamp it to at least 1.
progress_step = max(1, total // print_bar_len)
cnt = 0
missing_ids = set()
with open(os.path.join(args.out_dir, 'freebase.json'), 'a') as out_f:
    for id_ in ids:
        try:
            data = load_gzip_json(os.path.join(args.data_dir, '{}.json.gz'.format(id_)))
        except Exception:
            # Was a bare `except:`; narrowed so Ctrl-C / SystemExit still work.
            # Best-effort: a missing/corrupt cache file is recorded, not fatal.
            missing_ids.add(id_)
            continue
        graph = fetch(data, args.data_dir)
        # Re-key the fetched single-entry graph by the requested id, keeping
        # the original key inside the record under 'id'.
        graph2 = {id_: list(graph.values())[0]}
        graph2[id_]['id'] = list(graph.keys())[0]
        out_f.write(json.dumps(graph2) + '\n')
        cnt += 1
        if cnt % progress_step == 0:
            printProgressBar(cnt, total, prefix='Progress:', suffix='Complete', length=print_bar_len)
printProgressBar(cnt, total, prefix='Progress:', suffix='Complete', length=print_bar_len)
print('Missed %s mids' % len(missing_ids))
dump_json(list(missing_ids), os.path.join(args.out_dir, 'missing_fbids.json'))


# ===== src/run_webquestions.py =====
'''
Created on Oct, 2017

@author: hugo
'''
# Thin CLI wrapper around core.build_data.webquestions.main.
import argparse

from core.build_data.webquestions import *

parser = argparse.ArgumentParser()
parser.add_argument('-fb', '--freebase_path', required=True, type=str, help='path to the freebase data')
parser.add_argument('-mid2key', '--mid2key_path', required=True, type=str, help='path to the freebase data')
parser.add_argument('-data_dir', '--data_dir', required=True, type=str, help='path to the data dir')
parser.add_argument('-out_dir', '--out_dir', type=str, required=True, help='path to the output dir')
args = parser.parse_args()

main(args.freebase_path, args.mid2key_path, args.data_dir, args.out_dir)
# get_used_fbkeys(args.data_dir, args.out_dir)
# get_all_fbkeys(args.data_dir, args.out_dir)
# ===== src/test.py =====
# Evaluate a trained BAMnet model on pre-vectorized test data and report
# average F1 at each configured score margin.
import timeit
import argparse

from core.bamnet.bamnet import BAMnetAgent
from core.build_data.build_all import build
from core.build_data.utils import vectorize_data
from core.utils.utils import *
from core.utils.generic_utils import unique
from core.utils.metrics import *


def dynamic_pred(pred, margin):
    """For each example, keep every candidate whose score is within `margin`
    of that example's top score (pred[i][0][1]), deduplicated in order."""
    predictions = []
    for i in range(len(pred)):
        predictions.append(unique([x[0] for x in pred[i] if x[1] + margin >= pred[i][0][1]]))
    return predictions


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', '--config', required=True, type=str, help='path to the config file')
    cfg = vars(parser.parse_args())
    opt = get_config(cfg['config'])

    # Ensure data is built
    build(opt['data_dir'])
    data_vec = load_json(os.path.join(opt['data_dir'], opt['test_data']))
    vocab2id = load_json(os.path.join(opt['data_dir'], 'vocab2id.json'))
    # Inlined English stop-word set (avoids an nltk data download); consumed
    # by BAMnetAgent.
    ctx_stopwords = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
        'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
        'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
        'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
        'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
        'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
        'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
        "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
        'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
        "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
        'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
        "won't", 'wouldn', "wouldn't"}

    queries, raw_queries, query_mentions, memories, cand_labels, _, gold_ans_labels = data_vec
    queries, query_words, query_lengths, memories_vec = vectorize_data(queries, query_mentions, memories, \
        max_query_size=opt['query_size'], \
        max_query_markup_size=opt['query_markup_size'], \
        max_ans_bow_size=opt['ans_bow_size'], \
        vocab2id=vocab2id)

    start = timeit.default_timer()
    model = BAMnetAgent(opt, ctx_stopwords, vocab2id)
    pred = model.predict([memories_vec, queries, query_words, raw_queries, query_mentions, query_lengths], cand_labels, batch_size=opt['test_batch_size'], margin=2)

    print('\nPredictions')
    for margin in opt['test_margin']:
        print('\nMargin: {}'.format(margin))
        predictions = dynamic_pred(pred, margin)
        calc_avg_f1(gold_ans_labels, predictions)
    print('Runtime: %ss' % (timeit.default_timer() - start))
    # BUGFIX: removed leftover debugger breakpoint (`import pdb;pdb.set_trace()`)
    # that halted the script after printing results.


# ===== src/test_entnet.py =====
# Evaluate a trained topic-entity network (EntnetAgent) on the test split.
import timeit
import argparse
import numpy as np

from core.bamnet.entnet import EntnetAgent
from core.build_data.build_all import build
from core.build_data.utils import vectorize_ent_data
from core.utils.utils import *

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-dt', '--datatype', default='test', type=str, help='data type: {train, valid, test}')
    parser.add_argument('-config', '--config', required=True, type=str, help='path to the config file')
    cfg = vars(parser.parse_args())
    opt = get_config(cfg['config'])

    # Ensure data is built
    build(opt['data_dir'])
    data_vec = load_json(os.path.join(opt['data_dir'], opt['test_data']))
    queries, memories, ent_labels, ent_inds = data_vec
    queries, query_lengths, memories = vectorize_ent_data(queries, \
        memories, max_query_size=opt['query_size'], \
        max_seed_ent_name_size=opt['max_seed_ent_name_size'], \
        max_seed_type_name_size=opt['max_seed_type_name_size'], \
        max_seed_rel_name_size=opt['max_seed_rel_name_size'], \
        max_seed_rel_size=opt['max_seed_rel_size'])

    start = timeit.default_timer()
    ent_model = EntnetAgent(opt)
    acc = ent_model.evaluate([memories, queries, query_lengths], ent_inds, batch_size=opt['test_batch_size'])
    print('acc: {}'.format(acc))
    print('Runtime: %ss' % (timeit.default_timer() - start))
# src/train.py — train BAMnet: vectorize the train/valid splits, then hand
# them to BAMnetAgent.train, which also receives validation labels for
# model selection.
import timeit
import argparse
import numpy as np

from core.bamnet.bamnet import BAMnetAgent
from core.build_data.build_all import build
from core.build_data.utils import vectorize_data
from core.utils.utils import *

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', '--config', required=True, type=str, help='path to the config file')
    cfg = vars(parser.parse_args())
    opt = get_config(cfg['config'])
    print_config(opt)

    # Ensure data is built
    build(opt['data_dir'])
    train_vec = load_json(os.path.join(opt['data_dir'], opt['train_data']))
    valid_vec = load_json(os.path.join(opt['data_dir'], opt['valid_data']))
    vocab2id = load_json(os.path.join(opt['data_dir'], 'vocab2id.json'))
    # Inlined English stop-word set (avoids an nltk data download).
    # NOTE(review): passed to BAMnetAgent; name suggests these words are
    # ignored when matching question context — confirm in core.bamnet.bamnet.
    ctx_stopwords = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
        'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
        'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
        'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',
        'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
        'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
        'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
        'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
        'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
        'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
        'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
        'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
        'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
        't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
        'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn',
        "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't",
        'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
        "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
        'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won',
        "won't", 'wouldn', "wouldn't"}

    # Training split: candidate labels are not needed (only gold answer
    # indices), hence the two `_` slots in the unpack.
    train_queries, train_raw_queries, train_query_mentions, train_memories, _, train_gold_ans_inds, _ = train_vec
    train_queries, train_query_words, train_query_lengths, train_memories = vectorize_data(train_queries, train_query_mentions, \
        train_memories, max_query_size=opt['query_size'], \
        max_query_markup_size=opt['query_markup_size'], \
        max_mem_size=opt['mem_size'], \
        max_ans_bow_size=opt['ans_bow_size'], \
        max_ans_path_bow_size=opt['ans_path_bow_size'], \
        vocab2id=vocab2id)

    # Validation split keeps candidate and gold answer labels for evaluation
    # during training.
    valid_queries, valid_raw_queries, valid_query_mentions, valid_memories, valid_cand_labels, valid_gold_ans_inds, valid_gold_ans_labels = valid_vec
    valid_queries, valid_query_words, valid_query_lengths, valid_memories = vectorize_data(valid_queries, valid_query_mentions, \
        valid_memories, max_query_size=opt['query_size'], \
        max_query_markup_size=opt['query_markup_size'], \
        max_mem_size=opt['mem_size'], \
        max_ans_bow_size=opt['ans_bow_size'], \
        max_ans_path_bow_size=opt['ans_path_bow_size'], \
        vocab2id=vocab2id)

    start = timeit.default_timer()
    model = BAMnetAgent(opt, ctx_stopwords, vocab2id)
    model.train([train_memories, train_queries, train_query_words, train_raw_queries, train_query_mentions, train_query_lengths], train_gold_ans_inds, \
        [valid_memories, valid_queries, valid_query_words, valid_raw_queries, valid_query_mentions, valid_query_lengths], \
        valid_gold_ans_inds, valid_cand_labels, valid_gold_ans_labels)
    print('Runtime: %ss' % (timeit.default_timer() - start))
# src/train_entnet.py — train the topic-entity network (EntnetAgent) on the
# pre-built train/valid splits.
import timeit
import argparse
import numpy as np

from core.bamnet.entnet import EntnetAgent
from core.build_data.build_all import build
from core.build_data.utils import vectorize_ent_data
from core.utils.utils import *


def _vectorize_split(split_vec, opt):
    """Unpack one data split and vectorize it.

    Returns ([memories, queries, query_lengths], ent_inds), the exact input
    layout EntnetAgent.train expects for each split.
    """
    raw_queries, raw_memories, _, ent_inds = split_vec
    queries, query_lengths, memories = vectorize_ent_data(raw_queries, \
        raw_memories, max_query_size=opt['query_size'], \
        max_seed_ent_name_size=opt['max_seed_ent_name_size'], \
        max_seed_type_name_size=opt['max_seed_type_name_size'], \
        max_seed_rel_name_size=opt['max_seed_rel_name_size'], \
        max_seed_rel_size=opt['max_seed_rel_size'])
    return [memories, queries, query_lengths], ent_inds


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-config', '--config', required=True, type=str, help='path to the config file')
    cfg = vars(parser.parse_args())
    opt = get_config(cfg['config'])
    print_config(opt)

    # Ensure data is built
    build(opt['data_dir'])
    train_vec = load_json(os.path.join(opt['data_dir'], opt['train_data']))
    valid_vec = load_json(os.path.join(opt['data_dir'], opt['valid_data']))

    # Both splits go through the identical vectorization pipeline.
    train_inputs, train_ent_inds = _vectorize_split(train_vec, opt)
    valid_inputs, valid_ent_inds = _vectorize_split(valid_vec, opt)

    start = timeit.default_timer()
    ent_model = EntnetAgent(opt)
    ent_model.train(train_inputs, train_ent_inds, valid_inputs, valid_ent_inds)
    print('Runtime: %ss' % (timeit.default_timer() - start))