Repository: zhengyima/kg-baseline-pytorch Branch: master Commit: af159d5bf1f7 Files: 8 Total size: 53.5 KB Directory structure: gitextract_fqwo80l_/ ├── README.md ├── all_50_schemas ├── dev_data.json ├── main.py ├── model.py ├── models_real/ │ └── README.md ├── train_data.json └── trans.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: README.md ================================================ # kg-baseline-pytorch 2019百度的关系抽取比赛,Pytorch版苏神的baseline,联合关系抽取。 ## 模型 与苏神的模型相同,只不过开发框架由Keras+Tensorflow变成了Pytorch,给使用Pytorch的小伙伴分享。 苏神Keras版链接:https://github.com/bojone/kg-2019-baseline 代码中复用了许多苏神的代码,因此首先对苏神表示感谢! 以下为苏神模型介绍原文: ``` 用BiLSTM做联合标注,先预测subject,然后根据subject同时预测object和predicate,标注结构是“半指针-半标注”结构,以前也曾介绍过( https://kexue.fm/archives/5409 ) 标注结构是自己设计的,我看了很多关系抽取的论文,没有发现类似的做法。所以,如果你基于此模型做出后的修改,最终获奖了或者发表paper什么的,烦请注明一下(其实也不是太奢望) @misc{ jianlin2019bdkg, title={Hybrid Structure of Pointer and Tagging for Relation Extraction: A Baseline}, author={Jianlin Su}, year={2019}, publisher={GitHub}, howpublished={\url{https://github.com/bojone/kg-2019-baseline}}, } ``` CSDN上基于本代码的算法简介:https://blog.csdn.net/qq_35268841/article/details/107063066 ## 用法 `python trans.py`转换数据,`python main.py`跑模型并观察结果。 代码需要GPU运行!若需要CPU运行则去掉代码中所有的`.cuda()`并将一些cuda上的数据类型改为普通数据类型即可。 ## 数据 数据只提供了共30条示例数据。数据由比赛官方提供,如有需要请联系比赛主办方。 ## 结果 5个epoch到达0.73,最高能到0.75。 ## 环境 Python 3.5+ Pytorch 1.0.1 tqdm ## 链接 - https://github.com/bojone/kg-2019-baseline - https://pytorch.org/ ================================================ FILE: all_50_schemas ================================================ {"object_type": "地点", "predicate": "祖籍", "subject_type": "人物"} {"object_type": "人物", "predicate": "父亲", "subject_type": "人物"} {"object_type": "地点", "predicate": "总部地点", "subject_type": "企业"} {"object_type": "地点", "predicate": "出生地", "subject_type": "人物"} {"object_type": "目", "predicate": "目", "subject_type": "生物"} 
{"object_type": "Number", "predicate": "面积", "subject_type": "行政区"} {"object_type": "Text", "predicate": "简称", "subject_type": "机构"} {"object_type": "Date", "predicate": "上映时间", "subject_type": "影视作品"} {"object_type": "人物", "predicate": "妻子", "subject_type": "人物"} {"object_type": "音乐专辑", "predicate": "所属专辑", "subject_type": "歌曲"} {"object_type": "Number", "predicate": "注册资本", "subject_type": "企业"} {"object_type": "城市", "predicate": "首都", "subject_type": "国家"} {"object_type": "人物", "predicate": "导演", "subject_type": "影视作品"} {"object_type": "Text", "predicate": "字", "subject_type": "历史人物"} {"object_type": "Number", "predicate": "身高", "subject_type": "人物"} {"object_type": "企业", "predicate": "出品公司", "subject_type": "影视作品"} {"object_type": "Number", "predicate": "修业年限", "subject_type": "学科专业"} {"object_type": "Date", "predicate": "出生日期", "subject_type": "人物"} {"object_type": "人物", "predicate": "制片人", "subject_type": "影视作品"} {"object_type": "人物", "predicate": "母亲", "subject_type": "人物"} {"object_type": "人物", "predicate": "编剧", "subject_type": "影视作品"} {"object_type": "国家", "predicate": "国籍", "subject_type": "人物"} {"object_type": "Number", "predicate": "海拔", "subject_type": "地点"} {"object_type": "网站", "predicate": "连载网站", "subject_type": "网络小说"} {"object_type": "人物", "predicate": "丈夫", "subject_type": "人物"} {"object_type": "Text", "predicate": "朝代", "subject_type": "历史人物"} {"object_type": "Text", "predicate": "民族", "subject_type": "人物"} {"object_type": "Text", "predicate": "号", "subject_type": "历史人物"} {"object_type": "出版社", "predicate": "出版社", "subject_type": "书籍"} {"object_type": "人物", "predicate": "主持人", "subject_type": "电视综艺"} {"object_type": "Text", "predicate": "专业代码", "subject_type": "学科专业"} {"object_type": "人物", "predicate": "歌手", "subject_type": "歌曲"} {"object_type": "人物", "predicate": "作词", "subject_type": "歌曲"} {"object_type": "人物", "predicate": "主角", "subject_type": "网络小说"} {"object_type": "人物", "predicate": "董事长", "subject_type": "企业"} {"object_type": "Date", 
"predicate": "成立日期", "subject_type": "机构"} {"object_type": "学校", "predicate": "毕业院校", "subject_type": "人物"} {"object_type": "Number", "predicate": "占地面积", "subject_type": "机构"} {"object_type": "语言", "predicate": "官方语言", "subject_type": "国家"} {"object_type": "Text", "predicate": "邮政编码", "subject_type": "行政区"} {"object_type": "Number", "predicate": "人口数量", "subject_type": "行政区"} {"object_type": "城市", "predicate": "所在城市", "subject_type": "景点"} {"object_type": "人物", "predicate": "作者", "subject_type": "图书作品"} {"object_type": "Date", "predicate": "成立日期", "subject_type": "企业"} {"object_type": "人物", "predicate": "作曲", "subject_type": "歌曲"} {"object_type": "气候", "predicate": "气候", "subject_type": "行政区"} {"object_type": "人物", "predicate": "嘉宾", "subject_type": "电视综艺"} {"object_type": "人物", "predicate": "主演", "subject_type": "影视作品"} {"object_type": "作品", "predicate": "改编自", "subject_type": "影视作品"} {"object_type": "人物", "predicate": "创始人", "subject_type": "企业"} ================================================ FILE: dev_data.json ================================================ {"postag": [{"word": "《", "pos": "w"}, {"word": "课本上学不到的生物学2", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "2013年", "pos": "t"}, {"word": "上海科技教育出版社", "pos": "nt"}, {"word": "出版", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "图书", "pos": "n"}], "text": "《课本上学不到的生物学2》是2013年上海科技教育出版社出版的图书", "spo_list": [{"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍", "object": "上海科技教育出版社", "subject": "《课本上学不到的生物学2》"}]} {"postag": [{"word": "南京京九思新能源有限公司", "pos": "nt"}, {"word": "于", "pos": "p"}, {"word": "2015年", "pos": "t"}, {"word": "05月15日", "pos": "t"}, {"word": "在", "pos": "p"}, {"word": "南京市江宁区市场监督管理局", "pos": "nt"}, {"word": "登记", "pos": "v"}, {"word": "成立", "pos": "v"}], "text": "南京京九思新能源有限公司于2015年05月15日在南京市江宁区市场监督管理局登记成立", "spo_list": [{"predicate": "成立日期", "object_type": "Date", "subject_type": "机构", "object": "2015年05月15日", "subject": "南京京九思新能源有限公司"}]} 
{"postag": [{"word": "世界", "pos": "n"}, {"word": "百科", "pos": "n"}, {"word": "大全", "pos": "n"}, {"word": "总编", "pos": "n"}, {"word": "彭友", "pos": "n"}, {"word": "定义", "pos": "v"}, {"word": "本", "pos": "r"}, {"word": "词条", "pos": "n"}, {"word": "为", "pos": "p"}, {"word": " ", "pos": "w"}, {"word": "人物", "pos": "n"}, {"word": "总", "pos": "a"}, {"word": "类", "pos": "n"}, {"word": " ", "pos": "w"}, {"word": "董事长", "pos": "n"}, {"word": "分类", "pos": "vn"}, {"word": "概述", "pos": "vn"}, {"word": " ", "pos": "w"}, {"word": "1", "pos": "m"}, {"word": "朱明宏", "pos": "nr"}, {"word": "的", "pos": "u"}, {"word": "基本", "pos": "a"}, {"word": "情况", "pos": "n"}, {"word": "男", "pos": "a"}, {"word": " ", "pos": "w"}, {"word": "汉族", "pos": "nz"}, {"word": " ", "pos": "w"}, {"word": "1968年", "pos": "t"}, {"word": "6月", "pos": "t"}, {"word": "生", "pos": "v"}, {"word": " ", "pos": "w"}, {"word": "浙江", "pos": "ns"}, {"word": "义乌", "pos": "ns"}, {"word": "人", "pos": "n"}, {"word": "11", "pos": "m"}, {"word": "现任", "pos": "v"}, {"word": " ", "pos": "w"}, {"word": "金华市发展和改革委员会", "pos": "nt"}, {"word": "副主任", "pos": "n"}, {"word": "1", "pos": "m"}, {"word": "拟", "pos": "v"}, {"word": "任", "pos": "v"}, {"word": " ", "pos": "w"}, {"word": "金华市现代服务业投资发展有限公司", "pos": "nt"}, {"word": "董事长", "pos": "n"}], "text": "世界百科大全总编彭友定义本词条为 人物总类 董事长分类概述 1朱明宏的基本情况男 汉族 1968年6月生 浙江义乌人11现任 金华市发展和改革委员会副主任1拟任 金华市现代服务业投资发展有限公司董事长", "spo_list": [{"predicate": "民族", "object_type": "Text", "subject_type": "人物", "object": "汉族", "subject": "朱明宏"}, {"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "浙江义乌", "subject": "朱明宏"}, {"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1968年6月", "subject": "朱明宏"}]} {"postag": [{"word": "田承冉 男", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "1952年", "pos": "t"}, {"word": "生", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "汉族", "pos": "nz"}, {"word": ",", "pos": "w"}, {"word": "山东", "pos": "ns"}, {"word": "桓台", "pos": "ns"}, 
{"word": "人", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "共", "pos": "d"}, {"word": "党员", "pos": "n"}], "text": "田承冉 男,1952年生,汉族,山东桓台人,共党员", "spo_list": [{"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "山东桓台", "subject": "田承冉"}, {"predicate": "民族", "object_type": "Text", "subject_type": "人物", "object": "汉族", "subject": "田承冉"}, {"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1952年", "subject": "田承冉"}]} {"postag": [{"word": "《", "pos": "w"}, {"word": "深夜烘焙坊", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "同名", "pos": "vn"}, {"word": "小说", "pos": "n"}, {"word": "改编", "pos": "v"}, {"word": "电视剧", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "由", "pos": "p"}, {"word": "泷泽秀明", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "桐山照史", "pos": "nr"}, {"word": "及", "pos": "c"}, {"word": "土屋太凤", "pos": "nr"}, {"word": "主演", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "于", "pos": "p"}, {"word": "2013年4月28日", "pos": "t"}, {"word": "至", "pos": "p"}, {"word": "6月16日", "pos": "t"}, {"word": "于", "pos": "p"}, {"word": "NHK BS Premium", "pos": "nz"}, {"word": "频道", "pos": "n"}, {"word": "播出", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "同年", "pos": "t"}, {"word": "11月5日", "pos": "t"}, {"word": "至", "pos": "v"}, {"word": "12月6日", "pos": "t"}, {"word": "NHK综合台", "pos": "nz"}, {"word": "播出", "pos": "v"}], "text": "《深夜烘焙坊》是同名小说改编电视剧,由泷泽秀明、桐山照史及土屋太凤主演,于2013年4月28日至6月16日于NHK BS Premium频道播出,同年11月5日至12月6日NHK综合台播出", "spo_list": [{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "泷泽秀明", "subject": "《深夜烘焙坊》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "桐山照史", "subject": "《深夜烘焙坊》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "土屋太凤", "subject": "《深夜烘焙坊》"}]} {"postag": [{"word": "《", "pos": "w"}, {"word": "星空黑夜传奇", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "连载", "pos": 
"v"}, {"word": "于", "pos": "p"}, {"word": "起点中文网", "pos": "nz"}, {"word": "的", "pos": "u"}, {"word": "网络", "pos": "n"}, {"word": "小说", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "作者", "pos": "n"}, {"word": "是", "pos": "v"}, {"word": "啤酒", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "罪孽", "pos": "n"}], "text": "《星空黑夜传奇》是连载于起点中文网的网络小说,作者是啤酒的罪孽", "spo_list": [{"predicate": "连载网站", "object_type": "网站", "subject_type": "网络小说", "object": "起点中文网", "subject": "《星空黑夜传奇》"}, {"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "啤酒的罪孽", "subject": "《星空黑夜传奇》"}]} {"postag": [{"word": "Chanda Mushili", "pos": "nz"}, {"word": ",", "pos": "w"}, {"word": "赞比亚", "pos": "ns"}, {"word": "籍", "pos": "n"}, {"word": "运动员", "pos": "n"}], "text": "Chanda Mushili,赞比亚籍运动员", "spo_list": [{"predicate": "国籍", "object_type": "国家", "subject_type": "人物", "object": "赞比亚", "subject": "Chanda Mushili"}]} {"postag": [{"word": "陈奕迅", "pos": "nr"}, {"word": "2011", "pos": "m"}, {"word": "新专辑", "pos": "n"}, {"word": "《", "pos": "w"}, {"word": "?", "pos": "w"}, {"word": "》", "pos": "w"}, {"word": "第二", "pos": "m"}, {"word": "粤语", "pos": "nz"}, {"word": "主打", "pos": "v"}, {"word": "《", "pos": "w"}, {"word": "神奇化妆师", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "。", "pos": "w"}, {"word": "粤语", "pos": "nz"}, {"word": "版", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "《", "pos": "w"}, {"word": "神奇化妆师", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "(", "pos": "w"}, {"word": "国语版", "pos": "n"}, {"word": "《", "pos": "w"}, {"word": "看穿", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": ")", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "由", "pos": "p"}, {"word": "蓝又时", "pos": "nr"}, {"word": "作曲", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "林夕", "pos": "nr"}, {"word": "填词", "pos": "v"}, {"word": "、", "pos": "w"}, {"word": "梁荣骏", "pos": "nr"}, {"word": "监制", "pos": "v"}, {"word": "、", "pos": "w"}, {"word": "Gary", "pos": "nr"}, {"word": "Tong", "pos": "n"}, 
{"word": "编曲", "pos": "vn"}, {"word": ",", "pos": "w"}, {"word": "谱", "pos": "n"}, {"word": "写出", "pos": "v"}, {"word": "一首", "pos": "m"}, {"word": "玩味", "pos": "v"}, {"word": "又", "pos": "d"}, {"word": "跳跃", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "旋律", "pos": "n"}], "text": "陈奕迅2011新专辑《?》第二粤语主打《神奇化妆师》。粤语版的《神奇化妆师》(国语版《看穿》),由蓝又时作曲,林夕填词、梁荣骏监制、GaryTong编曲,谱写出一首玩味又跳跃的旋律", "spo_list": [{"predicate": "所属专辑", "object_type": "音乐专辑", "subject_type": "歌曲", "object": "《?》", "subject": "《神奇化妆师》"}, {"predicate": "歌手", "object_type": "人物", "subject_type": "歌曲", "object": "陈奕迅", "subject": "《神奇化妆师》"}]} {"postag": [{"word": "莫迪博", "pos": "nr"}, {"word": "·", "pos": "w"}, {"word": "迪亚基特", "pos": "nr"}, {"word": "1987年3月2日", "pos": "t"}, {"word": "出生", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "法国", "pos": "ns"}, {"word": "的", "pos": "u"}, {"word": "赖恩堡", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "身高", "pos": "n"}, {"word": "192cm", "pos": "m"}, {"word": ",", "pos": "w"}, {"word": "司职", "pos": "v"}, {"word": "后卫", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "双脚", "pos": "n"}, {"word": "技术", "pos": "n"}, {"word": "均衡", "pos": "a"}, {"word": ",", "pos": "w"}, {"word": "身披", "pos": "v"}, {"word": "21号", "pos": "m"}, {"word": "战袍", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "曾", "pos": "d"}, {"word": "效力", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "佩斯卡拉", "pos": "nt"}, {"word": ",", "pos": "w"}, {"word": "现", "pos": "t"}, {"word": "效力", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "拉齐奥队", "pos": "nt"}], "text": "莫迪博·迪亚基特1987年3月2日出生于法国的赖恩堡,身高192cm,司职后卫,双脚技术均衡,身披21号战袍,曾效力于佩斯卡拉,现效力于拉齐奥队", "spo_list": [{"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1987年3月2日", "subject": "莫迪博·迪亚基特"}, {"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "赖恩堡", "subject": "莫迪博·迪亚基特"}, {"predicate": "身高", "object_type": "Number", "subject_type": "人物", "object": "192cm", "subject": "莫迪博·迪亚基特"}]} 
================================================ FILE: main.py ================================================ #! -*- coding:utf-8 -*- import json import numpy as np from random import choice from tqdm import tqdm import model import torch from torch.autograd import Variable #import data_prepare import os import torch.utils.data as Data import torch.nn.functional as F import time torch.backends.cudnn.benchmark = True CHAR_SIZE = 128 SENT_LENGTH = 4 HIDDEN_SIZE = 64 EPOCH_NUM = 100 BATCH_SIZE = 64 def get_now_time(): a = time.time() return time.ctime(a) def seq_padding(X): L = [len(x) for x in X] ML = max(L) #print("ML",ML) return [x + [0] * (ML - len(x)) for x in X] def seq_padding_vec(X): L = [len(x) for x in X] ML = max(L) #print("ML",ML) return [x + [[1,0]] * (ML - len(x)) for x in X] train_data = json.load(open('./train_data_me.json')) dev_data = json.load(open('./dev_data_me.json')) id2predicate, predicate2id = json.load(open('./all_50_schemas_me.json')) id2predicate = {int(i):j for i,j in id2predicate.items()} id2char, char2id = json.load(open('./all_chars_me.json')) num_classes = len(id2predicate) class data_generator: def __init__(self, data, batch_size=64): self.data = data self.batch_size = batch_size self.steps = len(self.data) // self.batch_size if len(self.data) % self.batch_size != 0: self.steps += 1 def __len__(self): return self.steps def pro_res(self): idxs = list(range(len(self.data))) #print(idxs) np.random.shuffle(idxs) T, S1, S2, K1, K2, O1, O2, = [], [], [], [], [], [], [] for i in idxs: d = self.data[i] text = d['text'] items = {} for sp in d['spo_list']: subjectid = text.find(sp[0]) objectid = text.find(sp[2]) if subjectid != -1 and objectid != -1: key = (subjectid, subjectid+len(sp[0])) if key not in items: items[key] = [] items[key].append((objectid,objectid+len(sp[2]),predicate2id[sp[1]])) if items: T.append([char2id.get(c, 1) for c in text]) # 1是unk,0是padding # s1, s2 = [[1,0]] * len(text), [[1,0]] * len(text) s1, s2 = [0] * len(text), 
[0] * len(text) for j in items: # s1[j[0]] = [0,1] # s2[j[1]-1] = [0,1] s1[j[0]] = 1 s2[j[1]-1] = 1 #print(items.keys()) k1, k2 = choice(list(items.keys())) o1, o2 = [0] * len(text), [0] * len(text) # 0是unk类(共49+1个类) for j in items[(k1, k2)]: o1[j[0]] = j[2] o2[j[1]-1] = j[2] S1.append(s1) S2.append(s2) K1.append([k1]) K2.append([k2-1]) O1.append(o1) O2.append(o2) T = np.array(seq_padding(T)) S1 = np.array(seq_padding(S1)) S2 = np.array(seq_padding(S2)) O1 = np.array(seq_padding(O1)) O2 = np.array(seq_padding(O2)) K1, K2 = np.array(K1), np.array(K2) return [T, S1, S2, K1, K2, O1, O2] class myDataset(Data.Dataset): """ 下载数据、初始化数据,都可以在这里完成 """ def __init__(self,_T,_S1,_S2,_K1,_K2,_O1,_O2): #xy = np.loadtxt('../dataSet/diabetes.csv.gz', delimiter=',', dtype=np.float32) # 使用numpy读取数据 self.x_data = _T self.y1_data = _S1 self.y2_data = _S2 self.k1_data = _K1 self.k2_data = _K2 self.o1_data = _O1 self.o2_data = _O2 self.len = len(self.x_data) def __getitem__(self, index): return self.x_data[index], self.y1_data[index],self.y2_data[index],self.k1_data[index],self.k2_data[index],self.o1_data[index],self.o2_data[index] def __len__(self): return self.len def collate_fn(data): t = np.array([item[0] for item in data], np.int32) s1 = np.array([item[1] for item in data], np.int32) s2 = np.array([item[2] for item in data], np.int32) k1 = np.array([item[3] for item in data], np.int32) k2 = np.array([item[4] for item in data], np.int32) o1 = np.array([item[5] for item in data], np.int32) o2 = np.array([item[6] for item in data], np.int32) return { 'T': torch.LongTensor(t), # targets_i 'S1': torch.FloatTensor(s1), 'S2': torch.FloatTensor(s2), 'K1': torch.LongTensor(k1), 'K2': torch.LongTensor(k2), 'O1': torch.LongTensor(o1), 'O2': torch.LongTensor(o2), } dg = data_generator(train_data) T, S1, S2, K1, K2, O1, O2 = dg.pro_res() # print("len",len(T)) torch_dataset = myDataset(T,S1,S2,K1,K2,O1,O2) loader = Data.DataLoader( dataset=torch_dataset, # torch TensorDataset format 
batch_size=BATCH_SIZE, # mini batch size shuffle=True, # random shuffle for training num_workers=8, collate_fn=collate_fn, # subprocesses for loading data ) # print("len",len(id2char)) s_m = model.s_model(len(char2id)+2,CHAR_SIZE,HIDDEN_SIZE).cuda() po_m = model.po_model(len(char2id)+2,CHAR_SIZE,HIDDEN_SIZE,49).cuda() params = list(s_m.parameters()) params += list(po_m.parameters()) optimizer = torch.optim.Adam(params, lr=0.001) loss = torch.nn.CrossEntropyLoss().cuda() b_loss = torch.nn.BCEWithLogitsLoss().cuda() def extract_items(text_in): R = [] _s = [char2id.get(c, 1) for c in text_in] _s = np.array([_s]) _k1, _k2,t , t_max,mask = s_m(torch.LongTensor(_s).cuda()) _k1, _k2 = _k1[0, :, 0], _k2[0, :, 0] _kk1s = [] for i,_kk1 in enumerate(_k1): if _kk1 > 0.5: _subject = '' for j,_kk2 in enumerate(_k2[i:]): if _kk2 > 0.5: _subject = text_in[i: i+j+1] break if _subject: _k1, _k2 = torch.LongTensor([[i]]), torch.LongTensor([[i+j]]) #np.array([i]), np.array([i+j]) _o1, _o2 = po_m(t.cuda(),t_max.cuda(),_k1.cuda(),_k2.cuda()) _o1, _o2 = _o1.cpu().data.numpy(), _o2.cpu().data.numpy() _o1, _o2 = np.argmax(_o1[0], 1), np.argmax(_o2[0], 1) for i,_oo1 in enumerate(_o1): if _oo1 > 0: for j,_oo2 in enumerate(_o2[i:]): if _oo2 == _oo1: _object = text_in[i: i+j+1] _predicate = id2predicate[_oo1] # print((_subject, _predicate, _object)) R.append((_subject, _predicate, _object)) break _kk1s.append(_kk1.data.cpu().numpy()) _kk1s = np.array(_kk1s) return list(set(R)) def evaluate(): A, B, C = 1e-10, 1e-10, 1e-10 cnt = 0 for d in tqdm(iter(dev_data)): R = set(extract_items(d['text'])) T = set([tuple(i) for i in d['spo_list']]) A += len(R & T) B += len(R) C += len(T) # if cnt % 1000 == 0: # print('iter: %d f1: %.4f, precision: %.4f, recall: %.4f\n' % (cnt, 2 * A / (B + C), A / B, A / C)) cnt += 1 return 2 * A / (B + C), A / B, A / C best_f1 = 0 best_epoch = 0 for i in range(EPOCH_NUM): for step, loader_res in tqdm(iter(enumerate(loader))): # print(get_now_time()) t_s = 
loader_res["T"].cuda() k1 = loader_res["K1"].cuda() k2 = loader_res["K2"].cuda() s1 = loader_res["S1"].cuda() s2 = loader_res["S2"].cuda() o1 = loader_res["O1"].cuda() o2 = loader_res["O2"].cuda() ps_1,ps_2,t,t_max,mask = s_m(t_s) t,t_max,k1,k2 = t.cuda(),t_max.cuda(),k1.cuda(),k2.cuda() po_1,po_2 = po_m(t,t_max,k1,k2) ps_1 = ps_1.cuda() ps_2 = ps_2.cuda() po_1 = po_1.cuda() po_2 = po_2.cuda() s1 = torch.unsqueeze(s1,2) s2 = torch.unsqueeze(s2,2) s1_loss = b_loss(ps_1,s1) s1_loss = torch.sum(s1_loss.mul(mask))/torch.sum(mask) s2_loss = b_loss(ps_2,s2) s2_loss = torch.sum(s2_loss.mul(mask))/torch.sum(mask) po_1 = po_1.permute(0,2,1) po_2 = po_2.permute(0,2,1) o1_loss = loss(po_1,o1) o1_loss = torch.sum(o1_loss.mul(mask[:,:,0])) / torch.sum(mask) o2_loss = loss(po_2,o2) o2_loss = torch.sum(o2_loss.mul(mask[:,:,0])) / torch.sum(mask) loss_sum = 2.5 * (s1_loss + s2_loss) + (o1_loss + o2_loss) # if step % 500 == 0: # torch.save(s_m, 'models_real/s_'+str(step)+"epoch_"+str(i)+'.pkl') # torch.save(po_m, 'models_real/po_'+str(step)+"epoch_"+str(i)+'.pkl') optimizer.zero_grad() loss_sum.backward() optimizer.step() torch.save(s_m, 'models_real/s_'+str(i)+'.pkl') torch.save(po_m, 'models_real/po_'+str(i)+'.pkl') f1, precision, recall = evaluate() print("epoch:",i,"loss:",loss_sum.data) if f1 >= best_f1: best_f1 = f1 best_epoch = i print('f1: %.4f, precision: %.4f, recall: %.4f, bestf1: %.4f, bestepoch: %d \n ' % (f1, precision, recall, best_f1, best_epoch)) ================================================ FILE: model.py ================================================ import torch from torch import nn import numpy as np #import matplotlib.pyplot as plt from torch.autograd import Variable def seq_max_pool(x): """seq是[None, seq_len, s_size]的格式, mask是[None, seq_len, 1]的格式,先除去mask部分, 然后再做maxpooling。 """ seq, mask = x seq = seq - (1 - mask) * 1e10 return torch.max(seq, 1) def seq_and_vec(x): """seq是[None, seq_len, s_size]的格式, vec是[None, v_size]的格式,将vec重复seq_len次,拼到seq上, 得到[None, 
seq_len, s_size+v_size]的向量。 """ seq , vec = x vec = torch.unsqueeze(vec,1) vec = torch.zeros_like(seq[:, :, :1]) + vec return torch.cat([seq, vec], 2) def seq_gather(x): """seq是[None, seq_len, s_size]的格式, idxs是[None, 1]的格式,在seq的第i个序列中选出第idxs[i]个向量, 最终输出[None, s_size]的向量。 """ seq, idxs = x batch_idxs = torch.arange(0,seq.size(0)).cuda() batch_idxs = torch.unsqueeze(batch_idxs,1) idxs = torch.cat([batch_idxs, idxs], 1) res = [] for i in range(idxs.size(0)): vec = seq[idxs[i][0],idxs[i][1],:] res.append(torch.unsqueeze(vec,0)) res = torch.cat(res) return res class s_model(nn.Module): def __init__(self,word_dict_length,word_emb_size,lstm_hidden_size): super(s_model,self).__init__() self.embeds = nn.Embedding(word_dict_length, word_emb_size).cuda() self.fc1_dropout = nn.Sequential( nn.Dropout(0.25).cuda(), # drop 20% of the neuron ).cuda() self.lstm1 = nn.LSTM( input_size = word_emb_size, hidden_size = int(word_emb_size/2), num_layers = 1, batch_first = True, bidirectional = True ).cuda() self.lstm2 = nn.LSTM( input_size = word_emb_size, hidden_size = int(word_emb_size/2), num_layers = 1, batch_first = True, bidirectional = True ).cuda() self.conv1 = nn.Sequential( nn.Conv1d( in_channels=word_emb_size*2, #输入的深度 out_channels=word_emb_size,#filter 的个数,输出的高度 kernel_size = 3,#filter的长与宽 stride=1,#每隔多少步跳一下 padding=1,#周围围上一圈 if stride= 1, pading=(kernel_size-1)/2 ).cuda(), nn.ReLU().cuda(), ).cuda() self.fc_ps1 = nn.Sequential( nn.Linear(word_emb_size,1), ).cuda() self.fc_ps2 = nn.Sequential( nn.Linear(word_emb_size,1), ).cuda() def forward(self,t): mask = torch.gt(torch.unsqueeze(t,2),0).type(torch.cuda.FloatTensor) #(batch_size,sent_len,1) mask.requires_grad = False outs = self.embeds(t) t = outs t = self.fc1_dropout(t) t = t.mul(mask) # (batch_size,sent_len,char_size) t, (h_n, c_n) = self.lstm1(t,None) t, (h_n, c_n) = self.lstm2(t,None) t_max,t_max_index = seq_max_pool([t,mask]) t_dim = list(t.size())[-1] h = seq_and_vec([t, t_max]) h = h.permute(0,2,1) h = self.conv1(h) h 
= h.permute(0,2,1) ps1 = self.fc_ps1(h) ps2 = self.fc_ps2(h) return [ps1.cuda(),ps2.cuda(),t.cuda(),t_max.cuda(),mask.cuda()] class po_model(nn.Module): def __init__(self,word_dict_length,word_emb_size,lstm_hidden_size,num_classes): super(po_model,self).__init__() self.conv1 = nn.Sequential( nn.Conv1d( in_channels=word_emb_size*4, #输入的深度 out_channels=word_emb_size,#filter 的个数,输出的高度 kernel_size = 3,#filter的长与宽 stride=1,#每隔多少步跳一下 padding=1,#周围围上一圈 if stride= 1, pading=(kernel_size-1)/2 ).cuda(), nn.ReLU().cuda(), ).cuda() self.fc_ps1 = nn.Sequential( nn.Linear(word_emb_size,num_classes+1).cuda(), # nn.Softmax(), ).cuda() self.fc_ps2 = nn.Sequential( nn.Linear(word_emb_size,num_classes+1).cuda(), # nn.Softmax(), ).cuda() def forward(self,t,t_max,k1,k2): k1 = seq_gather([t,k1]) k2 = seq_gather([t,k2]) k = torch.cat([k1,k2],1) h = seq_and_vec([t,t_max]) h = seq_and_vec([h,k]) h = h.permute(0,2,1) h = self.conv1(h) h = h.permute(0,2,1) po1 = self.fc_ps1(h) po2 = self.fc_ps2(h) return [po1.cuda(),po2.cuda()] ================================================ FILE: models_real/README.md ================================================ # 模型目录 这个目录用来存代码保存的模型 ================================================ FILE: train_data.json ================================================ {"postag": [{"word": "《", "pos": "w"}, {"word": "冰山上的来客", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "戴冰", "pos": "nr"}, {"word": "执导", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "军事", "pos": "n"}, {"word": "悬疑", "pos": "n"}, {"word": "谍战片", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "由", "pos": "p"}, {"word": "王洛勇", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "于荣光", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "努尔比亚", "pos": "nr"}, {"word": "等", "pos": "u"}, {"word": "主演", "pos": "v"}], "text": "《冰山上的来客》是戴冰执导的军事悬疑谍战片,由王洛勇、于荣光、努尔比亚等主演", "spo_list": [{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "努尔比亚", "subject": 
"《冰山上的来客》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "王洛勇", "subject": "《冰山上的来客》"}, {"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "戴冰", "subject": "《冰山上的来客》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "于荣光", "subject": "《冰山上的来客》"}]} {"postag": [{"word": "影片", "pos": "n"}, {"word": "中", "pos": "f"}, {"word": "刘诗诗", "pos": "nr"}, {"word": "饰演", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "米楠", "pos": "nr"}, {"word": "是", "pos": "v"}, {"word": "痕迹", "pos": "n"}, {"word": "学", "pos": "v"}, {"word": "专家", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "她", "pos": "r"}, {"word": "留", "pos": "v"}, {"word": "着", "pos": "u"}, {"word": "短发", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "行事", "pos": "vn"}, {"word": "干练", "pos": "a"}, {"word": ",", "pos": "w"}, {"word": "该片", "pos": "r"}, {"word": "改编", "pos": "v"}, {"word": "自", "pos": "p"}, {"word": "雷米", "pos": "nr"}, {"word": "系列", "pos": "n"}, {"word": "犯罪", "pos": "vn"}, {"word": "小说", "pos": "n"}, {"word": "《", "pos": "w"}, {"word": "心理罪", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "最后", "pos": "a"}, {"word": "一部", "pos": "m"}, {"word": "《", "pos": "w"}, {"word": "城市之光", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "讲述", "pos": "v"}, {"word": "了", "pos": "u"}, {"word": "神探", "pos": "n"}, {"word": "方木", "pos": "nr"}, {"word": "抓捕", "pos": "v"}, {"word": "高智商", "pos": "n"}, {"word": "变态", "pos": "n"}, {"word": "杀人犯", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "故事", "pos": "n"}], "text": "影片中刘诗诗饰演的米楠是痕迹学专家,她留着短发,行事干练,该片改编自雷米系列犯罪小说《心理罪》最后一部《城市之光》,讲述了神探方木抓捕高智商变态杀人犯的故事", "spo_list": [{"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "雷米", "subject": "《心理罪》"}]} {"postag": [{"word": "布丹", "pos": "nr"}, {"word": "出生于", "pos": "v"}, {"word": "1824年", "pos": "t"}, {"word": "的", "pos": "u"}, {"word": "法国", "pos": "ns"}, {"word": "画家", "pos": "n"}], "text": 
"布丹出生于1824年的法国画家", "spo_list": [{"predicate": "国籍", "object_type": "国家", "subject_type": "人物", "object": "法国", "subject": "布丹"}, {"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1824年", "subject": "布丹"}]} {"postag": [{"word": "《", "pos": "w"}, {"word": "森林报-秋", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "2007年", "pos": "t"}, {"word": "二十一世纪出版社", "pos": "nt"}, {"word": "出版", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "图书", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "作者", "pos": "n"}, {"word": "是", "pos": "v"}, {"word": "(", "pos": "w"}, {"word": "苏联", "pos": "ns"}, {"word": ")", "pos": "w"}, {"word": "维", "pos": "nr"}, {"word": "·", "pos": "w"}, {"word": "比安基", "pos": "nr"}], "text": "《森林报-秋》是2007年二十一世纪出版社出版的图书,作者是(苏联)维·比安基", "spo_list": [{"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "维·比安基", "subject": "《森林报-秋》"}, {"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍", "object": "二十一世纪出版社", "subject": "《森林报-秋》"}]} {"postag": [{"word": "伴随", "pos": "v"}, {"word": "着", "pos": "u"}, {"word": "春节", "pos": "nz"}, {"word": "的", "pos": "u"}, {"word": "结束", "pos": "vn"}, {"word": ",", "pos": "w"}, {"word": "又", "pos": "d"}, {"word": "有", "pos": "v"}, {"word": "一批", "pos": "m"}, {"word": "新剧", "pos": "n"}, {"word": "播出", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "包括", "pos": "v"}, {"word": "张一山", "pos": "nr"}, {"word": "的", "pos": "u"}, {"word": "《", "pos": "w"}, {"word": "我的父亲我的兵", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "左小青", "pos": "nr"}, {"word": "的", "pos": "u"}, {"word": "《", "pos": "w"}, {"word": "台湾往事", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "还有", "pos": "v"}, {"word": "《", "pos": "w"}, {"word": "风光大嫁", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "等", "pos": "u"}], "text": "伴随着春节的结束,又有一批新剧播出,包括张一山的《我的父亲我的兵》,左小青的《台湾往事》,还有《风光大嫁》等", "spo_list": [{"predicate": "主演", 
"object_type": "人物", "subject_type": "影视作品", "object": "张一山", "subject": "《我的父亲我的兵》"}]} {"postag": [{"word": "《", "pos": "w"}, {"word": "心理罪", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "由", "pos": "p"}, {"word": "凤凰", "pos": "nz"}, {"word": "联动", "pos": "v"}, {"word": "影业", "pos": "n"}, {"word": "和", "pos": "c"}, {"word": "爱奇艺", "pos": "nt"}, {"word": "联合", "pos": "vd"}, {"word": "出品", "pos": "v"}, {"word": "、", "pos": "w"}, {"word": "根据", "pos": "p"}, {"word": "作家", "pos": "n"}, {"word": "雷米", "pos": "nr"}, {"word": "所", "pos": "u"}, {"word": "著", "pos": "u"}, {"word": "的", "pos": "u"}, {"word": "同名", "pos": "vn"}, {"word": "系列", "pos": "n"}, {"word": "小说", "pos": "n"}, {"word": "改编", "pos": "v"}, {"word": "而成", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "犯罪", "pos": "vn"}, {"word": "悬疑", "pos": "n"}, {"word": "网络", "pos": "n"}, {"word": "剧", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "由", "pos": "p"}, {"word": "五百", "pos": "m"}, {"word": "执导", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "顾小白", "pos": "nr"}, {"word": "编剧", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "陈若轩", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "付枚", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "王泷正", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "温心", "pos": "nr"}, {"word": "等", "pos": "u"}, {"word": "联袂", "pos": "d"}, {"word": "主演", "pos": "v"}], "text": "《心理罪》是由凤凰联动影业和爱奇艺联合出品、根据作家雷米所著的同名系列小说改编而成的犯罪悬疑网络剧,由五百执导,顾小白编剧,陈若轩、付枚、王泷正、温心等联袂主演", "spo_list": [{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "陈若轩", "subject": "《心理罪》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "王泷正", "subject": "《心理罪》"}, {"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "雷米", "subject": "《心理罪》"}, {"predicate": "编剧", "object_type": "人物", "subject_type": "影视作品", "object": "顾小白", "subject": "《心理罪》"}, {"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", 
"object": "五百", "subject": "《心理罪》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "付枚", "subject": "《心理罪》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "温心", "subject": "《心理罪》"}]} {"postag": [{"word": "李治", "pos": "nr"}, {"word": "不", "pos": "d"}, {"word": "喜欢", "pos": "v"}, {"word": "李忠", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "他", "pos": "r"}, {"word": "爱", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "是", "pos": "v"}, {"word": "萧淑妃", "pos": "nr"}, {"word": "所", "pos": "u"}, {"word": "生", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "雍王", "pos": "nr"}, {"word": "李素节", "pos": "nr"}], "text": "李治不喜欢李忠,他爱的是萧淑妃所生的雍王李素节", "spo_list": [{"predicate": "母亲", "object_type": "人物", "subject_type": "人物", "object": "萧淑妃", "subject": "李素节"}, {"predicate": "丈夫", "object_type": "人物", "subject_type": "人物", "object": "李治", "subject": "萧淑妃"}, {"predicate": "妻子", "object_type": "人物", "subject_type": "人物", "object": "萧淑妃", "subject": "李治"}]} {"postag": [{"word": "基本", "pos": "a"}, {"word": "信息", "pos": "n"}, {"word": " ", "pos": "v"}, {"word": "片名", "pos": "n"}, {"word": ":", "pos": "w"}, {"word": "咪咪", "pos": "n"}, {"word": "找", "pos": "v"}, {"word": "妈妈", "pos": "n"}, {"word": " ", "pos": "v"}, {"word": "编剧", "pos": "n"}, {"word": ":", "pos": "w"}, {"word": "贺梦凡", "pos": "nr"}, {"word": " ", "pos": "v"}, {"word": "导演", "pos": "n"}, {"word": ":", "pos": "w"}, {"word": "贺梦凡", "pos": "nr"}, {"word": " ", "pos": "v"}, {"word": "题材", "pos": "n"}, {"word": ":", "pos": "w"}, {"word": "童话", "pos": "n"}, {"word": " ", "pos": "v"}, {"word": "集数", "pos": "n"}, {"word": ":", "pos": "w"}, {"word": "52 ", "pos": "m"}, {"word": "每", "pos": "r"}, {"word": "集", "pos": "q"}, {"word": "长度", "pos": "n"}, {"word": ":", "pos": "w"}, {"word": "15分钟", "pos": "m"}, {"word": " ", "pos": "v"}, {"word": "计划", "pos": "n"}, {"word": "投拍", "pos": "vn"}, {"word": "时间", "pos": "n"}, {"word": ":", "pos": "w"}, {"word": "2011", "pos": 
"m"}], "text": "基本信息 片名:咪咪找妈妈 编剧:贺梦凡 导演:贺梦凡 题材:童话 集数:52 每集长度:15分钟 计划投拍时间:2011", "spo_list": [{"predicate": "编剧", "object_type": "人物", "subject_type": "影视作品", "object": "贺梦凡", "subject": "咪咪找妈妈"}, {"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "贺梦凡", "subject": "咪咪找妈妈"}]} {"postag": [{"word": "《", "pos": "w"}, {"word": "健行天下:带上一本健康的书去出行", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "一", "pos": "m"}, {"word": "书", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "出版社", "pos": "n"}, {"word": "是", "pos": "v"}, {"word": "人民军医出版社", "pos": "nt"}, {"word": ",", "pos": "w"}, {"word": "作者", "pos": "n"}, {"word": "是", "pos": "v"}, {"word": "秦惠基", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "出版", "pos": "vn"}, {"word": "时间", "pos": "n"}, {"word": "是", "pos": "v"}, {"word": " ", "pos": "w"}, {"word": "2006年4月1日", "pos": "t"}], "text": "《健行天下:带上一本健康的书去出行》一书的出版社是人民军医出版社,作者是秦惠基,出版时间是 2006年4月1日", "spo_list": [{"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍", "object": "人民军医出版社", "subject": "《健行天下:带上一本健康的书去出行》"}, {"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "秦惠基", "subject": "《健行天下:带上一本健康的书去出行》"}]} {"postag": [{"word": "1994年", "pos": "t"}, {"word": ",", "pos": "w"}, {"word": "许晴", "pos": "nr"}, {"word": "又", "pos": "d"}, {"word": "与", "pos": "p"}, {"word": "王志文", "pos": "nr"}, {"word": "拍摄", "pos": "v"}, {"word": "了", "pos": "u"}, {"word": "电视剧", "pos": "n"}, {"word": "《", "pos": "w"}, {"word": "东边日出西边雨", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "但", "pos": "c"}, {"word": "之后", "pos": "f"}, {"word": "王志文", "pos": "nr"}, {"word": "突然", "pos": "ad"}, {"word": "离开", "pos": "v"}, {"word": "了", "pos": "u"}, {"word": "北京", "pos": "ns"}, {"word": ",", "pos": "w"}, {"word": "到", "pos": "v"}, {"word": "上海", "pos": "ns"}, {"word": "发展", "pos": "v"}, {"word": "事业", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "这", "pos": "r"}, {"word": "段", "pos": "q"}, {"word": "恋情", 
"pos": "n"}, {"word": "戛然而止", "pos": "v"}], "text": "1994年,许晴又与王志文拍摄了电视剧《东边日出西边雨》,但之后王志文突然离开了北京,到上海发展事业,这段恋情戛然而止", "spo_list": [{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "许晴", "subject": "《东边日出西边雨》"}]} {"postag": [{"word": "马红宝", "pos": "nr"}, {"word": " ", "pos": "w"}, {"word": "男", "pos": "a"}, {"word": " ", "pos": "w"}, {"word": "汉族", "pos": "nz"}, {"word": ",", "pos": "w"}, {"word": "1949年", "pos": "t"}, {"word": "8月", "pos": "t"}, {"word": "生", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "浙江省", "pos": "ns"}, {"word": "长兴县", "pos": "ns"}, {"word": "人", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "1978年", "pos": "t"}, {"word": "8月", "pos": "t"}, {"word": "加入", "pos": "v"}, {"word": "中国共产党", "pos": "nt"}, {"word": ",", "pos": "w"}, {"word": "1967年", "pos": "t"}, {"word": "9月", "pos": "t"}, {"word": "参加", "pos": "v"}, {"word": "工作", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "大普", "pos": "n"}, {"word": "文化", "pos": "n"}], "text": "马红宝 男 汉族,1949年8月生,浙江省长兴县人,1978年8月加入中国共产党,1967年9月参加工作,大普文化", "spo_list": [{"predicate": "民族", "object_type": "Text", "subject_type": "人物", "object": "汉族", "subject": "马红宝"}, {"predicate": "国籍", "object_type": "国家", "subject_type": "人物", "object": "中国", "subject": "马红宝"}, {"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "浙江省长兴", "subject": "马红宝"}, {"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "1949年8月", "subject": "马红宝"}]} {"postag": [{"word": "吕雅娟", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "博士", "pos": "n"}, {"word": "毕业", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "哈尔滨工业大学", "pos": "nt"}, {"word": ",", "pos": "w"}, {"word": "现任", "pos": "v"}, {"word": "百度", "pos": "nt"}, {"word": "高级", "pos": "a"}, {"word": "研究员", "pos": "n"}], "text": "吕雅娟,博士毕业于哈尔滨工业大学,现任百度高级研究员", "spo_list": [{"predicate": "毕业院校", "object_type": "学校", "subject_type": "人物", "object": "哈尔滨工业大学", "subject": "吕雅娟"}]} {"postag": [{"word": "《", "pos": "w"}, 
{"word": "小王子", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "由", "pos": "p"}, {"word": "神田武幸", "pos": "nr"}, {"word": "导演", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "松野达也", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "増冈弘", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "松尾佳子", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "た", "pos": "w"}, {"word": "て", "pos": "w"}, {"word": "か", "pos": "w"}, {"word": "べ", "pos": "w"}, {"word": "和", "pos": "c"}, {"word": "也", "pos": "d"}, {"word": "主演", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "1978年", "pos": "t"}, {"word": "7月", "pos": "t"}, {"word": "上映", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "电影", "pos": "n"}], "text": "《小王子》是由神田武幸导演,松野达也、増冈弘、松尾佳子、たてかべ和也主演,1978年7月上映的电影", "spo_list": [{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "たてかべ和也", "subject": "《小王子》"}, {"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "神田武幸", "subject": "《小王子》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "松尾佳子", "subject": "《小王子》"}]} {"postag": [{"word": "《", "pos": "w"}, {"word": "你的嘴", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "收录", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "歌手", "pos": "n"}, {"word": "金莎", "pos": "nr"}, {"word": "的", "pos": "u"}, {"word": "音乐", "pos": "n"}, {"word": "专辑", "pos": "n"}, {"word": "《", "pos": "w"}, {"word": "星月神话", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "由", "pos": "p"}, {"word": "许嵩", "pos": "nr"}, {"word": "作词", "pos": "v"}, {"word": "作曲", "pos": "vn"}, {"word": ",", "pos": "w"}, {"word": "2010年10月15日", "pos": "t"}, {"word": "首发", "pos": "v"}], "text": "《你的嘴》收录于歌手金莎的音乐专辑《星月神话》,由许嵩作词作曲,2010年10月15日首发", "spo_list": [{"predicate": "所属专辑", "object_type": "音乐专辑", "subject_type": "歌曲", "object": "《星月神话》", "subject": "《你的嘴》"}, {"predicate": "作词", "object_type": "人物", "subject_type": "歌曲", "object": "许嵩", "subject": 
"《你的嘴》"}, {"predicate": "作曲", "object_type": "人物", "subject_type": "歌曲", "object": "许嵩", "subject": "《你的嘴》"}, {"predicate": "歌手", "object_type": "人物", "subject_type": "歌曲", "object": "金莎", "subject": "《你的嘴》"}]} {"postag": [{"word": "黑泽明", "pos": "nr"}, {"word": "在", "pos": "p"}, {"word": "他", "pos": "r"}, {"word": "的", "pos": "u"}, {"word": "自传", "pos": "n"}, {"word": "《", "pos": "w"}, {"word": "蛤蟆的油", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "结尾处", "pos": "n"}, {"word": "写道", "pos": "v"}, {"word": ":", "pos": "w"}, {"word": "再", "pos": "d"}, {"word": "没有", "pos": "v"}, {"word": "什么", "pos": "r"}, {"word": "比", "pos": "p"}, {"word": "作品", "pos": "n"}, {"word": "更", "pos": "d"}, {"word": "能", "pos": "v"}, {"word": "说明", "pos": "v"}, {"word": "作者", "pos": "n"}, {"word": "了", "pos": "xc"}, {"word": ",", "pos": "w"}, {"word": "所以", "pos": "c"}, {"word": "如果", "pos": "c"}, {"word": "想", "pos": "v"}, {"word": "了解", "pos": "v"}, {"word": "王尼玛", "pos": "nr"}, {"word": "是", "pos": "v"}, {"word": "一个", "pos": "m"}, {"word": "怎么样", "pos": "r"}, {"word": "的", "pos": "u"}, {"word": "人", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "在", "pos": "p"}, {"word": "不", "pos": "d"}, {"word": "认识", "pos": "v"}, {"word": "他", "pos": "r"}, {"word": "的", "pos": "u"}, {"word": "前提", "pos": "n"}, {"word": "下", "pos": "f"}, {"word": ",", "pos": "w"}, {"word": "应该", "pos": "v"}, {"word": "去", "pos": "v"}, {"word": "看看", "pos": "v"}, {"word": "暴走大事件", "pos": "nw"}, {"word": "是", "pos": "v"}, {"word": "一档", "pos": "m"}, {"word": "怎么样", "pos": "r"}, {"word": "的", "pos": "u"}, {"word": "节目", "pos": "n"}], "text": "黑泽明在他的自传《蛤蟆的油》结尾处写道:再没有什么比作品更能说明作者了,所以如果想了解王尼玛是一个怎么样的人,在不认识他的前提下,应该去看看暴走大事件是一档怎么样的节目", "spo_list": [{"predicate": "作者", "object_type": "人物", "subject_type": "图书作品", "object": "黑泽明", "subject": "《蛤蟆的油》"}]} {"postag": [{"word": "红地球葡萄栽培技术问答", "pos": "nw"}, {"word": "是", "pos": "v"}, {"word": "一本", "pos": "m"}, {"word": "由", "pos": "p"}, {"word": "刘洪章", "pos": "nr"}, {"word": 
"主编", "pos": "v"}, {"word": ",", "pos": "w"}, {"word": "天津科技翻译出版公司", "pos": "nt"}, {"word": "出版", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "关于", "pos": "p"}, {"word": "红地球葡萄", "pos": "nz"}, {"word": "标准化", "pos": "v"}, {"word": "、", "pos": "w"}, {"word": "产业化", "pos": "vn"}, {"word": "栽培", "pos": "vn"}, {"word": "技术", "pos": "n"}, {"word": "为", "pos": "v"}, {"word": "核心", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "图书", "pos": "n"}], "text": "红地球葡萄栽培技术问答是一本由刘洪章主编,天津科技翻译出版公司出版的关于红地球葡萄标准化、产业化栽培技术为核心的图书", "spo_list": [{"predicate": "出版社", "object_type": "出版社", "subject_type": "书籍", "object": "天津科技翻译出版公司", "subject": "红地球葡萄栽培技术问答"}]} {"postag": [{"word": "前秦世祖", "pos": "nz"}, {"word": "宣昭皇帝", "pos": "nr"}, {"word": "苻坚", "pos": "nr"}, {"word": "(", "pos": "w"}, {"word": "338年-385年10月16日", "pos": "t"}, {"word": ")", "pos": "w"}, {"word": ",", "pos": "w"}, {"word": "字", "pos": "n"}, {"word": "永固", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "又", "pos": "d"}, {"word": "字", "pos": "n"}, {"word": "文玉", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "小名", "pos": "n"}, {"word": "坚", "pos": "a"}, {"word": "头", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "氐族", "pos": "nz"}, {"word": ",", "pos": "w"}, {"word": "略阳", "pos": "ns"}, {"word": "临渭", "pos": "ns"}, {"word": "(", "pos": "w"}, {"word": "今", "pos": "t"}, {"word": "甘肃", "pos": "ns"}, {"word": "秦安", "pos": "ns"}, {"word": ")", "pos": "w"}, {"word": "人", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "十六国", "pos": "n"}, {"word": "时期", "pos": "n"}, {"word": "前秦", "pos": "t"}, {"word": "的", "pos": "u"}, {"word": "君主", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "公元", "pos": "n"}, {"word": "357", "pos": "m"}, {"word": "-", "pos": "w"}, {"word": "385年", "pos": "m"}, {"word": "在位", "pos": "v"}], "text": "前秦世祖宣昭皇帝苻坚(338年-385年10月16日),字永固,又字文玉,小名坚头,氐族,略阳临渭(今甘肃秦安)人,十六国时期前秦的君主,公元357-385年在位", "spo_list": [{"predicate": "出生日期", "object_type": "Date", "subject_type": "人物", "object": "338年", "subject": 
"苻坚"}, {"predicate": "字", "object_type": "Text", "subject_type": "历史人物", "object": "文玉", "subject": "苻坚"}, {"predicate": "出生地", "object_type": "地点", "subject_type": "人物", "object": "略阳临渭", "subject": "苻坚"}, {"predicate": "民族", "object_type": "Text", "subject_type": "人物", "object": "氐族", "subject": "苻坚"}]} {"postag": [{"word": "目前", "pos": "t"}, {"word": "各", "pos": "r"}, {"word": "大", "pos": "a"}, {"word": "主演", "pos": "n"}, {"word": "已经", "pos": "d"}, {"word": "确定", "pos": "v"}, {"word": "了", "pos": "xc"}, {"word": ",", "pos": "w"}, {"word": "饰演", "pos": "v"}, {"word": "许仙", "pos": "nr"}, {"word": "的", "pos": "u"}, {"word": "是", "pos": "v"}, {"word": "小编", "pos": "n"}, {"word": "喜欢", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "于朦胧", "pos": "nr"}, {"word": "看过", "pos": "v"}, {"word": "于", "pos": "p"}, {"word": "朦胧", "pos": "a"}, {"word": "在", "pos": "p"}, {"word": "《", "pos": "w"}, {"word": "三生三世十里桃花", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": " ", "pos": "w"}, {"word": "中", "pos": "f"}, {"word": "饰演", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "白真", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "和", "pos": "p"}, {"word": "《", "pos": "w"}, {"word": "太子妃升职记", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "中", "pos": "f"}, {"word": "饰演", "pos": "v"}, {"word": "九王", "pos": "nz"}, {"word": "的", "pos": "u"}, {"word": "观众", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "相信", "pos": "v"}, {"word": "多多少少", "pos": "d"}, {"word": "会", "pos": "v"}, {"word": "对", "pos": "p"}, {"word": "他", "pos": "r"}, {"word": "抱有", "pos": "v"}, {"word": "期待", "pos": "vn"}, {"word": ",", "pos": "w"}, {"word": "于", "pos": "p"}, {"word": "朦胧", "pos": "a"}, {"word": "真", "pos": "a"}, {"word": "的", "pos": "u"}, {"word": "是", "pos": "v"}, {"word": "那种", "pos": "r"}, {"word": "温润", "pos": "a"}, {"word": "如玉", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "衣袂", "pos": "n"}, {"word": "飘飘", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "儒雅", "pos": "a"}, {"word": 
"公子哥", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "气质", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "饰演", "pos": "v"}, {"word": "同样", "pos": "d"}, {"word": "儒雅", "pos": "a"}, {"word": "的", "pos": "u"}, {"word": "许仙", "pos": "nr"}, {"word": "真", "pos": "a"}, {"word": "的", "pos": "u"}, {"word": "是", "pos": "v"}, {"word": "别", "pos": "d"}, {"word": "无", "pos": "v"}, {"word": "二人", "pos": "n"}, {"word": "了", "pos": "xc"}], "text": "目前各大主演已经确定了,饰演许仙的是小编喜欢的于朦胧看过于朦胧在《三生三世十里桃花》 中饰演的白真,和《太子妃升职记》中饰演九王的观众,相信多多少少会对他抱有期待,于朦胧真的是那种温润如玉,衣袂飘飘的儒雅公子哥的气质,饰演同样儒雅的许仙真的是别无二人了", "spo_list": [{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "于朦胧", "subject": "《三生三世十里桃花》"}, {"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "于朦胧", "subject": "《太子妃升职记》"}]} {"postag": [{"word": "73", "pos": "m"}, {"word": "获奖", "pos": "vn"}, {"word": "记录", "pos": "vn"}, {"word": "人物", "pos": "n"}, {"word": "评价", "pos": "v"}, {"word": "黄磊", "pos": "nr"}, {"word": "是", "pos": "v"}, {"word": "一个", "pos": "m"}, {"word": "特别", "pos": "d"}, {"word": "幸运", "pos": "a"}, {"word": "的", "pos": "u"}, {"word": "演员", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "拍", "pos": "v"}, {"word": "第一部", "pos": "m"}, {"word": "戏", "pos": "n"}, {"word": "就", "pos": "d"}, {"word": "碰到", "pos": "v"}, {"word": "了", "pos": "u"}, {"word": "导演", "pos": "n"}, {"word": "陈凯歌", "pos": "nr"}, {"word": ",", "pos": "w"}, {"word": "而且", "pos": "c"}, {"word": "在", "pos": "p"}, {"word": "他", "pos": "r"}, {"word": "的", "pos": "u"}, {"word": "下", "pos": "f"}, {"word": "一部", "pos": "m"}, {"word": "电影", "pos": "n"}, {"word": "《", "pos": "w"}, {"word": "夜半歌声", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "中", "pos": "f"}, {"word": "演", "pos": "v"}, {"word": "对手戏", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "张国荣", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "吴倩莲", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "黎明", "pos": "n"}, {"word": "等", "pos": "u"}, {"word": "都是", 
"pos": "v"}, {"word": "著名", "pos": "a"}, {"word": "的", "pos": "u"}, {"word": "港台", "pos": "ns"}, {"word": "演员", "pos": "n"}], "text": "73获奖记录人物评价黄磊是一个特别幸运的演员,拍第一部戏就碰到了导演陈凯歌,而且在他的下一部电影《夜半歌声》中演对手戏的张国荣、吴倩莲、黎明等都是著名的港台演员", "spo_list": [{"predicate": "主演", "object_type": "人物", "subject_type": "影视作品", "object": "黄磊", "subject": "《夜半歌声》"}]} {"postag": [{"word": "印象", "pos": "n"}, {"word": "中", "pos": "f"}, {"word": "的", "pos": "u"}, {"word": "《", "pos": "w"}, {"word": "智取威虎山", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "杨子荣", "pos": "nr"}, {"word": "、", "pos": "w"}, {"word": "座山雕", "pos": "nz"}, {"word": ",", "pos": "w"}, {"word": "天王盖地虎", "pos": "nw"}, {"word": ",", "pos": "w"}, {"word": "宝塔镇河妖", "pos": "ns"}, {"word": "可谓", "pos": "v"}, {"word": "是", "pos": "v"}, {"word": "耳熟能详", "pos": "vn"}, {"word": ",", "pos": "w"}, {"word": "感谢", "pos": "v"}, {"word": "徐克", "pos": "nr"}, {"word": "导演", "pos": "n"}, {"word": "让", "pos": "v"}, {"word": "我们", "pos": "r"}, {"word": "这一代", "pos": "r"}, {"word": "年轻人", "pos": "n"}, {"word": "得以", "pos": "v"}, {"word": "重温", "pos": "v"}, {"word": "上", "pos": "f"}, {"word": "一辈", "pos": "m"}, {"word": "人", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "经典", "pos": "a"}, {"word": "印象", "pos": "n"}, {"word": "中", "pos": "f"}, {"word": "的", "pos": "u"}, {"word": "徐克", "pos": "nr"}, {"word": "是", "pos": "v"}, {"word": "港式", "pos": "n"}, {"word": "武侠片", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "代表", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "《", "pos": "w"}, {"word": "新龙门客栈", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "、", "pos": "w"}, {"word": "《", "pos": "w"}, {"word": "黄飞鸿", "pos": "nz"}, {"word": "》", "pos": "w"}, {"word": "系列", "pos": "n"}, {"word": "依然", "pos": "d"}, {"word": "历历在目", "pos": "v"}], "text": "印象中的《智取威虎山》杨子荣、座山雕,天王盖地虎,宝塔镇河妖可谓是耳熟能详,感谢徐克导演让我们这一代年轻人得以重温上一辈人的经典印象中的徐克是港式武侠片的代表,《新龙门客栈》、《黄飞鸿》系列依然历历在目", "spo_list": [{"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "徐克", 
"subject": "《智取威虎山》"}]} {"postag": [{"word": "《", "pos": "w"}, {"word": "滑板战士", "pos": "nw"}, {"word": "》", "pos": "w"}, {"word": "是", "pos": "v"}, {"word": "虎", "pos": "n"}, {"word": "田", "pos": "n"}, {"word": "功", "pos": "n"}, {"word": "导演", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "Studio DEEN", "pos": "nt"}, {"word": " ", "pos": "w"}, {"word": "NAS", "pos": "nz"}, {"word": " ", "pos": "w"}, {"word": "TV", "pos": "nz"}, {"word": " ", "pos": "w"}, {"word": "Tokyo", "pos": "nz"}, {"word": "出品", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "日本", "pos": "ns"}, {"word": "动画", "pos": "n"}, {"word": "作品", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "讲述", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "是", "pos": "v"}, {"word": "未来", "pos": "t"}, {"word": "的", "pos": "u"}, {"word": "地球", "pos": "n"}, {"word": ",", "pos": "w"}, {"word": "人类", "pos": "n"}, {"word": "受到", "pos": "v"}, {"word": "了", "pos": "u"}, {"word": "在", "pos": "p"}, {"word": "数年", "pos": "m"}, {"word": "前", "pos": "f"}, {"word": "突然", "pos": "ad"}, {"word": "出现", "pos": "v"}, {"word": "的", "pos": "u"}, {"word": "被", "pos": "p"}, {"word": "称", "pos": "v"}, {"word": "之", "pos": "r"}, {"word": "为", "pos": "v"}, {"word": "“", "pos": "w"}, {"word": "巴古希恩", "pos": "nz"}, {"word": "”", "pos": "w"}, {"word": "的", "pos": "u"}, {"word": "迷", "pos": "vn"}, {"word": "之", "pos": "r"}, {"word": "兵器", "pos": "n"}, {"word": "的", "pos": "u"}, {"word": "袭击", "pos": "vn"}], "text": "《滑板战士》是虎田功导演,Studio DEEN NAS TV Tokyo出品的日本动画作品,讲述的是未来的地球,人类受到了在数年前突然出现的被称之为“巴古希恩”的迷之兵器的袭击", "spo_list": [{"predicate": "导演", "object_type": "人物", "subject_type": "影视作品", "object": "虎田功", "subject": "《滑板战士》"}, {"predicate": "出品公司", "object_type": "企业", "subject_type": "影视作品", "object": "Studio DEEN NAS TV Tokyo", "subject": "《滑板战士》"}]} ================================================ FILE: trans.py ================================================ #! 
# -*- coding:utf-8 -*-
"""Convert the raw competition files into the compact JSON files used for training.

Run as ``python trans.py`` from the directory that contains the inputs.

Inputs (UTF-8, one JSON object per line):
    all_50_schemas, train_data.json, dev_data.json

Outputs (UTF-8 JSON, written to the current directory):
    all_50_schemas_me.json  -- [id2predicate, predicate2id]
    train_data_me.json      -- list of {'text', 'spo_list'} samples
    dev_data_me.json        -- list of {'text', 'spo_list'} samples
    all_chars_me.json       -- [id2char, char2id]
"""
import codecs
import json

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so fall back to a transparent
    # pass-through instead of refusing to run when tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable

# Characters seen fewer than this many times (train + dev) are left out of the
# vocabulary written to all_chars_me.json.
min_count = 2


def read_json_lines(path):
    """Yield one decoded JSON object per non-blank line of the UTF-8 file *path*."""
    # The data is Chinese text: decode explicitly as UTF-8 rather than relying
    # on the platform's default locale encoding.
    with open(path, encoding='utf-8') as f:
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines instead of crashing in json.loads.
                continue
            yield json.loads(line)


def write_json(path, obj):
    """Write *obj* to *path* as indented UTF-8 JSON without ASCII escaping."""
    with codecs.open(path, 'w', encoding='utf-8') as f:
        json.dump(obj, f, indent=4, ensure_ascii=False)


def build_predicate_maps(predicates):
    """Return ``(id2predicate, predicate2id)`` for the distinct *predicates*.

    Ids start at 1 because 0 is reserved for the terminating class. The
    predicates are sorted before numbering so that every run produces the same
    ids; numbering a bare set would depend on per-process string hashing.
    """
    id2predicate = {i + 1: p for i, p in enumerate(sorted(set(predicates)))}
    predicate2id = {p: i for i, p in id2predicate.items()}
    return id2predicate, predicate2id


def convert_samples(records, chars):
    """Reduce raw records to ``{'text', 'spo_list'}`` samples.

    Each entry of ``spo_list`` becomes a ``(subject, predicate, object)``
    tuple. As a side effect, the per-character counts in *chars* are updated
    in place with every character of every record's text.
    """
    samples = []
    for record in records:
        text = record['text']
        samples.append({
            'text': text,
            'spo_list': [
                (spo['subject'], spo['predicate'], spo['object'])
                for spo in record['spo_list']
            ],
        })
        for c in text:
            chars[c] = chars.get(c, 0) + 1
    return samples


def build_char_maps(chars, min_count):
    """Return ``(id2char, char2id)`` for characters counted at least *min_count* times.

    Ids start at 2: 0 is reserved for padding and 1 for unknown characters.
    """
    kept = {c: n for c, n in chars.items() if n >= min_count}
    id2char = {i + 2: c for i, c in enumerate(kept)}  # padding: 0, unk: 1
    char2id = {c: i for i, c in id2char.items()}
    return id2char, char2id


def main():
    """Read the raw files from the current directory and write the *_me.json files."""
    predicates = [schema['predicate'] for schema in read_json_lines('all_50_schemas')]
    id2predicate, predicate2id = build_predicate_maps(predicates)
    write_json('all_50_schemas_me.json', [id2predicate, predicate2id])

    # Character counts are accumulated over both the train and the dev split.
    chars = {}

    train_data = convert_samples(read_json_lines('train_data.json'), chars)
    write_json('train_data_me.json', train_data)

    dev_data = convert_samples(read_json_lines('dev_data.json'), chars)
    write_json('dev_data_me.json', dev_data)

    id2char, char2id = build_char_maps(chars, min_count)
    write_json('all_chars_me.json', [id2char, char2id])


if __name__ == '__main__':
    main()