forked from tyliupku/soft-label-RE
Showing 4 changed files with 873 additions and 0 deletions.
@@ -0,0 +1,258 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time   : 17-3-21 1:50 PM
# @Author : Tianyu Liu

import tensorflow as tf
import numpy as np
import util, sys


class model(object):
    def __init__(self, pad_len, num_rels, word_vectors, window_size, num_filters, embedding_size, pos_embedding, dropout, batch_num, joint_p, l2_reg=0.0):
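        """Sentence-bag relation extraction model: PCNN encoder with piecewise
        max-pooling, selective attention over the sentences of a bag, and an
        optional soft-label loss.

        Argument roles, as inferred from their use below:
            pad_len:        length every sentence is padded/truncated to
            num_rels:       number of relation classes
            word_vectors:   pre-trained word embedding matrix (its last row is used as the PAD token)
            window_size:    width of the convolution window
            num_filters:    number of convolution filters
            embedding_size: dimension of the word embeddings
            pos_embedding:  dimension of each position embedding
            dropout:        keep probability passed to tf.nn.dropout
            batch_num:      number of bags per training batch
            joint_p:        confidence of the distantly supervised label (soft-label mixing weight)
            l2_reg:         accepted but not used below
        """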
        self.input = tf.placeholder(tf.int32, [None, pad_len], name="input")
        self.preds = tf.placeholder(tf.int32, [None, num_rels], name="preds")
        self.num_filters = num_filters
        self.mask1 = tf.placeholder(tf.float32, [None, pad_len - window_size + 1, self.num_filters], name="mask_before")
        self.mask2 = tf.placeholder(tf.float32, [None, pad_len - window_size + 1, self.num_filters], name="mask_between")
        self.mask3 = tf.placeholder(tf.float32, [None, pad_len - window_size + 1, self.num_filters], name="mask_after")
        self.wps1 = tf.placeholder(tf.int32, [None, pad_len], name="wps1")
        self.wps2 = tf.placeholder(tf.int32, [None, pad_len], name="wps2")
        self.pad_len = pad_len
        self.window_size = window_size
        self.num_rels = num_rels
        self.PAD = len(word_vectors) - 1
        self.bag_num = tf.placeholder(tf.int32, [batch_num + 1], name="bag_num")
        self.soft_label_flag = tf.placeholder(tf.float32, [batch_num], name="soft_label_flag")
        self.joint_p = joint_p
        total_num = self.bag_num[-1]
        self.batch_num = batch_num
        l2_loss = tf.constant(0.0)

        with tf.device('/cpu:0'):
            self.embedding = tf.Variable(word_vectors, dtype=tf.float32)
            self.inputs = tf.nn.embedding_lookup(self.embedding, self.input)
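        # Two position-embedding tables, one for distances to the head entity and
        # one for distances to the tail entity; 62 buckets are used, and index 61
        # is reserved for padded positions (see sen_padding below).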
        with tf.name_scope('joint'):
            wpe1 = tf.get_variable("wpe1", shape=[62, pos_embedding], initializer=tf.contrib.layers.xavier_initializer())
            wpe2 = tf.get_variable("wpe2", shape=[62, pos_embedding], initializer=tf.contrib.layers.xavier_initializer())
            pos_left = tf.nn.embedding_lookup(wpe1, self.wps1)
            pos_right = tf.nn.embedding_lookup(wpe2, self.wps2)
            self.pos_embed = tf.concat([pos_left, pos_right], 2)
        with tf.name_scope('conv'):
            self._input = tf.concat([self.inputs, self.pos_embed], 2)
            filter_shape = [window_size, embedding_size + 2 * pos_embedding, 1, num_filters]
            W = tf.get_variable("conv-W", shape=filter_shape, initializer=tf.contrib.layers.xavier_initializer())
            b = tf.get_variable("conv-b", shape=[num_filters], initializer=tf.contrib.layers.xavier_initializer())
            self.conv = tf.nn.conv2d(tf.expand_dims(self._input, -1), W, strides=[1, 1, 1, 1], padding="VALID", name="conv")
            h = tf.nn.tanh(tf.nn.bias_add(self.conv, b), name="tanh")
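            # Piecewise max-pooling: each mask is 0 inside its segment (before /
            # between / after the two entities) and -100 elsewhere, so adding it
            # before max-pooling restricts the pool to that segment.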
            self.h1 = tf.add(h, tf.expand_dims(self.mask1, 2))
            self.h2 = tf.add(h, tf.expand_dims(self.mask2, 2))
            self.h3 = tf.add(h, tf.expand_dims(self.mask3, 2))
            pooled1 = tf.nn.max_pool(self.h1, ksize=[1, self.pad_len - self.window_size + 1, 1, 1], strides=[1, 1, 1, 1], padding="VALID", name="pool")
            poolre1 = tf.reshape(pooled1, [-1, self.num_filters])
            pooled2 = tf.nn.max_pool(self.h2, ksize=[1, self.pad_len - self.window_size + 1, 1, 1], strides=[1, 1, 1, 1], padding="VALID", name="pool")
            poolre2 = tf.reshape(pooled2, [-1, self.num_filters])
            pooled3 = tf.nn.max_pool(self.h3, ksize=[1, self.pad_len - self.window_size + 1, 1, 1], strides=[1, 1, 1, 1], padding="VALID", name="pool")
            poolre3 = tf.reshape(pooled3, [-1, self.num_filters])
            poolre = tf.concat([poolre1, poolre2, poolre3], 1)
            pooled = tf.nn.dropout(poolre, dropout)
with tf.name_scope("map"): | ||
W = tf.get_variable( | ||
"W", | ||
shape=[3*self.num_filters, self.num_rels], | ||
initializer=tf.contrib.layers.xavier_initializer()) | ||
b = tf.get_variable("b", shape=[self.num_rels], initializer=tf.contrib.layers.xavier_initializer()) | ||
l2_loss += tf.nn.l2_loss(W) | ||
l2_loss += tf.nn.l2_loss(b) | ||

        sen_a = tf.get_variable("attention_A", [3 * self.num_filters], initializer=tf.contrib.layers.xavier_initializer())
        sen_q = tf.get_variable("query", [3 * self.num_filters, 1], initializer=tf.contrib.layers.xavier_initializer())
        sen_r = []
        sen_s = []
        sen_out = []
        sen_alpha = []
        self.bag_score = []
        self.predictions = []
        self.losses = []
        self.accuracy = []
        self.total_loss = 0.0
        # Selective attention: use the weighted sum of all sentence vectors in a bag as the bag representation.
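        # bag_num holds cumulative sentence offsets, so bag i covers rows
        # bag_num[i]:bag_num[i+1] of the pooled matrix; its attention weights are
        # softmax((sen_r * attention_A) . query) over the sentences of that bag.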
        for i in range(batch_num):
            sen_r.append(pooled[self.bag_num[i]:self.bag_num[i + 1]])
            bag_size = self.bag_num[i + 1] - self.bag_num[i]
            sen_alpha.append(tf.reshape(tf.nn.softmax(tf.reshape(tf.matmul(tf.multiply(sen_r[i], sen_a), sen_q), [bag_size])), [1, bag_size]))
            sen_s.append(tf.reshape(tf.matmul(sen_alpha[i], sen_r[i]), [1, 3 * self.num_filters]))
            sen_out.append(tf.reshape(tf.nn.xw_plus_b(sen_s[i], W, b), [self.num_rels]))
            self.bag_score.append(tf.nn.softmax(sen_out[i]))

            with tf.name_scope("output"):
                self.predictions.append(tf.argmax(self.bag_score[i], 0, name="predictions"))

            with tf.name_scope("loss"):
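                # Soft-label correction of the distantly supervised (DS) label:
                # with soft_label_flag = 1, the predicted bag distribution is mixed
                # with the one-hot DS label (scaled by joint_p and the max predicted
                # score) and the argmax of the mixture becomes the training label;
                # with the flag at 0, only the DS-label term remains, i.e. the
                # original hard label is used.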
                nscor = self.soft_label_flag[i] * self.bag_score[i] + joint_p * tf.reduce_max(self.bag_score[i]) * tf.cast(self.preds[i], tf.float32)
                self.nlabel = tf.reshape(tf.one_hot(indices=[tf.argmax(nscor, 0)], depth=self.num_rels, dtype=tf.int32), [self.num_rels])
                self.ccc = self.preds[i]
                self.losses.append(tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=sen_out[i], labels=self.nlabel)))

                if i == 0:
                    self.total_loss = self.losses[i]
                else:
                    self.total_loss += self.losses[i]

            with tf.name_scope("accuracy"):
                self.accuracy.append(tf.reduce_mean(tf.cast(tf.equal(self.predictions[i], tf.argmax(self.preds[i], 0)), "float"), name="accuracy"))

        with tf.name_scope("update"):
            self.global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
            self.train_op = optimizer.minimize(self.total_loss, global_step=self.global_step)
    # Pad sentences for the piecewise max-pooling operation described in
    # "Distant Supervision for Relation Extraction via Piecewise Convolutional Neural Networks".
    def sen_padding(self, sen_id, instance, lpos, rpos, real_sen, namepos):
        # instance: [5, 233, 3232, ...] -- ids of the sentences in the bag
        instances = []
        mask_before = []
        mask_between = []
        mask_after = []
        lwps = []
        rwps = []
        for id in instance:
            seq = sen_id[id]
            wps_left = lpos[id]
            wps_right = rpos[id]
            en1, en2 = namepos[id]
            t1, t2 = self.get_split(en1, en2)
            seq_len = len(real_sen[id].split())
            assert seq_len <= self.pad_len
            if seq_len <= self.pad_len:
                mask1 = np.zeros([self.pad_len - self.window_size + 1, self.num_filters], dtype=float)
                mask1[t1 + 1:, :] = -100.0
                mask2 = np.zeros([self.pad_len - self.window_size + 1, self.num_filters], dtype=float)
                mask2[:t1, :] = -100.0
                mask2[t2 + 1:, :] = -100.0
                mask3 = np.zeros([self.pad_len - self.window_size + 1, self.num_filters], dtype=float)
                mask3[:t2, :] = -100.0
                mask3[seq_len - self.window_size + 1:, :] = -100.0
                # mask = [1] * (seq_len - self.window_size + 1) + [0] * (self.pad_len - seq_len)
            if len(seq) < self.pad_len:
                llen = self.pad_len - len(seq)
                seq.extend([self.PAD] * llen)
                wps_left.extend([61] * llen)
                wps_right.extend([61] * llen)
            mask_before.append(mask1)
            mask_between.append(mask2)
            mask_after.append(mask3)
            instances.append(seq)
            lwps.append(wps_left)
            rwps.append(wps_right)
        return instances, mask_before, mask_between, mask_after, lwps, rwps

    def get_split(self, en1, en2):
        t1, t2 = en1, en2
        if en1 > en2:
            t1 = en2
            t2 = en1
        assert t1 <= t2
        return t1, t2

    def train(self, sess, bag_key, train_bag, sen_id, lpos, rpos, real_sen, namepos, use_soft_label=False):
        # bag_key: keys of the form "mid1\tmid2\trel" (tab-separated, relation id last)
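        # Builds one feed batch from the given bags: sentences are padded, the
        # piecewise-pooling masks and position features are built per sentence,
        # bag boundaries are stored as cumulative offsets in batch_sen_num, and
        # soft_label_flag switches the soft-label loss on for every bag.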
        batch = []
        pred = []
        mask_before = []
        mask_between = []
        mask_after = []
        wps_left = []
        wps_right = []
        batch_sen_num = []
        soft_label_flag = []
        cnt_sen = 0
        for key in bag_key:
            rel = int(key.split('\t')[-1])
            if use_soft_label:
                soft_label_flag.append(1)
            else:
                soft_label_flag.append(0)
            sentences = train_bag[key]
            sen_vec, mask_bef, mask_bet, mask_aft, llpos, rrpos = self.sen_padding(sen_id, sentences, lpos, rpos, real_sen, namepos)
            batch.extend(sen_vec)
            mask_before.extend(mask_bef)
            mask_between.extend(mask_bet)
            mask_after.extend(mask_aft)
            pred.append(rel)
            wps_left.extend(llpos)
            wps_right.extend(rrpos)
            batch_sen_num.append(cnt_sen)
            cnt_sen += len(sentences)
        batch_sen_num.append(cnt_sen)
        preds = np.zeros([len(bag_key), self.num_rels])
        preds[np.arange(len(bag_key)), pred] = 1
        _, hh, loss, acc, step = sess.run([self.train_op, self.h1, self.total_loss, self.accuracy, self.global_step], feed_dict={
            self.input: batch,
            self.mask1: mask_before,
            self.mask2: mask_between,
            self.mask3: mask_after,
            self.preds: preds,
            self.wps1: wps_left,
            self.wps2: wps_right,
            self.bag_num: batch_sen_num,
            self.soft_label_flag: soft_label_flag
        })

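        # Sanity check: tanh activations lie in (-1, 1), so a pooled maximum below
        # -50 would mean a masked (-100) position leaked into the pooling.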
        assert np.min(np.max(hh, axis=1)) > -50.0
        acc = np.reshape(np.array(acc), (self.batch_num))
        acc = np.mean(acc)
        return loss

    def test(self, sess, bag_key, test_bag, sen_id, lpos, rpos, real_sen, namepos):
        # bag_key: mid1 mid2 (entity-pair keys; no relation label at test time)
        pair_score = []
        cnt_i = 1
        batches = util.batch_iter(bag_key, self.batch_num, 1, shuffle=True)
        for bat in batches:
            if len(bat) < self.batch_num:
                continue
            batch = []
            mask_before = []
            mask_between = []
            mask_after = []
            wps_left = []
            wps_right = []
            batch_sen_num = []
            cnt_sen = 0
            for key in bat:
                # sys.stdout.write('testing %d cases...\r' % cnt_i)
                # sys.stdout.flush()
                cnt_i += 1
                sentences = test_bag[key]
                sen_vec, mask_bef, mask_bet, mask_aft, llpos, rrpos = self.sen_padding(sen_id, sentences, lpos, rpos, real_sen, namepos)
                batch.extend(sen_vec)
                mask_before.extend(mask_bef)
                mask_between.extend(mask_bet)
                mask_after.extend(mask_aft)
                wps_left.extend(llpos)
                wps_right.extend(rrpos)
                batch_sen_num.append(cnt_sen)
                cnt_sen += len(sentences)
            batch_sen_num.append(cnt_sen)
            soft_label_flag = [0] * len(bat)

            scores = sess.run(self.bag_score, feed_dict={
                self.input: batch,
                self.mask1: mask_before,
                self.mask2: mask_between,
                self.mask3: mask_after,
                self.wps1: wps_left,
                self.wps2: wps_right,
                self.bag_num: batch_sen_num,
                self.soft_label_flag: soft_label_flag})
            # score = np.max(scores, axis=0)
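            # Collect one score per (entity pair, relation) for the PR curve;
            # relation index 0 (presumably the NA class) is skipped.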
            for k, key in enumerate(bat):
                for i, sc in enumerate(scores[k]):
                    if i == 0:
                        continue
                    pair_score.append({"mid": key, "rel": i, "score": sc})
        return pair_score

@@ -0,0 +1,117 @@
from util import *
import attmodel as attmodel
import onemodel as onemodel
import tensorflow as tf
import os, time


tf.app.flags.DEFINE_integer("pad_len", 200, "Pad sentences to this length for convolution.") | ||
tf.app.flags.DEFINE_integer("embedding_size", 50, "Size of word embedding.") | ||
tf.app.flags.DEFINE_integer("pos_embedding", 5, "Size of position embedding.") | ||
tf.app.flags.DEFINE_integer("batch_num", 160, "Batch size for sentence encoding.") | ||
tf.app.flags.DEFINE_integer("num_rels", 53, "Number of pre-defined relations.") | ||
tf.app.flags.DEFINE_integer("window_size", 3, "Size of sliding window.") | ||
tf.app.flags.DEFINE_integer("num_filters", 230, "Number of filters for convolution.") | ||
tf.app.flags.DEFINE_float("dropout", 0.7,'dropout') | ||
|
||
tf.app.flags.DEFINE_string("one_or_att",'one','at-least-one or selective attention model') | ||
tf.app.flags.DEFINE_boolean("use_pre_train_model", False,'use pre-trained model or label') | ||
tf.app.flags.DEFINE_string("load_model_name", 'pretrain/model.ckpt-3300','the path of pre-trained model without soft-label') | ||
tf.app.flags.DEFINE_boolean("save_model", False,'save models or not') | ||
|
||
tf.app.flags.DEFINE_boolean("use_soft_label", True,'use soft label or not') | ||
tf.app.flags.DEFINE_float("confidence", 0.9,'confidence of distant-supervised label') | ||
|
||
tf.app.flags.DEFINE_string("dir",'res','dir to store results') | ||
tf.app.flags.DEFINE_integer("report", 100, "report loss & save models after every *** batches.") | ||
FLAGS = tf.app.flags.FLAGS | ||
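# A hypothetical invocation (this script's filename is not shown in the commit;
# 'train.py' is assumed here):
#   python train.py --one_or_att=att --use_soft_label=True --confidence=0.9 --dir=res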


# =================== make new dirs =================== #
prefix = str(int(time.time() * 1000))
top_dir = os.path.join(FLAGS.dir, prefix)  # dir to save all the results of this run
if not os.path.exists(FLAGS.dir):
    os.mkdir(FLAGS.dir)
if not os.path.exists(top_dir):
    os.mkdir(top_dir)
checkpoint_dir = os.path.join(top_dir, "checkpoint")  # dir to save models
if not os.path.exists(checkpoint_dir):
    os.mkdir(checkpoint_dir)
log_file = os.path.join(top_dir, 'log.txt')


def write_log(s):
    print(s)
    with open(log_file, 'a') as f:
        f.write(s + '\n')

# =================== load data =================== #
print("load training and testing data ...")
start_time = time.time()
vect = word2vec()  # load pre-trained word vectors
word_vocab, word_vector = get_word_vec(vect, one_or_att=FLAGS.one_or_att)  # load the vocabulary and pre-defined word vectors
'''
bag_train: a dict whose key identifies a triple (h, r, t); the value is the list of sentence ids that contain the triple.
sen_id: sentences in the training data, converted to word ids.
real_sen: original sentences in the training set.
lpos / rpos: the distance of each token to the head/tail entity, for position embeddings.
keypos: the positions of the two key (head and tail) entities in the sentences.
'''
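# Purely illustrative (hypothetical values, not taken from the dataset):
#   bag_train['m.0abc\tm.0xyz\t12'] -> [105, 2381]   # ids of sentences mentioning this pair/relation
#   sen_id[105] -> [word ids of sentence 105]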
bag_train, sen_id, lpos, rpos, real_sen, keypos = get_data(istrain=True, word_vocab=word_vocab)
bag_test, sen_id1, midrel, ltpos, rtpos, real_sen1, keypos1 = get_data(istrain=False, word_vocab=word_vocab)
bag_keys = bag_train.keys()

span = time.time() - start_time
print("training and testing data loaded, using %.3f seconds" % span)
write_log("training size: %d  testing size: %d" % (len(bag_train.keys()), len(bag_test.keys())))

# =================== model initialization =================== #
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
if FLAGS.one_or_att == 'att':
    load_model = attmodel
else:
    load_model = onemodel
model = load_model.model(pad_len=FLAGS.pad_len,
                         num_rels=FLAGS.num_rels,
                         word_vectors=word_vector,
                         window_size=FLAGS.window_size,
                         num_filters=FLAGS.num_filters,
                         embedding_size=FLAGS.embedding_size,
                         dropout=FLAGS.dropout,
                         pos_embedding=FLAGS.pos_embedding,
                         batch_num=FLAGS.batch_num,
                         joint_p=FLAGS.confidence)

saver = tf.train.Saver(max_to_keep=70)
if FLAGS.use_pre_train_model:
    saver.restore(sess, FLAGS.load_model_name)
    write_log("load pre-trained model from " + FLAGS.load_model_name)
    # ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
else:
    sess.run(tf.global_variables_initializer())
    write_log("create new model")
print("Model initialization complete")

# =================== training stage =================== #
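# One "epoch" below means FLAGS.report batches: after each such block the loss is
# logged, the model is evaluated on the test bags, and a precision-recall file
# pr<epoch>.txt is written under the result directory.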
batches = batch_iter(bag_train.keys(), FLAGS.batch_num, 20)
loss, start_time = 0.0, time.time()
for batch in batches:
    if len(batch) < FLAGS.batch_num:
        continue
    loss += model.train(sess, batch, bag_train, sen_id, lpos, rpos, real_sen, keypos, FLAGS.use_soft_label)
    step = tf.train.global_step(sess, model.global_step)
    progress_bar(step % FLAGS.report, FLAGS.report)
    if step % FLAGS.report == 0:  # report PR-curve results on the testing set
        cost_time = time.time() - start_time
        epoch = step // FLAGS.report
        write_log("%d : loss = %.3f, time = %.3f" % (epoch, loss, cost_time))
        print("evaluating after epoch " + str(epoch))
        pair_score = model.test(sess, bag_test.keys(), bag_test, sen_id1, ltpos, rtpos, real_sen1, keypos1)
        evaluate(top_dir + "/pr" + str(epoch) + ".txt", pair_score, midrel, epoch)
        loss, start_time = 0.0, time.time()
        if FLAGS.save_model:
            checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")
            saver.save(sess, checkpoint_path, global_step=model.global_step)
            write_log("save model in " + str(sess.run(model.global_step)))