-
Notifications
You must be signed in to change notification settings - Fork 12
/
attention.py
129 lines (104 loc) · 5.85 KB
/
attention.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import tensorflow as tf
def attention(inputs, attention_size, seq_len, time_major=False, return_alphas=False, batch_size = 10*40):
"""
Attention mechanism layer which reduces RNN/Bi-RNN outputs with Attention vector.
The idea was proposed in the article by Z. Yang et al., "Hierarchical Attention Networks
for Document Classification", 2016: http://www.aclweb.org/anthology/N16-1174.
Variables notation is also inherited from the article
Args:
inputs: The Attention inputs.
Matches outputs of RNN/Bi-RNN layer (not final state):
In case of RNN, this must be RNN outputs `Tensor`:
If time_major == False (default), this must be a tensor of shape:
`[batch_size, max_time, cell.output_size]`.
If time_major == True, this must be a tensor of shape:
`[max_time, batch_size, cell.output_size]`.
In case of Bidirectional RNN, this must be a tuple (outputs_fw, outputs_bw) containing the forward and
the backward RNN outputs `Tensor`.
If time_major == False (default),
outputs_fw is a `Tensor` shaped:
`[batch_size, max_time, cell_fw.output_size]`
and outputs_bw is a `Tensor` shaped:
`[batch_size, max_time, cell_bw.output_size]`.
If time_major == True,
outputs_fw is a `Tensor` shaped:
`[max_time, batch_size, cell_fw.output_size]`
and outputs_bw is a `Tensor` shaped:
`[max_time, batch_size, cell_bw.output_size]`.
attention_size: Linear size of the Attention weights.
time_major: The shape format of the `inputs` Tensors.
If true, these `Tensors` must be shaped `[max_time, batch_size, depth]`.
If false, these `Tensors` must be shaped `[batch_size, max_time, depth]`.
Using `time_major = True` is a bit more efficient because it avoids
transposes at the beginning and end of the RNN calculation. However,
most TensorFlow data is batch-major, so by default this function
accepts input and emits output in batch-major form.
return_alphas: Whether to return attention coefficients variable along with layer's output.
Used for visualization purpose.
Returns:
The Attention output `Tensor`.
In case of RNN, this will be a `Tensor` shaped:
`[batch_size, cell.output_size]`.
In case of Bidirectional RNN, this will be a `Tensor` shaped:
`[batch_size, cell_fw.output_size + cell_bw.output_size]`.
"""
if isinstance(inputs, tuple):
# In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
inputs = tf.concat(inputs, 2)
if time_major:
# (T,B,D) => (B,T,D)
inputs = tf.array_ops.transpose(inputs, [1, 0, 2])
hidden_size = inputs.shape[2].value # D value - hidden size of the RNN layer
# Trainable parameters
W_omega = tf.Variable(tf.random_normal([hidden_size, attention_size], stddev=0.1))
b_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
u_omega = tf.Variable(tf.random_normal([attention_size], stddev=0.1))
# One fully connected layer with non-linear activation for each of the hidden states;
# the shape of `v` is (B*T,A), where A=attention_size
v = tf.tanh(tf.matmul(tf.reshape(inputs, [-1, hidden_size]), W_omega) + tf.expand_dims(b_omega, 0))
# For each of the B*T hidden states its vector of size A from `v` is reduced with `u` vector
vu = tf.matmul(v, tf.expand_dims(u_omega, -1)) # (B*T, 1) shape
vu = tf.reshape(vu, tf.shape(inputs)[:2]) # (B,T) shape
#mask attention values for padding
mask = tf.sequence_mask(seq_len, inputs.shape[1].value)
attn_mask = tf.multiply(-30.0,tf.ones_like(vu))
attn_mask_z = tf.zeros_like(vu)
attn_pad = tf.where(mask,x = attn_mask_z,y = attn_mask)
alphas = tf.nn.softmax(tf.add(vu,attn_pad))
# Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
if not return_alphas:
return output
else:
return output, alphas
def attention_unf(inputs, attention_size, seq_len, time_major=False, return_alphas=False, batch_size=10*40):
"""
Instead of cumputing attention, returns plain average of hidden states.
Returns:
The average output `Tensor`.
In case of RNN, this will be a `Tensor` shaped:
`[batch_size, cell.output_size]`.
In case of Bidirectional RNN, this will be a `Tensor` shaped:
`[batch_size, cell_fw.output_size + cell_bw.output_size]`.
"""
if isinstance(inputs, tuple):
# In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
inputs = tf.concat(inputs, 2)
if time_major:
# (T,B,D) => (B,T,D)
inputs = tf.array_ops.transpose(inputs, [1, 0, 2])
hidden_size = inputs.shape[2].value # D value - hidden size of the RNN layer
# Create constant matrix of ones "uniform attention"
vu = tf.Variable(tf.ones([batch_size,inputs.shape[1].value], tf.float32), trainable=False) # (B,T) shape
#mask attention values for padding
mask = tf.sequence_mask(seq_len, inputs.shape[1].value)
attn_mask = tf.multiply(-30.0,tf.ones_like(vu))
attn_mask_z = tf.zeros_like(vu)
attn_pad = tf.where(mask,x = attn_mask_z,y = attn_mask)
alphas = tf.nn.softmax(tf.add(vu,attn_pad))
# Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
if not return_alphas:
return output
else:
return output, alphas