attention
Word vectors: $x_1$ (早), $x_2$ (上), $x_3$ (好)
$x_i x_j^T$: the dot product reflects the angle between two vectors, i.e. the projection of one vector onto the other; a large projection value means the two vectors are highly correlated.
Normalize (softmax) -> weights
$z_1$ (早) = $[0.476, 0.428, 0.096]$
That is, 早 assigns a weight of 0.476 to itself, 0.428 to 上, and 0.096 to 好.
Weighted sum -> attention vector
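To make the three steps concrete, here is a minimal runnable sketch with three hypothetical 4-dimensional word vectors (illustrative values only, not the vectors behind the 0.476/0.428/0.096 numbers above): dot products, softmax normalization, then the weighted sum.
import torch

# three hypothetical word vectors for 早, 上, 好 (illustrative values only)
X = torch.tensor([[1.0, 0.5, 0.2, 0.3],
                  [0.9, 0.4, 0.1, 0.2],
                  [0.1, 0.8, 0.7, 0.6]])

scores = X @ X.T                          # pairwise dot products, shape [3, 3]
weights = torch.softmax(scores, dim=-1)   # normalize each row into attention weights
Z = weights @ X                           # weighted sum of the word vectors, shape [3, 4]
print(weights[0])                         # weights the first word assigns to 早 / 上 / 好
print(Z[0])                               # attention vector z_1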
$n$: number of input word vectors
$d$: dimension of the input vectors, d = dimension of X = 4
$d_k$: dimension of the key vectors (each row of K); can be set equal to $d$, the dimension of $x$
$d_v$: dimension of the value vectors (each row of V); can be set equal to $d$, the dimension of $x$
($W^q, W^k, W^v$ are the transformation matrices; their entries are learnable parameters)
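Put together, these symbols give the scaled dot-product attention from Attention Is All You Need, which the implementation below follows:
$$Q = XW^q,\quad K = XW^k,\quad V = XW^v,\qquad \mathrm{Attention}(Q,K,V) = \mathrm{softmax}\!\left(\frac{QK^T}{\sqrt{d_k}}\right)V$$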
self-attention PyTorch implementation
import torch
from torch import nn
import numpy as np

class SelfAttention(nn.Module):
    def __init__(self, input_dim, key_dim=None, value_dim=None):
        super(SelfAttention, self).__init__()
        self.input_dim = input_dim
        # fall back to input_dim when key/value dimensions are not given
        self.key_dim = key_dim if key_dim is not None else input_dim
        self.value_dim = value_dim if value_dim is not None else input_dim
        self.query = nn.Linear(self.input_dim, self.key_dim, bias=False)
        self.key = nn.Linear(self.input_dim, self.key_dim, bias=False)
        self.value = nn.Linear(self.input_dim, self.value_dim, bias=False)
        self.norm_fact = 1 / np.sqrt(self.key_dim)

    # X -> [batch_size, seq_len, input_dim]
    def forward(self, X):
        query = self.query(X)        # [batch_size, seq_len, key_dim]
        key = self.key(X)            # [batch_size, seq_len, key_dim]
        value = self.value(X)        # [batch_size, seq_len, value_dim]
        keyT = key.permute(0, 2, 1)  # [batch_size, key_dim, seq_len]
        # scale the scores by 1/sqrt(d_k) before the softmax
        attention = nn.Softmax(dim=-1)(torch.bmm(query, keyT) * self.norm_fact)
        output = torch.bmm(attention, value)  # [batch_size, seq_len, value_dim]
        return output
model = SelfAttention(4, 5, 3)   # input_dim=4, key_dim=5, value_dim=3
X = torch.rand(1, 3, 4)          # batch_size=1, seq_len=3, input_dim=4
print(X)
output = model(X)
print(output)
print(output.size())             # [1, 3, 3] because value_dim=3
tensor([[[0.9641, 0.7163, 0.9725, 0.6660],
[0.2968, 0.2546, 0.8528, 0.9114],
[0.0058, 0.5362, 0.1145, 0.9804]]])
tensor([[[-0.0294, 0.1206, 0.1320],
[-0.0314, 0.1191, 0.1306],
[-0.0329, 0.1189, 0.1299]]], grad_fn=<BmmBackward0>)
torch.Size([1, 3, 3])
Multi-head Self-Attention
$heads = 2, dim = 4$; the per-head dimension is head_dim = dim / heads = 2
reshape $XW: [seqlen, dim] \to [heads, seqlen, \frac{dim}{heads}]$
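A minimal sketch of this split, with arbitrary tensor values (only the shapes matter):
import torch

seq_len, dim, heads = 3, 4, 2
XW = torch.rand(seq_len, dim)                    # [seq_len, dim]
split = XW.view(seq_len, heads, dim // heads)    # [seq_len, heads, dim/heads]
split = split.transpose(0, 1)                    # [heads, seq_len, dim/heads]
print(split.shape)                               # torch.Size([2, 3, 2])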
multi-head attention PyTorch implementation
import torch
from torch import nn
import numpy as np

'''
Self-Attention shape:        [batch_size, seq_len, input_dim]
Multi-Head Attention shape:  [batch_size, heads, seq_len, input_dim/heads]
'''

def attention(query, key, value):
    # scaled dot-product attention over the last two dimensions
    key_dim = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / np.sqrt(key_dim)
    attention_weight = scores.softmax(dim=-1)
    return torch.matmul(attention_weight, value)

class MultiHeadAttention(nn.Module):
    def __init__(self, heads, input_dim):
        super(MultiHeadAttention, self).__init__()
        assert input_dim % heads == 0  # input_dim must be divisible by heads
        self.key_dim = input_dim // heads
        self.heads = heads
        # nn.ModuleList registers the linear layers as sub-modules (parameters are tracked)
        self.linears = nn.ModuleList([
            nn.Linear(input_dim, input_dim),  # W-Q
            nn.Linear(input_dim, input_dim),  # W-K
            nn.Linear(input_dim, input_dim),  # W-V
            nn.Linear(input_dim, input_dim),  # W-O
        ])

    def forward(self, x):
        batch_size = x.size(0)
        # project x to Q, K, V and split into heads:
        # [batch_size, seq_len, input_dim] -> [batch_size, heads, seq_len, input_dim/heads]
        query, key, value = [
            linear(x).view(batch_size, -1, self.heads, self.key_dim).transpose(1, 2)
            for linear, x in zip(self.linears, (x, x, x))
        ]
        # x shape: [batch_size, heads, seq_len, input_dim/heads]
        x = attention(query, key, value)
        # concatenate the heads back: [batch_size, seq_len, input_dim]
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.heads * self.key_dim)
        return self.linears[-1](x)  # final output projection W-O
multi_head_model = MultiHeadAttention(2, 4)  # heads=2, input_dim=4
output = multi_head_model(X)                 # reuse X from the self-attention example
print(output)
print(output.size())
tensor([[[ 0.1228, -0.1211, 0.5368, -0.1685],
[ 0.1230, -0.1210, 0.5374, -0.1690],
[ 0.1229, -0.1207, 0.5375, -0.1683]]], grad_fn=<AddBackward0>)
torch.Size([1, 3, 4])
transformer
PositionalEncoding
[!NOTE]
Self-attention is order-agnostic, so the positional information of the words is lost. Positional Encoding encodes the relative position of each word in the sentence, preserving word order.
If $[0,1,\dots,T]$ is used directly as the positional code (e.g. the 3rd word is encoded as $[3,3,3,3,\dots,3]$), then in a long sentence the last word's code is far larger than the first word's, and merging it with the embedding easily skews the features. Normalizing, i.e. using $[0/T,1/T,\dots,T/T]$, avoids the skew but makes the encoding step size inconsistent across texts of different lengths: adjacent words are roughly 0.1 apart in a 10-word sentence but only about 0.01 apart in a 100-word one.
[!NOTE] Note
Using sin and cos keeps the positional-encoding values in $[-1, 1]$ with a consistent step size regardless of sentence length. Because sin and cos are periodic, different positions could otherwise receive the same encoding; dividing pos by $10000^{2i/dim}$ makes the period effectively unbounded, and sin and cos are alternated across dimensions, which avoids such collisions.
$pos$: position of the word in the sentence
$2i$: even dimension index
$2i+1$: odd dimension index
$dim$: encoding dimension
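With these symbols, the sinusoidal positional encoding (the formula from Attention Is All You Need, which the example and the code below follow) is:
$$PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/dim}}\right),\qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/dim}}\right)$$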
$X=[0.1,0.2,0.3,0.4],pos=2,posCode=[sin(\frac{2}{10000^{\frac{0}{4}}}),cos(\frac{2}{10000^{\frac{0}{4}}}),sin(\frac{2}{10000^{\frac{2}{4}}}),cos(\frac{2}{10000^{\frac{2}{4}}})]$
import math
import torch
from torch import nn, Tensor

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout: float = 0.1, max_len: int = 64):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)   # [max_len, 1]
        # 1 / 10000^(2i/d_model), computed in log space for numerical stability
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)  # even dimensions
        pe[:, 0, 1::2] = torch.cos(position * div_term)  # odd dimensions
        self.register_buffer('pe', pe)                   # stored, but not a learnable parameter

    def forward(self, x: Tensor) -> Tensor:
        # x: Tensor, shape [seq_len, batch_size, embedding_dim]
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)
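A small usage sketch: d_model=4 so that the row for pos=2 matches the posCode example above, and eval() disables dropout so the raw codes are printed.
pos_enc = PositionalEncoding(d_model=4, dropout=0.1, max_len=64)
pos_enc.eval()                # disable dropout so the raw positional codes are visible
x = torch.zeros(3, 1, 4)      # [seq_len=3, batch_size=1, embedding_dim=4]
print(pos_enc(x))             # the row for pos=2 matches the posCode example above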
encoder
decoder
$Y$: shifted-right output
Note: 我有一只猫 -> I have a cat
$Y$: [begin] -> I, [begin, I] -> have, [begin, I, have] -> a, [begin, I, have, a] -> cat
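A minimal sketch of how the shifted-right target is built for teacher forcing (the token strings are illustrative, not an actual vocabulary):
tgt = ['I', 'have', 'a', 'cat']
decoder_input = ['[begin]'] + tgt[:-1]   # what the decoder sees:   [begin], I, have, a
decoder_label = tgt                      # what it should predict:  I, have, a, cat
for step in range(len(tgt)):
    print(decoder_input[:step + 1], '->', decoder_label[step])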
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # expand: d_model -> d_ff
        self.w_2 = nn.Linear(d_ff, d_model)   # project back: d_ff -> d_model
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # position-wise feed-forward: Linear -> ReLU -> Dropout -> Linear
        return self.w_2(self.dropout(self.w_1(x).relu()))
Masked Multi-Head Attention
[!NOTE] Note
During training, applying the mask and feeding the whole input matrix at once is equivalent to inference, where the words are decoded one by one in order.
import math
import torch
from torch import Tensor

def generate_square_subsequent_mask(size: int) -> Tensor:
    # size: target sequence length; returns an additive mask with -inf above the diagonal
    return torch.triu(torch.full((size, size), float('-inf')), diagonal=1)

def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # mask uses the 0/1 convention: positions where mask == 0 are blocked
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = scores.softmax(dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn
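A small usage sketch. Note the two mask conventions here: generate_square_subsequent_mask produces an additive mask (-inf above the diagonal, the style nn.Transformer expects), whereas attention() expects a 0/1 mask, so a lower-triangular 0/1 mask is built directly:
seq_len, d_k = 3, 4
q = torch.rand(seq_len, d_k)
k = torch.rand(seq_len, d_k)
v = torch.rand(seq_len, d_k)
causal_mask = torch.tril(torch.ones(seq_len, seq_len))  # 1 = keep, 0 = block future positions
out, weights = attention(q, k, v, mask=causal_mask)
print(weights)  # upper-triangular weights are ~0: each position only attends to itself and earlier ones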
transformer implementation
#waiting
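Until the full implementation lands, here is a minimal sketch (my own assembly, not the pending implementation) of a single encoder layer built from the MultiHeadAttention and FeedForward modules defined above, with residual connections and layer normalization; the decoder, with masked self-attention and cross-attention, is left for the full write-up:
class EncoderLayer(nn.Module):
    def __init__(self, heads, d_model, d_ff, dropout=0.1):
        super().__init__()
        # note: MultiHeadAttention here relies on the unmasked 3-argument attention()
        # from the multi-head section, not the masked variant above (which returns a tuple)
        self.self_attn = MultiHeadAttention(heads, d_model)      # defined above
        self.feed_forward = FeedForward(d_model, d_ff, dropout)  # defined above
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # sub-layer 1: multi-head self-attention + residual + layer norm
        x = self.norm1(x + self.dropout(self.self_attn(x)))
        # sub-layer 2: position-wise feed-forward + residual + layer norm
        x = self.norm2(x + self.dropout(self.feed_forward(x)))
        return x

encoder_layer = EncoderLayer(heads=2, d_model=4, d_ff=16)
print(encoder_layer(torch.rand(1, 3, 4)).shape)   # torch.Size([1, 3, 4])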