1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# Example 2: Chunked Taylor-approximation attention (for long sequences)
def chunked_taylor_attention(query, key, value, chunk_size=64):
    """Block-local attention using a 2nd-order Taylor approximation of softmax.

    The sequence is split into chunks of ``chunk_size`` positions and attention
    is computed only *within* each chunk (block-diagonal attention), so the
    full (seq_len x seq_len) score matrix is never materialized.  Inside each
    block, softmax is approximated via exp(x) ~= 1 + x + x^2 / 2, which is
    cheap but only accurate for scores near zero.

    Args:
        query: (batch_size, seq_len, d_model) query tensor.
        key:   (batch_size, seq_len, d_model) key tensor.
        value: (batch_size, seq_len, d_model) value tensor.
        chunk_size: number of positions per block; must be >= 1.

    Returns:
        Tensor of shape (batch_size, seq_len, d_model) holding the block-wise
        approximate attention output.

    Raises:
        ValueError: if chunk_size < 1.
    """
    import math

    import torch

    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    batch_size, seq_len, d_model = query.shape
    # Empty sequence: nothing to attend over; return an empty (b, 0, d) tensor
    # (torch.cat on an empty list would raise).
    if seq_len == 0:
        return value.clone()
    scale = math.sqrt(d_model)  # loop-invariant, hoisted out of the loop
    num_chunks = (seq_len + chunk_size - 1) // chunk_size  # ceil division
    outputs = []
    for i in range(num_chunks):
        start = i * chunk_size
        end = min(start + chunk_size, seq_len)  # last chunk may be shorter
        q_chunk = query[:, start:end, :]
        k_chunk = key[:, start:end, :]
        v_chunk = value[:, start:end, :]
        # Scaled dot-product scores, restricted to the current block.
        scores = torch.matmul(q_chunk, k_chunk.transpose(-2, -1)) / scale
        # Taylor series: exp(x) ~= 1 + x + x^2/2.  This polynomial is always
        # positive (minimum 1/2 at x = -1), so the weights stay non-negative.
        exp_approx = 1 + scores + (scores ** 2) / 2
        # Epsilon guards the division; it cannot actually hit zero here since
        # exp_approx >= 1/2, but it is kept for numerical safety.
        attn_weights = exp_approx / (exp_approx.sum(-1, keepdim=True) + 1e-9)
        outputs.append(torch.matmul(attn_weights, v_chunk))
    # Re-assemble the per-chunk outputs along the sequence dimension.
    return torch.cat(outputs, dim=1)
# Smoke test: run the chunked approximation on random data and show the shape.
if __name__ == "__main__":
    import torch

    n_batch, n_tokens, dim = 2, 128, 16
    x = torch.randn(n_batch, n_tokens, dim)
    result = chunked_taylor_attention(x, x, x, chunk_size=32)
    print("分块泰勒近似注意力输出形状:", result.shape)