Positional Embeddings

왜 필요할까?

"The dog chased another dog" → 위치 정보가 없다면 첫 번째 dog(dog1)과 두 번째 dog(dog2)은 동일한 토큰 임베딩을 가지므로, 모델은 두 토큰을 구별할 수 없다.

code

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
 
# Demo: without positional information, self-attention produces the same
# output for two occurrences of the same token ("dog" appears twice below).
model_id = "meta-llama/Llama-3.2-1B"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

text = "The dog chased another dog"
tokens = tok(text, return_tensors="pt")["input_ids"]
embeddings = model.embed_tokens(tokens)  # embedding of every token, processed as a batch
hdim = embeddings.shape[-1]  # size of the last (embedding) dimension, used as the hidden size

# Bias-free linear projections (y = Wx) for query, key and value.
# As nn.Module layers they can be used in both forward and backward passes.
W_q = nn.Linear(hdim, hdim, bias=False)  # query
W_k = nn.Linear(hdim, hdim, bias=False)  # key
W_v = nn.Linear(hdim, hdim, bias=False)  # value
mha = nn.MultiheadAttention(embed_dim=hdim, num_heads=4, batch_first=True)

# Re-initialise every parameter of the attention layer.
with torch.no_grad():
    for param in mha.parameters():
        nn.init.normal_(param, std=0.1)  # make the weights non-negligible

output, _ = mha(W_q(embeddings), W_k(embeddings), W_v(embeddings))

# Locate the two occurrences of the repeated token from the ids themselves
# instead of hard-coding positions, so the comparison stays valid no matter
# how many special tokens the tokenizer adds around the sentence.
token_ids = tokens[0].tolist()
positions_by_id = {}
for pos, token_id in enumerate(token_ids):
    positions_by_id.setdefault(token_id, []).append(pos)
repeated = [positions for positions in positions_by_id.values() if len(positions) > 1]
if not repeated:
    raise ValueError(
        "no repeated token id found in the tokenized text; "
        "cannot compare two occurrences of the same token"
    )
# First repeated id in order of appearance; take its first two positions.
dog1_pos, dog2_pos = repeated[0][:2]

dog1_out = output[0, dog1_pos]
dog2_out = output[0, dog2_pos]
# Expected: True. Both positions share one token id, hence the same embedding
# and the same query, and nothing in the inputs encodes their position.
print(f"Dog output identical?: {torch.allclose(dog1_out, dog2_out, atol=1e-6)}") #True