class RMSNorm(nn.Module):
    def __init__(self, normalized_shape: list | tuple,
        eps: float = 1e-5, element_affine: bool = True,
    ):
        super().__init__()
        self.eps = eps
        self.element_affine = element_affine
        if self.element_affine:
            self.gamma = nn.Parameter(torch.ones(normalized_shape))
        else:
            self.register_parameter("gamma", None)

    def forward(self, x: torch.Tensor):
        x = x * torch.rsqrt(self.eps + x.pow(2).mean(dim=-1, keepdim=True))
        return x if self.gamma is None else x * self.gamma

Arquiteturas de LLMs

Tópicos em Ciência de Dados

Prof. Dr. Denis Mayr Lima Martins

Pontifícia Universidade Católica de Campinas

Grouped-Query Attention (GQA)

DeepSeek V3/R1

DeepSeek V3/R1: Resultados de MLA

Mixture of Experts (MoE)

DeepSeek V3/R1

Normalization Layer

Root Mean Square Layer Normalization

Root Mean Square Layer Normalization

Query-Key Normalization

Sliding Window Attention (SWA)

Sliding Window Attention (SWA)

Gemma 3 e Mistral 3.1

RoPE

Qwen3 0.6B

GPT-OSS

Extra: O preço da formatação

Leitura Recomendada

Arquiteturas de LLMs

Tópicos em Ciência de Dados

Prof. Dr. Denis Mayr Lima Martins

Pontifícia Universidade Católica de Campinas

Grouped-Query Attention (GQA)

DeepSeek V3/R1

DeepSeek V3/R1: Resultados de MLA

Mixture of Experts (MoE)

DeepSeek V3/R1

Normalization Layer

Root Mean Square Layer Normalization

Root Mean Square Layer Normalization

Query-Key Normalization

Sliding Window Attention (SWA)

Sliding Window Attention (SWA)

Gemma 3 e Mistral 3.1

RoPE

Qwen3 0.6B

GPT-OSS

Extra: O preço da formatação

Leitura Recomendada

Tópicos em Ciência de Dados

Prof. Dr. Denis Mayr Lima Martins

Pontifícia Universidade Católica de Campinas