Patch MPT

If you meant something else by "patch MPT" (an ECU patch, firmware, an audio plugin), let me know.

Context: MPT (Modified Transformer) blocks use ALiBi or rotary position embeddings. This patch fixes rotary position cache invalidation and attention mask expansion for variable-length sequences in a custom MPT block.

The patch file opens with the rotary-embedding half, including the fp16-stable rotate-half helper:

```python
# patches/mpt_patch_rotary_cache.py
"""
Patch for MPT model:
- Fix rotary embedding cache when sequence length changes between forward passes.
- Correct attention mask broadcasting for cross-attention layers.
"""
import torch
import torch.nn as nn
from typing import Optional, Tuple


# ----------------------------------------------------------------------
# 1. Patch Rotary Embedding Cache
# ----------------------------------------------------------------------
def patched_rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Split and rotate half the hidden dims (fixed for fp16 stability)."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)
```
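The test at the end of this page constructs a `PatchedRotaryEmbedding`, which is not shown in the excerpt above. A minimal sketch of what that class might look like, reusing the imports from the patch file and assuming the standard rotary formulation (inverse-frequency table plus a cos/sin cache that is rebuilt whenever the requested sequence length changes), is:

```python
# Hypothetical sketch of the PatchedRotaryEmbedding used by the test below.
# The cache-invalidation logic (recompute cos/sin when seq_len changes) is an
# assumption based on the module docstring; the real patch may differ.
class PatchedRotaryEmbedding(nn.Module):
    def __init__(self, dim: int, max_seq_len: int = 2048, base: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        # Standard rotary inverse-frequency table.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._cached_seq_len = 0
        self._cos_cached: Optional[torch.Tensor] = None
        self._sin_cached: Optional[torch.Tensor] = None

    def _build_cache(self, seq_len: int, device, dtype) -> None:
        # Rebuild the cos/sin cache for the requested length.
        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self._cos_cached = emb.cos().to(dtype)
        self._sin_cached = emb.sin().to(dtype)
        self._cached_seq_len = seq_len

    def forward(self, x: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # The fix: invalidate the cache when seq_len differs from the cached
        # value instead of silently reusing stale cos/sin tables.
        if seq_len != self._cached_seq_len:
            self._build_cache(seq_len, x.device, x.dtype)
        return self._cos_cached[:seq_len], self._sin_cached[:seq_len]
```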

The second half of the patch corrects the attention mask broadcasting. Inside the attention forward pass, a 2-D padding mask is first unsqueezed to 4-D and then broadcast across the query dimension:

```python
# ----------------------------------------------------------------------
# 2. Patch Attention Mask Broadcasting
# ----------------------------------------------------------------------
# Case: (batch, key_len) -> expand to (batch, 1, 1, key_len)
if attention_mask.dim() == 2:
    mask = attention_mask[:, None, None, :]

# Broadcast to query_len
mask = mask.expand(batch, 1, query_length, key_length)
```
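These two statements live inside the patched attention forward. Pulled out into a standalone helper (a hypothetical name, not part of the original patch), the same logic can be unit-tested in isolation:

```python
def expand_attention_mask(
    attention_mask: torch.Tensor,
    query_length: int,
) -> torch.Tensor:
    """Hypothetical helper: expand a padding mask to (batch, 1, query_len, key_len)."""
    mask = attention_mask
    # Case: (batch, key_len) -> (batch, 1, 1, key_len)
    if mask.dim() == 2:
        mask = mask[:, None, None, :]
    batch, _, _, key_length = mask.shape
    # Broadcast across the query dimension; expand returns a view, no copy.
    return mask.expand(batch, 1, query_length, key_length)
```

For a (2, 7) padding mask and query_length=5 this returns a (2, 1, 5, 7) view.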

A quick smoke test for the rotary cache fix:

```python
# Test rotary cache fix
rotary = PatchedRotaryEmbedding(dim=64, max_seq_len=512)
x = torch.randn(1, 10, 64)

cos1, sin1 = rotary(x, seq_len=10)
cos2, sin2 = rotary(x, seq_len=20)  # seq_len changes → recalculate cache

assert cos1.shape[0] == 10
assert cos2.shape[0] == 20
print("Rotary cache patch: OK")
```
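A matching check for the mask-broadcast half is not in the original snippet; assuming the hypothetical expand_attention_mask helper sketched above, it could look like:

```python
# Test attention mask broadcasting fix (suggested, not part of the original patch)
attention_mask = torch.ones(2, 7)  # (batch, key_len)
mask = expand_attention_mask(attention_mask, query_length=5)

assert mask.shape == (2, 1, 5, 7)
print("Attention mask patch: OK")
```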
