Update app.py
app.py
CHANGED
@@ -687,6 +687,7 @@ def generate_modeling_phoenix_code():
     return '''"""
 PHOENIX Retention Model v1.4.3
 ✅ v1.4.3 CRITICAL FIX: forward() signature made Transformers-compatible
+✅ v1.4.3 HOTFIX: dtype mismatch fixed (bfloat16 support)
 ✅ PhoenixPreTrainedModel base class included
 ✅ All Retention classes fully implemented
 """

@@ -748,7 +749,8 @@ class MultiScaleRetention(nn.Module):
         b, s, _ = hidden_states.shape
         device, dtype = hidden_states.device, hidden_states.dtype
 
-
+        # ✅ FIX: match dtype as well as device
+        if self.q_proj.weight.device != device or self.q_proj.weight.dtype != dtype:
             self.to(device=device, dtype=dtype)
 
         q = self.q_proj(hidden_states).view(b, s, self.num_heads, self.head_dim).transpose(1, 2)

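The hunk's comment indicates the earlier guard only reconciled the device, which is exactly the situation where a bfloat16 input meets float32 weights and the first projection raises a dtype-mismatch error. The fix is the usual lazy-sync pattern: probe one representative weight and move or cast the whole module only on mismatch, so steady-state forwards never pay for a parameter-walking `self.to(...)`. A minimal self-contained sketch of the same pattern; `LazySyncProj` and its sizes are illustrative, not the PHOENIX module:

```python
import torch
import torch.nn as nn

class LazySyncProj(nn.Module):
    """Illustrative module that syncs itself to the input's device/dtype once."""
    def __init__(self, dim: int = 64):
        super().__init__()
        self.q_proj = nn.Linear(dim, dim, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        device, dtype = hidden_states.device, hidden_states.dtype
        # Cheap probe of one weight; an unconditional self.to() would walk
        # every parameter on every forward call.
        if self.q_proj.weight.device != device or self.q_proj.weight.dtype != dtype:
            self.to(device=device, dtype=dtype)
        return self.q_proj(hidden_states)

m = LazySyncProj()                                # parameters start in float32
x = torch.randn(2, 8, 64, dtype=torch.bfloat16)   # bf16 input, as in the fix
assert m(x).dtype == torch.bfloat16               # module converted on first call
```
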
@@ -801,7 +803,9 @@ class HierarchicalRetention(nn.Module):
     ):
         b, s, h = hidden_states.shape
         device, dtype = hidden_states.device, hidden_states.dtype
-
+
+        # ✅ FIX: match dtype as well as device
+        if next(self.short_proj.parameters()).device != device or next(self.short_proj.parameters()).dtype != dtype:
             self.to(device=device, dtype=dtype)
 
         ret_out = self.base_retention(hidden_states)[0]

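Here the probe is `next(self.short_proj.parameters())` rather than a `.weight` attribute, which works for any submodule, including containers that own no weight tensor directly. A small sketch of that probe, with a stand-in `short_proj` that is not the PHOENIX definition:

```python
import torch.nn as nn

# A container module has no .weight of its own, so take its first parameter
# as a representative of the whole module's device/dtype.
short_proj = nn.Sequential(nn.Linear(64, 32), nn.GELU(), nn.Linear(32, 64))
probe = next(short_proj.parameters())
print(probe.device, probe.dtype)   # e.g. cpu torch.float32
```

This assumes every parameter shares one device and dtype, which holds for a module built in one place and moved only by whole-module `.to(...)` calls. Binding `next(...)` to a local once instead of calling it twice in the condition would be marginally cheaper but behaves the same.
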
@@ -824,10 +828,23 @@ def replace_attention_with_retention_for_loading(model, use_hierarchical=True):
     layers = getattr(layers, 'layers', getattr(layers, 'h', getattr(layers, 'layers', None)))
     if layers is None: return model, 0, 0
 
+    # ✅ FIX: detect the original model's dtype
+    original_dtype = None
+    for param in model.parameters():
+        original_dtype = param.dtype
+        break
+
     cnt = 0
     for i, layer in enumerate(layers):
         if hasattr(layer, 'self_attn'):
-
+            # Create the new Retention module
+            new_retention = HierarchicalRetention(model.config, i) if use_hierarchical else MultiScaleRetention(model.config, i)
+
+            # ✅ FIX: convert to the original dtype
+            if original_dtype is not None:
+                new_retention = new_retention.to(dtype=original_dtype)
+
+            layer.self_attn = new_retention
             cnt += 1
     return model, cnt, len(layers)

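The `for ... break` loop reads the dtype of the first parameter it sees (equivalent to `next(model.parameters(), None)`), and each freshly built Retention module is cast to it before being grafted in. New `nn.Module`s are constructed in float32, so without this cast a model loaded in bfloat16 would end up mixed-precision and fail at the first matmul. A toy end-to-end version of the flow; `ToyLayer` and `ToyModel` are illustrative stand-ins for the real checkpoint:

```python
import torch
import torch.nn as nn

class ToyLayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.self_attn = nn.Linear(64, 64)   # stands in for the attention block

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList(ToyLayer() for _ in range(2))

model = ToyModel().to(dtype=torch.bfloat16)   # mimics a bf16 checkpoint

# Detect the host model's dtype from its first parameter
# (the hunk uses an equivalent for/break loop).
original_dtype = next(model.parameters()).dtype

cnt = 0
for layer in model.layers:
    if hasattr(layer, 'self_attn'):
        new_retention = nn.Linear(64, 64)     # stand-in module, born in float32
        layer.self_attn = new_retention.to(dtype=original_dtype)
        cnt += 1

# Every parameter, including the replacements, is now bfloat16.
assert all(p.dtype == torch.bfloat16 for p in model.parameters())
print(f"replaced {cnt} attention modules")
```
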
@@ -1871,6 +1888,7 @@ with gr.Blocks(
 **Complete Integrated Version with All Fixes**
 
 ✅ **NEW v1.4.3!** forward() signature Transformers compatibility - fully fixed!
+✅ **NEW v1.4.3!** dtype mismatch fixed - full bfloat16 support!
 ✅ Embedding Tying handled safely on save
 ✅ Retention preserved via direct State Dict load
 ✅ Model Structure Pre-Analysis

@@ -2003,6 +2021,7 @@ with gr.Blocks(
 
 ### What's New in v1.4.3 (Complete Integrated Version)
 - ✅ **CRITICAL FIX: forward() Signature** - Transformers compatibility fully fixed
+- ✅ **HOTFIX: dtype mismatch** - full bfloat16 support
 - ✅ **Embedding Tying** - handled automatically on save
 - ✅ **Qwen3-0.6B Generation Fixed** - normal text generation
 - ✅ **Complete Integration** - all fixes included
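Taken together, the three code hunks make the bfloat16 path consistent from load to generation. A hypothetical driver sketch, assuming `replace_attention_with_retention_for_loading` from this app.py is in scope and using the Qwen3-0.6B checkpoint the notes mention; the prompt and generation settings are illustrative:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint; the What's New notes mention Qwen3-0.6B.
name = "Qwen/Qwen3-0.6B"
tok = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name, torch_dtype=torch.bfloat16)

# The patched helper: new Retention modules are cast to the model's bf16.
model, replaced, total = replace_attention_with_retention_for_loading(
    model, use_hierarchical=True
)
print(f"replaced {replaced}/{total} attention layers")

out = model.generate(**tok("Hello", return_tensors="pt"), max_new_tokens=16)
print(tok.decode(out[0], skip_special_tokens=True))
```
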