Update app.py
app.py
CHANGED
@@ -22,13 +22,13 @@ TOKENIZER_PATH = "bpe.model"
 
 if not os.path.exists(MODEL_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/…
+        "https://huggingface.co/OpenLab-NLP/openlem3/resolve/main/encoder_fit.weights.h5?download=true",
         MODEL_PATH
     )
 
 if not os.path.exists(TOKENIZER_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/…
+        "https://huggingface.co/OpenLab-NLP/openlem3/resolve/main/bpe.model?download=true",
        TOKENIZER_PATH
     )
 
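Note: `download_file` itself is defined earlier in app.py and is outside this hunk. For review context, a minimal sketch of what such a helper typically looks like, assuming a requests-style streaming download; the name and signature match the calls above, but the body is an assumption, not the app's actual code:

import requests

def download_file(url, dest_path, chunk_size=1 << 20):
    # Hypothetical sketch, not the app's actual helper: stream the remote
    # file to disk so large weight files are never held fully in memory.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(dest_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)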
@@ -59,82 +59,70 @@ def pad_sentence(tokens):
     return tokens + [pad_id]*(MAX_LEN - len(tokens))
 
 
-class …
-    def __init__(self, d_model, k=7):
+class HyperConv1D(layers.Layer):
+    def __init__(self, d_model, k=7, mem_size=64, hyper_dim=128, dropout=0.0):
         super().__init__()
         assert k % 2 == 1
         self.k = k
-        self.…
-        self.…
-        self.generator = layers.Dense(k, dtype='float32')
-    def call(self, x):
-        x_in = x
-        x = tf.cast(x, tf.float32)
-        …
-        D = tf.shape(x)[2]
-        …
-            strides=[1,1,1,1],
-            rates=[1,1,1,1],
-            padding='VALID'
-        )
-        patches = tf.reshape(patches, [B, L, self.k, D])
-        …
-        out = self.proj(out)
-        …
-        # normalize the input
-        x_norm = self.ln(x)
-        # pass through the stacked DynamicConv layers
-        out = x_norm
-        for block in self.blocks: out = block(out)
-        # conv residual connection
-        x = x_norm + self.ln1(out)
-
-        # FFN / GLU
-        v = out
-        h = self.fc1(v)
-        g, v_split = tf.split(h, 2, axis=-1)
-        h = tf.nn.silu(g) * v_split
-        h = self.fc2(h)
 
-        # FFN residual connection
-        x = x + self.ln2(h)
-        …
+        self.d_model = d_model
+        self.mem_size = mem_size
+
+        # Input projection
+        self.input_proj = layers.Dense(d_model, name="input_proj")
+
+        # Local depthwise conv
+        self.local_conv = layers.DepthwiseConv1D(kernel_size=k, padding='same', activation='silu')
+        self.local_proj = layers.Dense(d_model, name="local_proj")
+
+        # Hypernetwork: global -> scale vector
+        self.hyper = tf.keras.Sequential([
+            layers.Dense(hyper_dim, activation='gelu'),
+            layers.Dense(d_model)
+        ], name="hyper")
+
+        # Associative memory
+        self.mem_keys = self.add_weight((mem_size, d_model), initializer='glorot_uniform', trainable=True)
+        self.mem_vals = self.add_weight((mem_size, d_model), initializer='glorot_uniform', trainable=True)
+        self.mem_proj = layers.Dense(d_model)
+
+        self.norm = layers.LayerNormalization()
+        self.attn_pool = layers.Dense(1)
+
+    def call(self, x):
+        x_in = x
+        x_dtype = x.dtype  # remember the input dtype
+
+        # 1) input projection
+        x_proj = self.input_proj(x)
+        # unify dtype for computing against the memory weights
+        mem_dtype = self.mem_keys.dtype
+        x_proj = tf.cast(x_proj, mem_dtype)
+
+        # 2) local conv
+        out_local = self.local_conv(x_proj)
+        # hypernetwork scaling
+        global_z = self.attn_pool(x_proj)
+        global_z = tf.nn.softmax(global_z, axis=1)
+        global_z = tf.reduce_sum(x_proj * global_z, axis=1)
+
+        scale = tf.expand_dims(tf.nn.sigmoid(self.hyper(global_z)), 1)
+        out_local = out_local * scale
+        out_local = self.local_proj(out_local)
+
+        # 3) associative memory
+        sims = tf.matmul(x_proj, self.mem_keys, transpose_b=True) / tf.math.sqrt(tf.cast(self.d_model, mem_dtype))
+        attn = tf.nn.softmax(sims, axis=-1)
+        mem_read = tf.matmul(attn, self.mem_vals)
+        mem_read = self.mem_proj(mem_read)
+
+        # 4) fuse & residual
+        out = out_local + mem_read
+        out = self.norm(x_proj + out)
+        out = tf.nn.silu(out)
+
+        # cast the final output back to the original input dtype
+        return tf.cast(out, x_dtype)
 
 class L2NormLayer(layers.Layer):
     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
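The new HyperConv1D block is shape-preserving, (batch, length, d_model) in and out, so several of them can be stacked directly (see the SentenceEncoder change below). It fuses three paths: a depthwise conv for local context, per-channel gates produced by a hypernetwork from an attention-pooled global summary, and a content-addressed read from a learned key/value memory. A minimal smoke test, assuming Keras 3 (the positional shape argument in the add_weight calls above requires it) and the class as defined in this hunk:

import tensorflow as tf

layer = HyperConv1D(d_model=64, k=7, mem_size=32, hyper_dim=64)
x = tf.random.normal([2, 16, 64])  # (batch, seq_len, d_model)
y = layer(x)
print(y.shape)                     # expected: (2, 16, 64)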
@@ -144,18 +132,20 @@ class L2NormLayer(layers.Layer):
     def call(self, inputs):
         return tf.math.l2_normalize(inputs, axis=self.axis, epsilon=self.epsilon)
 
-class SentenceEncoder(tf.keras.Model):
+class SentenceEncoder(Model):
     def __init__(self, vocab_size, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM, max_len=MAX_LEN, pad_id=pad_id, dropout_rate=EMBED_DROPOUT):
         super().__init__()
         self.pad_id = pad_id
         self.embed = layers.Embedding(vocab_size, embed_dim)
         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
         self.dropout = layers.Dropout(dropout_rate)
-        self.blocks = […
+        self.blocks = [HyperConv1D(d_model=embed_dim, k=7, mem_size=128, hyper_dim=256) for _ in range(4)]
         self.attn_pool = layers.Dense(1)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
         self.latent = layers.Dense(latent_dim, activation=None)
         self.l2norm = L2NormLayer(axis=1)
+        self.fc1 = layers.Dense(1152)
+        self.fc2 = layers.Dense(embed_dim)
 
     def call(self, x, training=None):
         positions = tf.range(tf.shape(x)[1])[tf.newaxis, :]
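The new class statement inherits from a bare `Model`, so `Model` must already be in scope in app.py (e.g. `from tensorflow.keras import Model`); that import is outside this diff. A sketch of how the encoder is presumably built and its weights loaded, with a hypothetical vocab size (the real one comes from the bpe.model tokenizer loaded elsewhere in app.py):

import tensorflow as tf

encoder = SentenceEncoder(vocab_size=32000)  # vocab size here is illustrative
dummy_ids = tf.zeros([1, MAX_LEN], dtype=tf.int32)
_ = encoder(dummy_ids)                       # one forward pass builds the weights
encoder.load_weights(MODEL_PATH)             # encoder_fit.weights.h5 from the first hunk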
@@ -166,8 +156,13 @@ class SentenceEncoder(tf.keras.Model):
 
         h = x_embed
         for block in self.blocks:
-            h = block(h…
+            h = block(h)
 
+        v = h
+        h = self.fc1(v)
+        g, v_split = tf.split(h, 2, axis=-1)
+        h = tf.nn.silu(g) * v_split
+        h = self.fc2(h)
         h = self.ln_f(h)
 
         # 🔥 force scores to float32
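The feed-forward added here is a SiLU-gated linear unit: fc1's 1152 outputs are split into two 576-channel halves along the last axis, one half gates the other, and fc2 projects the gated result back to embed_dim; fc1's width therefore has to be even, but is otherwise independent of embed_dim. A standalone sketch of the same gating with a stand-in embed_dim of 512 (EMBED_DIM's actual value is defined outside this diff):

import tensorflow as tf
from tensorflow.keras import layers

embed_dim = 512            # stand-in; the app uses EMBED_DIM
fc1 = layers.Dense(1152)   # 2 x 576: gate half and value half
fc2 = layers.Dense(embed_dim)

h = tf.random.normal([2, 16, embed_dim])
g, v = tf.split(fc1(h), 2, axis=-1)  # two 576-channel halves
out = fc2(tf.nn.silu(g) * v)         # gated value, projected back to embed_dim
print(out.shape)                     # (2, 16, 512)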