Update app.py
app.py
CHANGED
@@ -22,13 +22,13 @@ TOKENIZER_PATH = "bpe.model"
 
 if not os.path.exists(MODEL_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/…
+        "https://huggingface.co/OpenLab-NLP/openlem3/resolve/main/encoder_fit.weights.h5?download=true",
         MODEL_PATH
     )
 
 if not os.path.exists(TOKENIZER_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/…
+        "https://huggingface.co/OpenLab-NLP/openlem3/resolve/main/bpe.model?download=true",
        TOKENIZER_PATH
     )
 
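Note: `download_file` itself is defined earlier in app.py and is outside this hunk. For review context, a minimal sketch of what such a helper typically looks like, assuming a requests-style streaming download; the name and signature match the calls above, but the body is an assumption, not the app's actual code:

import requests

def download_file(url, dest_path, chunk_size=1 << 20):
    # Hypothetical sketch, not the app's actual helper: stream the remote
    # file to disk so large weight files are never held fully in memory.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(dest_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)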
@@ -59,82 +59,70 @@ def pad_sentence(tokens):
     return tokens + [pad_id]*(MAX_LEN - len(tokens))
 
 
-class …
-    def __init__(self, d_model, k=7):
+class HyperConv1D(layers.Layer):
+    def __init__(self, d_model, k=7, mem_size=64, hyper_dim=128, dropout=0.0):
         super().__init__()
         assert k % 2 == 1
         self.k = k
-        self.…
-        self.…
-        self.generator = layers.Dense(k, dtype='float32')
-    def call(self, x):
-        x_in = x
-        x = tf.cast(x, tf.float32)
-        …
-        D = tf.shape(x)[2]
-        …
-            strides=[1,1,1,1],
-            rates=[1,1,1,1],
-            padding='VALID'
-        )
-        patches = tf.reshape(patches, [B, L, self.k, D])
-        …
-        out = self.proj(out)
-        …
-        # normalize the input
-        x_norm = self.ln(x)
-        # pass through the stacked DynamicConv layers
-        out = x_norm
-        for block in self.blocks: out = block(out)
-        # conv residual connection
-        x = x_norm + self.ln1(out)
-
-        # FFN / GLU
-        v = out
-        h = self.fc1(v)
-        g, v_split = tf.split(h, 2, axis=-1)
-        h = tf.nn.silu(g) * v_split
-        h = self.fc2(h)
 
-        # FFN residual connection
-        x = x + self.ln2(h)
-        …
+        self.d_model = d_model
+        self.mem_size = mem_size
+
+        # Input projection
+        self.input_proj = layers.Dense(d_model, name="input_proj")
+
+        # Local depthwise conv
+        self.local_conv = layers.DepthwiseConv1D(kernel_size=k, padding='same', activation='silu')
+        self.local_proj = layers.Dense(d_model, name="local_proj")
+
+        # Hypernetwork: global -> scale vector
+        self.hyper = tf.keras.Sequential([
+            layers.Dense(hyper_dim, activation='gelu'),
+            layers.Dense(d_model)
+        ], name="hyper")
+
+        # Associative memory
+        self.mem_keys = self.add_weight((mem_size, d_model), initializer='glorot_uniform', trainable=True)
+        self.mem_vals = self.add_weight((mem_size, d_model), initializer='glorot_uniform', trainable=True)
+        self.mem_proj = layers.Dense(d_model)
+
+        self.norm = layers.LayerNormalization()
+        self.attn_pool = layers.Dense(1)
+
+    def call(self, x):
+        x_in = x
+        x_dtype = x.dtype  # remember the input dtype
+
+        # 1) input projection
+        x_proj = self.input_proj(x)
+        # unify dtype for computing against the memory weights
+        mem_dtype = self.mem_keys.dtype
+        x_proj = tf.cast(x_proj, mem_dtype)
+
+        # 2) local conv
+        out_local = self.local_conv(x_proj)
+        # hypernetwork scaling
+        global_z = self.attn_pool(x_proj)
+        global_z = tf.nn.softmax(global_z, axis=1)
+        global_z = tf.reduce_sum(x_proj * global_z, axis=1)
+
+        scale = tf.expand_dims(tf.nn.sigmoid(self.hyper(global_z)), 1)
+        out_local = out_local * scale
+        out_local = self.local_proj(out_local)
+
+        # 3) associative memory
+        sims = tf.matmul(x_proj, self.mem_keys, transpose_b=True) / tf.math.sqrt(tf.cast(self.d_model, mem_dtype))
+        attn = tf.nn.softmax(sims, axis=-1)
+        mem_read = tf.matmul(attn, self.mem_vals)
+        mem_read = self.mem_proj(mem_read)
+
+        # 4) fuse & residual
+        out = out_local + mem_read
+        out = self.norm(x_proj + out)
+        out = tf.nn.silu(out)
+
+        # cast the final output back to the original input dtype
+        return tf.cast(out, x_dtype)
 
 class L2NormLayer(layers.Layer):
     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
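The new HyperConv1D block is shape-preserving, (batch, length, d_model) in and out, so several of them can be stacked directly (see the SentenceEncoder change below). It fuses three paths: a depthwise conv for local context, per-channel gates produced by a hypernetwork from an attention-pooled global summary, and a content-addressed read from a learned key/value memory. A minimal smoke test, assuming Keras 3 (the positional shape argument in the add_weight calls above requires it) and the class as defined in this hunk:

import tensorflow as tf

layer = HyperConv1D(d_model=64, k=7, mem_size=32, hyper_dim=64)
x = tf.random.normal([2, 16, 64])  # (batch, seq_len, d_model)
y = layer(x)
print(y.shape)                     # expected: (2, 16, 64)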
@@ -144,18 +132,20 @@ class L2NormLayer(layers.Layer):
     def call(self, inputs):
         return tf.math.l2_normalize(inputs, axis=self.axis, epsilon=self.epsilon)
 
-class SentenceEncoder(tf.keras.Model):
+class SentenceEncoder(Model):
     def __init__(self, vocab_size, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM, max_len=MAX_LEN, pad_id=pad_id, dropout_rate=EMBED_DROPOUT):
         super().__init__()
         self.pad_id = pad_id
         self.embed = layers.Embedding(vocab_size, embed_dim)
         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
         self.dropout = layers.Dropout(dropout_rate)
-        self.blocks = […
+        self.blocks = [HyperConv1D(d_model=embed_dim, k=7, mem_size=128, hyper_dim=256) for _ in range(4)]
         self.attn_pool = layers.Dense(1)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
         self.latent = layers.Dense(latent_dim, activation=None)
         self.l2norm = L2NormLayer(axis=1)
+        self.fc1 = layers.Dense(1152)
+        self.fc2 = layers.Dense(embed_dim)
 
     def call(self, x, training=None):
         positions = tf.range(tf.shape(x)[1])[tf.newaxis, :]
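The new class statement inherits from a bare `Model`, so `Model` must already be in scope in app.py (e.g. `from tensorflow.keras import Model`); that import is outside this diff. A sketch of how the encoder is presumably built and its weights loaded, with a hypothetical vocab size (the real one comes from the bpe.model tokenizer loaded elsewhere in app.py):

import tensorflow as tf

encoder = SentenceEncoder(vocab_size=32000)  # vocab size here is illustrative
dummy_ids = tf.zeros([1, MAX_LEN], dtype=tf.int32)
_ = encoder(dummy_ids)                       # one forward pass builds the weights
encoder.load_weights(MODEL_PATH)             # encoder_fit.weights.h5 from the first hunk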
@@ -166,8 +156,13 @@ class SentenceEncoder(tf.keras.Model):
 
         h = x_embed
         for block in self.blocks:
-            h = block(h…
+            h = block(h)
 
+        v = h
+        h = self.fc1(v)
+        g, v_split = tf.split(h, 2, axis=-1)
+        h = tf.nn.silu(g) * v_split
+        h = self.fc2(h)
         h = self.ln_f(h)
 
         # 🔥 force scores to float32
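The feed-forward added here is a SiLU-gated linear unit: fc1's 1152 outputs are split into two 576-channel halves along the last axis, one half gates the other, and fc2 projects the gated result back to embed_dim; fc1's width therefore has to be even, but is otherwise independent of embed_dim. A standalone sketch of the same gating with a stand-in embed_dim of 512 (EMBED_DIM's actual value is defined outside this diff):

import tensorflow as tf
from tensorflow.keras import layers

embed_dim = 512            # stand-in; the app uses EMBED_DIM
fc1 = layers.Dense(1152)   # 2 x 576: gate half and value half
fc2 = layers.Dense(embed_dim)

h = tf.random.normal([2, 16, embed_dim])
g, v = tf.split(fc1(h), 2, axis=-1)  # two 576-channel halves
out = fc2(tf.nn.silu(g) * v)         # gated value, projected back to embed_dim
print(out.shape)                     # (2, 16, 512)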