OpenLab-NLP committed
Commit c34f761 · verified · 1 Parent(s): 4ad0ec1

Update app.py

Files changed (1)
  1. app.py +62 -67
app.py CHANGED
@@ -22,13 +22,13 @@ TOKENIZER_PATH = "bpe.model"

 if not os.path.exists(MODEL_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/openlem2/resolve/main/encoder_fit.weights.h5?download=true",
         MODEL_PATH
     )

 if not os.path.exists(TOKENIZER_PATH):
     download_file(
-        "https://huggingface.co/OpenLab-NLP/openlem2/resolve/main/bpe.model?download=true",
         TOKENIZER_PATH
     )

@@ -59,82 +59,70 @@ def pad_sentence(tokens):
     return tokens + [pad_id]*(MAX_LEN - len(tokens))


-class DynamicConv(layers.Layer):
-    def __init__(self, d_model, k=7):
         super().__init__()
         assert k % 2 == 1
         self.k = k
-        self.dense = layers.Dense(d_model, activation='silu')
-        self.proj = layers.Dense(d_model)
-        self.generator = layers.Dense(k, dtype='float32')
-
-    def call(self, x):
-        x_in = x
-        x = tf.cast(x, tf.float32)
-
-        B = tf.shape(x)[0]
-        L = tf.shape(x)[1]
-        D = tf.shape(x)[2]
-
-        kernels = self.generator(self.dense(x))
-        kernels = tf.nn.softmax(kernels, axis=-1)
-
-        pad = (self.k - 1) // 2
-        x_pad = tf.pad(x, [[0,0],[pad,pad],[0,0]])
-
-        x_pad_4d = tf.expand_dims(x_pad, axis=1)
-        patches = tf.image.extract_patches(
-            images=x_pad_4d,
-            sizes=[1,1,self.k,1],
-            strides=[1,1,1,1],
-            rates=[1,1,1,1],
-            padding='VALID'
-        )
-        patches = tf.reshape(patches, [B, L, self.k, D])
-
-        kernels_exp = tf.expand_dims(kernels, axis=-1)
-        out = tf.reduce_sum(patches * kernels_exp, axis=2)
-        out = self.proj(out)
-
-        # 🔥 Cast back to the original dtype
-        return tf.cast(out, x_in.dtype)
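For reference, the removed DynamicConv predicts a softmax-normalized kernel for every position and uses it to mix that position's k neighbours. A minimal stand-alone sketch of just that mixing step (my own illustration, not code from the commit; k=3 and toy shapes assumed):

import tensorflow as tf

x = tf.random.normal([1, 5, 4])                            # (batch, length, d_model)
w = tf.nn.softmax(tf.random.normal([1, 5, 3]), axis=-1)    # per-position kernels, k=3
x_pad = tf.pad(x, [[0, 0], [1, 1], [0, 0]])                # same padding for k=3
# out[:, t] = sum_j w[:, t, j] * x_pad[:, t + j]  -- what extract_patches + reduce_sum compute
out = tf.stack(
    [tf.reduce_sum(w[:, t, :, None] * x_pad[:, t:t + 3, :], axis=1) for t in range(5)],
    axis=1,
)
print(out.shape)  # (1, 5, 4)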
 
-class EncoderBlock(tf.keras.layers.Layer):
-    def __init__(self, embed_dim=EMBED_DIM, ff_dim=1152, seq_len=MAX_LEN, num_conv_layers=2):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.seq_len = seq_len
-
-        # MLP / FFN
-        self.fc1 = layers.Dense(ff_dim)
-        self.fc2 = layers.Dense(embed_dim)
-        self.blocks = [DynamicConv(d_model=embed_dim, k=7) for _ in range(num_conv_layers)]
-        # LayerNorm
-        self.ln = layers.LayerNormalization(epsilon=1e-5)   # input normalization
-        self.ln1 = layers.LayerNormalization(epsilon=1e-5)  # Conv residual
-        self.ln2 = layers.LayerNormalization(epsilon=1e-5)  # FFN residual
-
-    def call(self, x, mask=None):
-        # Normalize the input
-        x_norm = self.ln(x)
-
-        # Pass through the stacked DynamicConv layers
-        out = x_norm
-        for block in self.blocks:
-            out = block(out)
-        # Conv residual connection
-        x = x_norm + self.ln1(out)
-
-        # FFN / GLU
-        v = out
-        h = self.fc1(v)
-        g, v_split = tf.split(h, 2, axis=-1)
-        h = tf.nn.silu(g) * v_split
-        h = self.fc2(h)
-
-        # FFN residual connection
-        x = x + self.ln2(h)
-
-        return x

 class L2NormLayer(layers.Layer):
     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
@@ -144,18 +132,20 @@ class L2NormLayer(layers.Layer):
     def call(self, inputs):
         return tf.math.l2_normalize(inputs, axis=self.axis, epsilon=self.epsilon)

-class SentenceEncoder(tf.keras.Model):
     def __init__(self, vocab_size, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM, max_len=MAX_LEN, pad_id=pad_id, dropout_rate=EMBED_DROPOUT):
         super().__init__()
         self.pad_id = pad_id
         self.embed = layers.Embedding(vocab_size, embed_dim)
         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
         self.dropout = layers.Dropout(dropout_rate)
-        self.blocks = [EncoderBlock() for _ in range(2)]
         self.attn_pool = layers.Dense(1)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
         self.latent = layers.Dense(latent_dim, activation=None)
         self.l2norm = L2NormLayer(axis=1)

     def call(self, x, training=None):
         positions = tf.range(tf.shape(x)[1])[tf.newaxis, :]
@@ -166,8 +156,13 @@ class SentenceEncoder(tf.keras.Model):

         h = x_embed
         for block in self.blocks:
-            h = block(h, training=training)

         h = self.ln_f(h)

         # 🔥 Force the scores to float32

 if not os.path.exists(MODEL_PATH):
     download_file(
+        "https://huggingface.co/OpenLab-NLP/openlem3/resolve/main/encoder_fit.weights.h5?download=true",
         MODEL_PATH
     )

 if not os.path.exists(TOKENIZER_PATH):
     download_file(
+        "https://huggingface.co/OpenLab-NLP/openlem3/resolve/main/bpe.model?download=true",
         TOKENIZER_PATH
     )
 
 
     return tokens + [pad_id]*(MAX_LEN - len(tokens))


+class HyperConv1D(layers.Layer):
+    def __init__(self, d_model, k=7, mem_size=64, hyper_dim=128, dropout=0.0):
         super().__init__()
         assert k % 2 == 1
         self.k = k
+        self.d_model = d_model
+        self.mem_size = mem_size
+
+        # Input projection
+        self.input_proj = layers.Dense(d_model, name="input_proj")
+
+        # Local depthwise conv
+        self.local_conv = layers.DepthwiseConv1D(kernel_size=k, padding='same', activation='silu')
+        self.local_proj = layers.Dense(d_model, name="local_proj")
+
+        # Hypernetwork: global -> scale vector
+        self.hyper = tf.keras.Sequential([
+            layers.Dense(hyper_dim, activation='gelu'),
+            layers.Dense(d_model)
+        ], name="hyper")
+
+        # Associative memory
+        self.mem_keys = self.add_weight(shape=(mem_size, d_model), initializer='glorot_uniform', trainable=True)
+        self.mem_vals = self.add_weight(shape=(mem_size, d_model), initializer='glorot_uniform', trainable=True)
+        self.mem_proj = layers.Dense(d_model)
+
+        self.norm = layers.LayerNormalization()
+        self.attn_pool = layers.Dense(1)
+
+    def call(self, x):
+        x_in = x
+        x_dtype = x.dtype  # remember the input dtype
+
+        # 1) input projection
+        x_proj = self.input_proj(x)
+        # cast to the memory dtype so the matmuls below agree
+        mem_dtype = self.mem_keys.dtype
+        x_proj = tf.cast(x_proj, mem_dtype)
+
+        # 2) local conv
+        out_local = self.local_conv(x_proj)
+        # hypernetwork scaling
+        global_z = self.attn_pool(x_proj)
+        global_z = tf.nn.softmax(global_z, axis=1)
+        global_z = tf.reduce_sum(x_proj * global_z, axis=1)
+
+        scale = tf.expand_dims(tf.nn.sigmoid(self.hyper(global_z)), 1)
+        out_local = out_local * scale
+        out_local = self.local_proj(out_local)
+
+        # 3) associative memory
+        sims = tf.matmul(x_proj, self.mem_keys, transpose_b=True) / tf.math.sqrt(tf.cast(self.d_model, mem_dtype))
+        attn = tf.nn.softmax(sims, axis=-1)
+        mem_read = tf.matmul(attn, self.mem_vals)
+        mem_read = self.mem_proj(mem_read)
+
+        # 4) fuse & residual
+        out = out_local + mem_read
+        out = self.norm(x_proj + out)
+        out = tf.nn.silu(out)
+
+        # cast the final output back to the original input dtype
+        return tf.cast(out, x_dtype)
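As a quick sanity check of the new block (my own sketch, not part of the commit, assuming the class above and the app's Keras imports are in scope): HyperConv1D keeps the (batch, length, d_model) shape, combining the hypernetwork-scaled depthwise-conv branch with a softmax read over the learned memory slots.

import tensorflow as tf

block = HyperConv1D(d_model=32, k=7, mem_size=16, hyper_dim=64)   # class defined above
dummy = tf.random.normal([2, 10, 32])                             # (batch, seq_len, d_model)
out = block(dummy)
print(out.shape)                                                  # (2, 10, 32)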
 class L2NormLayer(layers.Layer):
     def __init__(self, axis=1, epsilon=1e-10, **kwargs):
 
     def call(self, inputs):
         return tf.math.l2_normalize(inputs, axis=self.axis, epsilon=self.epsilon)

+class SentenceEncoder(Model):
     def __init__(self, vocab_size, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM, max_len=MAX_LEN, pad_id=pad_id, dropout_rate=EMBED_DROPOUT):
         super().__init__()
         self.pad_id = pad_id
         self.embed = layers.Embedding(vocab_size, embed_dim)
         self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
         self.dropout = layers.Dropout(dropout_rate)
+        self.blocks = [HyperConv1D(d_model=embed_dim, k=7, mem_size=128, hyper_dim=256) for _ in range(4)]
         self.attn_pool = layers.Dense(1)
         self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
         self.latent = layers.Dense(latent_dim, activation=None)
         self.l2norm = L2NormLayer(axis=1)
+        self.fc1 = layers.Dense(1152)
+        self.fc2 = layers.Dense(embed_dim)

     def call(self, x, training=None):
         positions = tf.range(tf.shape(x)[1])[tf.newaxis, :]
 

         h = x_embed
         for block in self.blocks:
+            h = block(h)
+
+        v = h
+        h = self.fc1(v)
+        g, v_split = tf.split(h, 2, axis=-1)
+        h = tf.nn.silu(g) * v_split
+        h = self.fc2(h)
         h = self.ln_f(h)

         # 🔥 Force the scores to float32
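For readers unfamiliar with the gated-FFN step the new call method adds, a small stand-alone sketch (my own illustration, toy shapes assumed): fc1 widens the features to 1152 units, the result is split into a gate half and a value half, and fc2 projects the gated value back to embed_dim.

import tensorflow as tf

h = tf.random.normal([2, 10, 1152])   # output of self.fc1
g, v = tf.split(h, 2, axis=-1)        # two (2, 10, 576) halves
gated = tf.nn.silu(g) * v             # SwiGLU-style gating
print(gated.shape)                    # (2, 10, 576); self.fc2 then maps back to embed_dim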