OpenLab-NLP committed on
Commit
1a0a3d9
·
verified ·
1 Parent(s): b1e2662

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +164 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import tensorflow as tf
3
+ from tensorflow.keras import layers
4
+ import sentencepiece as spm
5
+ import gradio as gr
6
+ import requests
7
+ import os
8
+
9
+ # ----------------------
10
+ # ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ ์œ ํ‹ธ
11
+ # ----------------------
12
def download_file(url, save_path, timeout=60):
    """Stream ``url`` to ``save_path`` on disk.

    The body is first written to a temporary sibling file and only renamed
    into place after the whole transfer succeeded, so an interrupted download
    never leaves a truncated file that a later ``os.path.exists`` check would
    mistake for a complete one.

    Args:
        url: HTTP(S) address to fetch.
        save_path: Destination path of the downloaded file.
        timeout: Seconds ``requests`` waits for the connection and for each
            read before giving up. Without a timeout the call could block
            forever on a stalled server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the file cannot be written or renamed.
    """
    tmp_path = f"{save_path}.part"
    try:
        # The context manager releases the connection even when the transfer
        # fails half-way through.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as out_file:
                for chunk in response.iter_content(chunk_size=16384):
                    out_file.write(chunk)
        os.replace(tmp_path, save_path)
    finally:
        # After a successful rename the temporary file no longer exists, so
        # this only fires on failure and removes the partial download.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    # Log text restored from its mis-encoded form ("<path> saved").
    print(f"✅ {save_path} 저장됨")
19
+
20
# Local file names of the pretrained artifacts.
MODEL_PATH = "encoder.weights.h5"
TOKENIZER_PATH = "bpe.model"

# Fetch every artifact that is not already present on disk
# (model weights first, then the tokenizer model).
for _local_path, _remote_url in (
    (
        MODEL_PATH,
        "https://huggingface.co/OpenLab-NLP/openlem1/resolve/main/encoder.weights.h5?download=true",
    ),
    (
        TOKENIZER_PATH,
        "https://huggingface.co/OpenLab-NLP/openlem1/resolve/main/bpe.model?download=true",
    ),
):
    if os.path.exists(_local_path):
        continue
    download_file(_remote_url, _local_path)

# Sequence length and layer widths shared by the tokenizer helpers and the model.
MAX_LEN, EMBED_DIM, LATENT_DIM = 128, 384, 384
# NOTE(review): DROPOUT_RATE is not referenced by any code visible in this file.
DROPOUT_RATE = 0.01
39
+
40
+ # ===============================
41
+ # 1️⃣ 토크나이저 로딩 (load the tokenizer)
42
+ # ===============================
43
# Load the SentencePiece model that was downloaded above.
sp = spm.SentencePieceProcessor(TOKENIZER_PATH)

# Id used for padding: the id of the "<pad>" piece, or 0 when the lookup yields -1.
# NOTE(review): SentencePiece may return the unknown-token id rather than -1
# for a piece that is not in the vocabulary — confirm the fallback is reachable.
_pad_lookup = sp.piece_to_id("<pad>")
pad_id = 0 if _pad_lookup == -1 else _pad_lookup

# Number of pieces in the tokenizer vocabulary; sizes the embedding table.
vocab_size = sp.get_piece_size()
46
+
47
def encode_sentence(sentence, max_len=MAX_LEN):
    """Tokenize ``sentence`` into integer ids, keeping at most ``max_len`` of them.

    Args:
        sentence: Text to tokenize with the module-level SentencePiece model.
        max_len: Maximum number of leading token ids to keep.

    Returns:
        List of token ids, truncated to ``max_len`` entries.
    """
    token_ids = sp.encode(sentence, out_type=int)
    return token_ids[:max_len]
49
+
50
def pad_sentence(tokens):
    """Right-pad ``tokens`` with ``pad_id`` until it holds ``MAX_LEN`` entries.

    A list that already has ``MAX_LEN`` or more entries gets no padding; it is
    not truncated here. A new list is returned in every case.

    Args:
        tokens: List of token ids.

    Returns:
        New list consisting of ``tokens`` followed by the padding ids.
    """
    missing = MAX_LEN - len(tokens)
    # A non-positive count yields an empty padding list.
    padding = [pad_id] * missing
    return tokens + padding
52
+
53
class EncoderBlock(layers.Layer):
    """Self-attention followed by a SiLU-gated feed-forward unit.

    The input is layer-normalized first. Each of the two sub-steps then
    normalizes its own output and adds it to that sub-step's input.

    Args:
        embed_dim: Width of the final projection; also sets the per-head key
            size (``embed_dim // 8``).
        latent_dim: Accepted for signature compatibility; not used here.

    NOTE(review): attribute names and creation order are kept as-is because
    saved weight files may be keyed by them — confirm before renaming.
    """

    def __init__(self, embed_dim=EMBED_DIM, latent_dim=LATENT_DIM):
        # The base initializer must run before any sub-layer is assigned.
        super().__init__()
        head_count = 8
        self.mha = layers.MultiHeadAttention(
            num_heads=head_count,
            key_dim=embed_dim // head_count,
        )
        # Wide projection; `call` splits its output into two equal halves.
        self.WB = layers.Dense(1152)
        self.W = layers.Dense(embed_dim)
        self.ln1 = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
        self.ln2 = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
        self.ln3 = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)

    def call(self, x):
        """Apply the attention and gated feed-forward sub-steps to ``x``."""
        normed = self.ln1(x)
        attended = self.mha(normed, normed, normed)
        # The residual uses the normalized input, not the raw one.
        residual = self.ln2(attended) + normed

        gate, value = tf.split(self.WB(residual), 2, axis=-1)
        projected = self.W(tf.nn.silu(gate) * value)
        return self.ln3(projected) + residual
72
+
73
class L2NormLayer(layers.Layer):
    """Layer that L2-normalizes its input along one axis.

    Args:
        axis: Axis along which to normalize.
        epsilon: Lower bound passed to ``tf.math.l2_normalize`` to avoid
            dividing by zero.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, axis=1, epsilon=1e-10, **kwargs):
        super().__init__(**kwargs)
        self.epsilon = epsilon
        self.axis = axis

    def call(self, inputs):
        """Return ``inputs`` L2-normalized along ``self.axis``."""
        normalized = tf.math.l2_normalize(
            inputs,
            axis=self.axis,
            epsilon=self.epsilon,
        )
        return normalized

    def get_config(self):
        """Return the layer config, including ``axis`` and ``epsilon``."""
        config = {"axis": self.axis, "epsilon": self.epsilon}
        # Base-class entries are merged last, as in the original expression.
        config.update(super().get_config())
        return config
84
+
85
class SentenceEncoder(tf.keras.Model):
    """Map a batch of token-id sequences to L2-normalized sentence vectors.

    Pipeline: token + position embeddings, two ``EncoderBlock`` layers, a
    final layer normalization, attention pooling that ignores padded
    positions, a linear projection to ``latent_dim`` and L2 normalization.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Width of the token and position embeddings.
        latent_dim: Width of the returned sentence vector.
        max_len: Number of rows in the position embedding table.
        pad_id: Token id treated as padding and excluded from pooling.

    NOTE(review): attribute names and creation order are kept as-is because
    saved weight files may be keyed by them — confirm before renaming.
    """

    def __init__(self, vocab_size, embed_dim=384, latent_dim=384, max_len=128, pad_id=3):
        super().__init__()
        self.pad_id = pad_id
        self.embed = layers.Embedding(vocab_size, embed_dim)
        self.pos_embed = layers.Embedding(input_dim=max_len, output_dim=embed_dim)
        encoder_stack = [
            EncoderBlock(embed_dim=embed_dim, latent_dim=latent_dim)
            for _ in range(2)
        ]
        self.blocks = encoder_stack
        # One scalar score per position, used as the pooling weight logits.
        self.attn_pool = layers.Dense(1)
        self.ln_f = layers.LayerNormalization(epsilon=1e-5, dtype=tf.float32)
        # Linear projection (no activation).
        self.latent = layers.Dense(latent_dim, activation=None)
        self.l2norm = L2NormLayer()

    def call(self, x):
        """Encode token ids ``x`` (sequence positions on axis 1)."""
        seq_len = tf.shape(x)[1]
        position_ids = tf.range(seq_len)[tf.newaxis, :]
        hidden = self.embed(x) + self.pos_embed(position_ids)
        # 1.0 for real tokens, 0.0 for padding.
        is_token = tf.cast(tf.not_equal(x, self.pad_id), tf.float32)

        for block in self.blocks:
            hidden = block(hidden)
        hidden = self.ln_f(hidden)

        logits = self.attn_pool(hidden)
        # Padded positions get a very negative score so softmax ignores them.
        is_padding = tf.equal(is_token[..., tf.newaxis], 0)
        logits = tf.where(is_padding, -1e9, logits)
        weights = tf.nn.softmax(logits, axis=1)
        pooled = tf.reduce_sum(hidden * weights, axis=1)

        return self.l2norm(self.latent(pooled))
114
# ===============================
# 3. Load the model
# ===============================
# Build the encoder with the same padding id that pad_sentence() appends, so
# the pooling mask inside the model hides exactly the padded positions.
# Previously the model silently fell back to its default pad_id=3.
encoder = SentenceEncoder(vocab_size=vocab_size, pad_id=pad_id)
# Run one dummy batch so every layer creates its variables before loading.
encoder(np.zeros((1, MAX_LEN), dtype=np.int32))
encoder.load_weights(MODEL_PATH)
119
+
120
+ # ===============================
121
+ # 4️⃣ 벡터화 함수 (sentence vectorization function)
122
+ # ===============================
123
def get_sentence_vector(sentence):
    """Encode ``sentence`` into a unit-length embedding vector.

    The sentence is tokenized, padded to the fixed model length, passed
    through the module-level ``encoder`` as a batch of one, and the resulting
    vector is divided by its Euclidean norm.

    Args:
        sentence: Text to embed.

    Returns:
        1-D ``numpy`` array. If the encoder output has zero norm it is
        returned unchanged (all zeros) instead of a NaN vector from 0/0.
    """
    tokens = pad_sentence(encode_sentence(sentence))
    # Explicit int32 matches the dtype of the dummy batch used to build the
    # model and avoids the platform-dependent default integer width.
    batch = np.asarray([tokens], dtype=np.int32)
    vec = encoder(batch).numpy()[0]

    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm
127
+
128
+ # ===============================
129
+ # 5️⃣ 가장 비슷한 문장 찾기 (find the most similar sentence)
130
+ # ===============================
131
def find_most_similar(query, s1, s2, s3):
    """Pick the candidate sentence whose embedding is closest to ``query``.

    Args:
        query: Sentence to search with.
        s1: First candidate sentence.
        s2: Second candidate sentence.
        s3: Third candidate sentence.

    Returns:
        Dict with two entries: the best-matching candidate under the key
        "가장 비슷한 문장" ("most similar sentence") and its similarity score
        as a float under "유사도" ("similarity"). The keys were mis-encoded
        before and are restored to the intended Korean text.
    """
    candidates = [s1, s2, s3]
    candidate_vectors = np.stack(
        [get_sentence_vector(text) for text in candidates]
    ).astype(np.float32)
    query_vector = get_sentence_vector(query)

    # The vectors are normalized by get_sentence_vector, so the dot product
    # is the cosine similarity.
    sims = candidate_vectors @ query_vector
    # Plain int so the list index does not rely on a NumPy scalar.
    top_idx = int(np.argmax(sims))

    return {
        "가장 비슷한 문장": candidates[top_idx],
        "유사도": float(sims[top_idx]),
    }
143
+
144
+ # ===============================
145
+ # 6️⃣ Gradio UI
146
+ # ===============================
147
# Gradio front end: one query box, three candidate boxes, a JSON result panel
# and a button wired to find_most_similar. All user-facing strings were
# mis-encoded and are restored to the intended Korean text.
# NOTE(review): the source's indentation was lost, so the nesting below
# (result panel and button directly under Blocks) is reconstructed — confirm.
with gr.Blocks() as demo:
    # Title: "Sentence similarity search (1 query + 3 candidates)".
    gr.Markdown("## 🔍 문장 유사도 검색기 (쿼리 1개 + 후보 3개)")
    with gr.Row():
        # "Sentence to search (Query)" / "type here".
        query_input = gr.Textbox(label="검색할 문장 (Query)", placeholder="여기에 입력")
    with gr.Row():
        # "Search candidate 1..3".
        s1_input = gr.Textbox(label="검색 후보 1")
        s2_input = gr.Textbox(label="검색 후보 2")
        s3_input = gr.Textbox(label="검색 후보 3")
    # "Result".
    output = gr.JSON(label="결과")

    # "Find the most similar sentence".
    search_btn = gr.Button("가장 비슷한 문장 찾기")
    search_btn.click(
        fn=find_most_similar,
        inputs=[query_input, s1_input, s2_input, s3_input],
        outputs=output,
    )

# Start the web server only when run as a script, so importing this module
# (e.g. for reuse of the encoder helpers) does not launch the UI.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ numpy
2
+ tensorflow
3
+ sentencepiece
4
+ gradio
5
+ requests