Update technical report with comprehensive SVC vs Logistic Regression comparison
- Add detailed comparative per-class performance analysis for UTS2017_Bank dataset
- Update system card with SVC performance metrics (72.47% vs 70.96% accuracy)
- Include side-by-side comparison table showing F1-score improvements
- Add new inference.py script for local model testing
- Export optimized SVC model (uts2017_bank_classifier_20250928_060819.joblib)
- Fix ruff linting issues in inference.py (remove unused imports)
- Update LaTeX technical report with comprehensive analysis and conclusions
Key improvements documented:
- LOAN category: +0.50 F1-score improvement with SVC
- DISCOUNT category: +0.22 F1-score improvement with SVC
- INTEREST_RATE category: +0.18 F1-score improvement with SVC
- Overall weighted F1-score: +0.03 improvement with SVC
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
@@ -0,0 +1,299 @@
#!/usr/bin/env python3
"""
Inference script for Sonar Core 1 - Vietnamese Text Classification.
Loads trained models from local files and performs predictions.
"""

import argparse
import joblib
import os
import glob


def find_local_models():
    """Find all available local model files"""
    models = {
        'exported': {},
        'runs': {}
    }

    # Find exported models in project root
    for filename in os.listdir('.'):
        if filename.endswith('.joblib'):
            if filename.startswith('vntc_classifier_'):
                models['exported']['vntc'] = filename
            elif filename.startswith('uts2017_bank_classifier_'):
                models['exported']['uts2017_bank'] = filename

    # Find models in runs directory
    vntc_runs = glob.glob('runs/*/models/VNTC_*.joblib')
    bank_runs = glob.glob('runs/*/models/UTS2017_Bank_*.joblib')

    if vntc_runs:
        models['runs']['vntc'] = sorted(vntc_runs)[-1]  # Most recent
    if bank_runs:
        models['runs']['uts2017_bank'] = sorted(bank_runs)[-1]  # Most recent

    return models


def load_model(model_path):
    """Load a model from file path"""
    try:
        print(f"Loading model from: {model_path}")
        model = joblib.load(model_path)
        print(f"Model loaded successfully. Classes: {len(model.classes_)}")
        return model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None


def predict_text(model, text):
    """Make prediction on a single text"""
    try:
        probabilities = model.predict_proba([text])[0]

        # Get top 3 predictions sorted by probability
        top_indices = probabilities.argsort()[-3:][::-1]
        top_predictions = []
        for idx in top_indices:
            category = model.classes_[idx]
            prob = probabilities[idx]
            top_predictions.append((category, prob))

        # The prediction should be the top category
        prediction = top_predictions[0][0]
        confidence = top_predictions[0][1]

        return prediction, confidence, top_predictions
    except Exception as e:
        print(f"Error making prediction: {e}")
        return None, 0, []


def interactive_mode(model, dataset_name):
    """Interactive prediction mode"""
    print(f"\n{'='*60}")
    print(f"INTERACTIVE MODE - {dataset_name.upper()} CLASSIFICATION")
    print(f"{'='*60}")
    print("Enter Vietnamese text to classify (type 'quit' to exit):")

    while True:
        try:
            user_input = input("\nText: ").strip()

            if user_input.lower() in ['quit', 'exit', 'q']:
                break

            if not user_input:
                continue

            prediction, confidence, top_predictions = predict_text(model, user_input)

            if prediction:
                print(f"Predicted category: {prediction}")
                print(f"Confidence: {confidence:.3f}")
                print("Top 3 predictions:")
                for i, (category, prob) in enumerate(top_predictions, 1):
                    print(f"  {i}. {category}: {prob:.3f}")

        except KeyboardInterrupt:
            print("\nExiting...")
            break
        except Exception as e:
            print(f"Error: {e}")


def test_examples(model, dataset_name):
    """Test model with predefined examples"""
    if dataset_name == 'vntc':
        examples = [
            "Đội tuyển bóng đá Việt Nam giành chiến thắng 2-0",
            "Chính phủ thông qua nghị định mới về chính sách xã hội",
            "Các nhà khoa học phát hiện loại vi khuẩn mới",
            "Thị trường chứng khoán biến động mạnh",
            "Tiêm vaccine COVID-19 đạt tỷ lệ cao",
            "Công nghệ trí tuệ nhân tạo phát triển mạnh"
        ]
    else:  # uts2017_bank
        examples = [
            "Tôi muốn mở tài khoản tiết kiệm mới",
            "Lãi suất vay mua nhà hiện tại là bao nhiêu?",
            "Làm thế nào để đăng ký internet banking?",
            "Chi phí chuyển tiền ra nước ngoài",
            "Ngân hàng ACB có uy tín không?",
            "Tôi cần hỗ trợ về dịch vụ ngân hàng"
        ]

    print(f"\n{'='*60}")
    print(f"TESTING {dataset_name.upper()} MODEL WITH EXAMPLES")
    print(f"{'='*60}")

    for text in examples:
        prediction, confidence, top_predictions = predict_text(model, text)

        if prediction:
            print(f"\nText: {text}")
            print(f"Prediction: {prediction}")
            print(f"Confidence: {confidence:.3f}")

            # Show top 3 if confidence is low
            if confidence < 0.7:
                print("Alternative predictions:")
                for i, (category, prob) in enumerate(top_predictions[:3], 1):
                    print(f"  {i}. {category}: {prob:.3f}")
        print("-" * 60)


def list_available_models():
    """List all available models"""
    models = find_local_models()

    print("Available Models:")
    print("=" * 50)

    if models['exported']:
        print("\nExported Models (Project Root):")
        for dataset, filename in models['exported'].items():
            file_size = os.path.getsize(filename) / (1024 * 1024)  # MB
            print(f"  {dataset}: {filename} ({file_size:.1f}MB)")

    if models['runs']:
        print("\nRuns Models (Training Directory):")
        for dataset, filepath in models['runs'].items():
            file_size = os.path.getsize(filepath) / (1024 * 1024)  # MB
            print(f"  {dataset}: {filepath} ({file_size:.1f}MB)")

    if not models['exported'] and not models['runs']:
        print("No local models found!")
        print("Train a model first using: python train.py --export-model")
        print("Or download from HuggingFace using: python use_this_model.py")


def main():
    """Main function"""
    parser = argparse.ArgumentParser(
        description="Inference with local Sonar Core 1 models"
    )
    parser.add_argument(
        "--model-path",
        type=str,
        help="Path to specific model file"
    )
    parser.add_argument(
        "--dataset",
        type=str,
        choices=["vntc", "uts2017_bank"],
        help="Dataset type (auto-detects if not specified)"
    )
    parser.add_argument(
        "--text",
        type=str,
        help="Text to classify (if not provided, enters interactive mode)"
    )
    parser.add_argument(
        "--test-examples",
        action="store_true",
        help="Test with predefined examples"
    )
    parser.add_argument(
        "--list-models",
        action="store_true",
        help="List all available local models"
    )
    parser.add_argument(
        "--source",
        type=str,
        choices=["exported", "runs"],
        default="exported",
        help="Model source: exported files or runs directory (default: exported)"
    )

    args = parser.parse_args()

    # List models and exit
    if args.list_models:
        list_available_models()
        return

    # Find available models
    models = find_local_models()

    # Determine model path
    model_path = None
    dataset_name = args.dataset

    if args.model_path:
        # Use specified model path
        model_path = args.model_path
        # Try to infer dataset from filename
        if not dataset_name:
            if 'vntc' in args.model_path.lower():
                dataset_name = 'vntc'
            elif 'uts2017' in args.model_path.lower() or 'bank' in args.model_path.lower():
                dataset_name = 'uts2017_bank'
    else:
        # Auto-select model
        if args.dataset:
            # Use specified dataset
            if args.dataset in models[args.source]:
                model_path = models[args.source][args.dataset]
                dataset_name = args.dataset
            else:
                print(f"No {args.dataset} model found in {args.source} models")
                list_available_models()
                return
        else:
            # Use first available model
            if models[args.source]:
                dataset_name = list(models[args.source].keys())[0]
                model_path = models[args.source][dataset_name]
                print(f"Auto-selected {dataset_name} model")
            else:
                print("No models found!")
                list_available_models()
                return

    if not model_path or not os.path.exists(model_path):
        print(f"Model file not found: {model_path}")
        list_available_models()
        return

    # Load model
    model = load_model(model_path)
    if not model:
        return

    # Process based on arguments
    if args.text:
        # Single prediction
        prediction, confidence, top_predictions = predict_text(model, args.text)
        if prediction:
            print(f"\nText: {args.text}")
            print(f"Prediction: {prediction}")
            print(f"Confidence: {confidence:.3f}")
            print("Top 3 predictions:")
            for i, (category, prob) in enumerate(top_predictions, 1):
                print(f"  {i}. {category}: {prob:.3f}")

    elif args.test_examples:
        # Test with examples
        test_examples(model, dataset_name)

    else:
        # Interactive mode
        print(f"Loaded {dataset_name} model: {os.path.basename(model_path)}")
        test_examples(model, dataset_name)

        # Ask if user wants interactive mode
        try:
            response = input("\nEnter interactive mode? (y/n): ").strip().lower()
            if response in ['y', 'yes']:
                interactive_mode(model, dataset_name)
        except KeyboardInterrupt:
            print("\nExiting...")


if __name__ == "__main__":
    main()
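Beyond the CLI entry points above, the exported pipeline can also be used programmatically. The snippet below is a minimal sketch, assuming the exported .joblib artifact is a fitted scikit-learn pipeline that accepts raw strings and exposes predict_proba (the same assumptions inference.py makes); the file name is taken from the commit message.

import joblib

# Exported artifact named in the commit message; adjust the path as needed.
MODEL_PATH = "uts2017_bank_classifier_20250928_060819.joblib"

# Assumed to be a fitted scikit-learn Pipeline (TF-IDF vectorizer + classifier).
pipeline = joblib.load(MODEL_PATH)

texts = [
    "Tôi muốn mở tài khoản tiết kiệm mới",
    "Chi phí chuyển tiền ra nước ngoài",
]

# predict_proba is assumed to be available (LogisticRegression has it natively;
# an SVC needs probability=True or calibration for this to work).
for text, row in zip(texts, pipeline.predict_proba(texts)):
    best = row.argmax()
    print(f"{text} -> {pipeline.classes_[best]} ({row[best]:.3f})")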
@@ -8,7 +8,7 @@
 
 ## Model Overview
 
-**Sonar Core 1** is a Vietnamese text classification model built on traditional machine learning techniques (TF-IDF + Logistic Regression) optimized for production deployment. The model achieves **92.
+**Sonar Core 1** is a Vietnamese text classification model built on traditional machine learning techniques (TF-IDF + SVC/Logistic Regression) optimized for production deployment. The model achieves **92.80% accuracy** on Vietnamese news classification with SVC and **72.47% accuracy** on banking text classification with SVC, offering a computationally efficient alternative to deep learning approaches.
 
 ### Quick Facts
 - **Model Type**: Text Classification (Multi-class)

@@ -34,7 +34,7 @@
 - Logistic Regression classifier with 1,000 max iterations
 - **Hash-based caching system** for efficient processing
 
-Released on **2025-09-21**, the model achieves **92.
+Released on **2025-09-21**, the model achieves **92.80% test accuracy** with SVC and **95.39% training accuracy** with optimized training time using the hash-based caching system. The model features a dedicated VNTCDataset class for efficient data handling and improved modular architecture.
 
 ## Training Data
 

@@ -50,8 +50,8 @@ Account, Card, Customer Support, Discount, Interest Rate, Internet Banking, Loan
 
 | Dataset | Categories | Training Samples | Test Samples | Best Accuracy |
 |---------|------------|------------------|--------------|---------------|
-| VNTC (News) | 10 | 33,759 | 50,373 | 92.
-| UTS2017_Bank | 14 | 1,581 | 396 |
+| VNTC (News) | 10 | 33,759 | 50,373 | 92.80% (SVC) |
+| UTS2017_Bank | 14 | 1,581 | 396 | 72.47% (SVC) |
 
 ## Performance Metrics
 

@@ -59,8 +59,8 @@ Account, Card, Customer Support, Discount, Interest Rate, Internet Banking, Loan
 
 | Dataset | Test Accuracy | Training Time | Best Categories (F1-Score) |
 |---------|---------------|---------------|------------------------------|
-| **VNTC (News)** | **92.
-| **UTS2017_Bank** | **
+| **VNTC (News)** | **92.80% (SVC)** | ~54 minutes (SVC) | Sports (98%), Health (94%) |
+| **UTS2017_Bank** | **72.47% (SVC)** | ~5.3 seconds (SVC) | Trademark (88%), Customer Support (76%) |
 
 ### Key Performance Highlights
 
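For reference, the configuration the system card describes (a 20,000-feature TF-IDF space with 1-2 n-grams, Logistic Regression with 1,000 max iterations, and the SVC variant) maps onto a standard scikit-learn pipeline. The sketch below is illustrative only, not the repository's actual training code; the SVC kernel and any unstated hyperparameters are placeholders.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

def build_pipeline(classifier: str = "svc") -> Pipeline:
    """TF-IDF (20k features, 1-2 n-grams) followed by the chosen classifier."""
    if classifier == "svc":
        # probability=True so the trained pipeline exposes predict_proba,
        # as inference.py expects; the kernel choice here is a placeholder.
        clf = SVC(kernel="linear", probability=True)
    else:
        clf = LogisticRegression(max_iter=1000)
    return Pipeline([
        ("tfidf", TfidfVectorizer(max_features=20000, ngram_range=(1, 2))),
        ("clf", clf),
    ])

# Placeholder training call; train_texts/train_labels come from the dataset loader.
# model = build_pipeline("svc").fit(train_texts, train_labels)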
@@ -23,7 +23,7 @@
 \maketitle
 
 \begin{abstract}
-This paper presents Sonar Core 1, a Vietnamese text classification system employing Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction combined with multiple classification algorithms. The system is evaluated on two Vietnamese datasets: the VNTC dataset containing 10 news categories achieves 92.80\% accuracy with Support Vector Classification (SVC) and 92.33\% with logistic regression, while the UTS2017\_Bank dataset spanning 14 banking service categories achieves 70.96\% accuracy with logistic regression. The implementation utilizes a 20,000-dimensional TF-IDF feature space with n-gram analysis and incorporates hash-based caching for computational optimization. These results establish baseline performance metrics for Vietnamese text classification and demonstrate the efficacy of traditional machine learning approaches for Vietnamese natural language processing tasks. The system architecture prioritizes computational efficiency and model interpretability for production deployment scenarios.
+This paper presents Sonar Core 1, a Vietnamese text classification system employing Term Frequency-Inverse Document Frequency (TF-IDF) feature extraction combined with multiple classification algorithms. The system is evaluated on two Vietnamese datasets: the VNTC dataset containing 10 news categories achieves 92.80\% accuracy with Support Vector Classification (SVC) and 92.33\% with logistic regression, while the UTS2017\_Bank dataset spanning 14 banking service categories achieves 72.47\% accuracy with Support Vector Classification (SVC) and 70.96\% accuracy with logistic regression. The implementation utilizes a 20,000-dimensional TF-IDF feature space with n-gram analysis and incorporates hash-based caching for computational optimization. These results establish baseline performance metrics for Vietnamese text classification and demonstrate the efficacy of traditional machine learning approaches for Vietnamese natural language processing tasks. The system architecture prioritizes computational efficiency and model interpretability for production deployment scenarios.
 \end{abstract}
 
 \section{Introduction}

@@ -185,6 +185,7 @@ VNTC (27 topics) & Toan et al. (2017) - SVC & 99.65\% \\
 VNTC (27 topics) & Toan et al. (2017) - Random Forest & 99.25\% \\
 VNTC (27 topics) & Toan et al. (2017) - SVM & 97.80\% \\
 \hline
+UTS2017\_Bank (14 topics) & \textbf{Sonar Core 1 - SVC with TF-IDF} & \textbf{72.47\%} \\
 UTS2017\_Bank (14 topics) & \textbf{Sonar Core 1 - Logistic Regression with TF-IDF} & \textbf{70.96\%} \\
 \hline
 \end{tabular}

@@ -215,16 +216,18 @@ The system demonstrates robust performance on the VNTC news classification dataset
 \textbf{UTS2017\_Bank Dataset (Banking Classification):}
 The system exhibits moderate performance on the banking service classification task:
 \begin{itemize}
-\item \textbf{Test Classification Accuracy}:
-\item \textbf{
-\item \textbf{
+\item \textbf{Test Classification Accuracy (SVC)}: 72.47\%
+\item \textbf{Test Classification Accuracy (Logistic Regression)}: 70.96\%
+\item \textbf{Training Latency (SVC)}: 5.3 seconds
+\item \textbf{Training Latency (Logistic Regression)}: 0.78 seconds
+\item \textbf{Inference Latency}: 0.1 seconds for 396 test samples (approximately 0.25 ms per sample)
 \item \textbf{Macro Average F1-Score}: 0.17
 \item \textbf{Weighted Average F1-Score}: 0.63
 \end{itemize}
 
 \subsubsection{Detailed Per-Class Performance}
 
-\textbf{VNTC Dataset Per-Class Results:}
+\textbf{VNTC Dataset Per-Class Results (Logistic Regression - 92.33\% accuracy):}
 
 \begin{longtable}{lcccc}
 \toprule

@@ -243,29 +246,94 @@ vi\_tinh & 0.94 & 0.95 & 0.94 & 4,560 \\
 \bottomrule
 \end{longtable}
 
-\
+\subsubsection{UTS2017\_Bank Dataset: Comparative Model Performance}
+
+This section presents a detailed comparison of per-class performance between Support Vector Classification (SVC) and Logistic Regression models on the UTS2017\_Bank dataset, highlighting the superior performance of SVC across multiple categories.
 
-\
+\textbf{Model Performance Summary:}
+\begin{itemize}
+\item \textbf{SVC Model}: 72.47\% overall accuracy with 20k features and 1-2 n-gram range
+\item \textbf{Logistic Regression Model}: 70.96\% overall accuracy with 20k features and 1-2 n-gram range
+\item \textbf{Performance Gain}: SVC achieves a 1.51 percentage point improvement over Logistic Regression
+\end{itemize}
+
+\textbf{UTS2017\_Bank Dataset Comparative Per-Class Results:}
+
+The following table presents a side-by-side comparison of per-class performance metrics for both the SVC (72.47\% accuracy) and Logistic Regression (70.96\% accuracy) models:
+
+\begin{longtable}{lcccccccc}
 \toprule
-Category &
+\multirow{2}{*}{Category} & \multicolumn{3}{c}{SVC Model} & \multicolumn{3}{c}{Logistic Regression} & \multirow{2}{*}{Support} & \multirow{2}{*}{Improvement} \\
+\cmidrule(lr){2-4} \cmidrule(lr){5-7}
+& Prec. & Recall & F1 & Prec. & Recall & F1 & & (F1 $\Delta$) \\
+\midrule
+ACCOUNT & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 1 & +0.00 \\
+CARD & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 13 & +0.00 \\
+CUSTOMER\_SUPPORT & 0.64 & 0.97 & 0.77 & 0.62 & 0.97 & 0.76 & 155 & +0.01 \\
+DISCOUNT & 1.00 & 0.12 & 0.22 & 0.00 & 0.00 & 0.00 & 8 & +0.22 \\
+INTEREST\_RATE & 0.43 & 0.25 & 0.32 & 0.50 & 0.08 & 0.14 & 12 & +0.18 \\
+INTERNET\_BANKING & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 14 & +0.00 \\
+LOAN & 0.90 & 0.60 & 0.72 & 0.67 & 0.13 & 0.22 & 15 & +0.50 \\
+MONEY\_TRANSFER & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 7 & +0.00 \\
+OTHER & 0.00 & 0.00 & 0.00 & 0.50 & 0.07 & 0.12 & 14 & -0.12 \\
+PAYMENT & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 3 & +0.00 \\
+PROMOTION & 0.40 & 0.18 & 0.25 & 1.00 & 0.18 & 0.31 & 11 & -0.06 \\
+SAVING & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 2 & +0.00 \\
+SECURITY & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 0.00 & 1 & +0.00 \\
+TRADEMARK & 0.90 & 0.87 & 0.89 & 0.87 & 0.89 & 0.88 & 140 & +0.01 \\
 \midrule
-ACCOUNT & 0.00 & 0.00 & 0.00 & 1 \\
-CARD & 0.00 & 0.00 & 0.00 & 13 \\
-CUSTOMER\_SUPPORT & 0.62 & 0.97 & 0.76 & 155 \\
-DISCOUNT & 0.00 & 0.00 & 0.00 & 8 \\
-INTEREST\_RATE & 0.50 & 0.08 & 0.14 & 12 \\
-INTERNET\_BANKING & 0.00 & 0.00 & 0.00 & 14 \\
-LOAN & 0.67 & 0.13 & 0.22 & 15 \\
-MONEY\_TRANSFER & 0.00 & 0.00 & 0.00 & 7 \\
-OTHER & 0.50 & 0.07 & 0.12 & 14 \\
-PAYMENT & 0.00 & 0.00 & 0.00 & 3 \\
-PROMOTION & 1.00 & 0.18 & 0.31 & 11 \\
-SAVING & 0.00 & 0.00 & 0.00 & 2 \\
-SECURITY & 0.00 & 0.00 & 0.00 & 1 \\
-TRADEMARK & 0.87 & 0.89 & 0.88 & 140 \\
+\textbf{Weighted Avg} & \textbf{0.65} & \textbf{0.72} & \textbf{0.66} & \textbf{0.64} & \textbf{0.71} & \textbf{0.63} & \textbf{396} & \textbf{+0.03} \\
 \bottomrule
 \end{longtable}
 
+\textbf{Key Performance Differences:}
+
+\begin{itemize}
+\item \textbf{Significant SVC Improvements}:
+\begin{itemize}
+\item \textbf{LOAN}: +0.50 F1-score improvement (0.72 vs 0.22) - SVC achieves 90\% precision and 60\% recall
+\item \textbf{DISCOUNT}: +0.22 F1-score improvement (0.22 vs 0.00) - SVC successfully identifies discount-related queries
+\item \textbf{INTEREST\_RATE}: +0.18 F1-score improvement (0.32 vs 0.14) - better balance of precision and recall
+\end{itemize}
+
+\item \textbf{Consistent Performance}: Both models achieve similarly strong performance on the CUSTOMER\_SUPPORT (dominant class) and TRADEMARK categories
+
+\item \textbf{Minor SVC Disadvantages}:
+\begin{itemize}
+\item \textbf{PROMOTION}: -0.06 F1-score (0.25 vs 0.31) - Logistic Regression achieves perfect precision at the same recall
+\item \textbf{OTHER}: -0.12 F1-score (0.00 vs 0.12) - both models struggle with this ambiguous category
+\end{itemize}
+
+\item \textbf{Zero-Performance Categories}: Both models fail completely on minority classes (ACCOUNT, CARD, INTERNET\_BANKING, MONEY\_TRANSFER, PAYMENT, SAVING, SECURITY) due to insufficient training data
+\end{itemize}
+
+\textbf{Key Differences Between SVC and Logistic Regression Models:}
+
+The comparative analysis reveals several important insights regarding model behavior and classification capabilities:
+
+\begin{enumerate}
+\item \textbf{Category Prediction Diversity}: In inference testing, SVC correctly identifies a diverse range of banking categories (INTEREST\_RATE, MONEY\_TRANSFER, TRADEMARK), while Logistic Regression exhibits a bias toward the CUSTOMER\_SUPPORT category, frequently misclassifying domain-specific queries.
+
+\item \textbf{Confidence Score Distribution}: SVC produces more balanced confidence scores across categories, with higher confidence for domain-specific predictions (e.g., 50.9\% for interest rate queries), whereas Logistic Regression shows lower overall confidence and less discriminative capability.
+
+\item \textbf{Computational Trade-offs}:
+\begin{itemize}
+\item \textbf{SVC}: Higher training time (5.3 seconds) but superior classification accuracy
+\item \textbf{Logistic Regression}: Faster training (0.78 seconds) but reduced classification performance
+\end{itemize}
+
+\item \textbf{Category-Specific Performance Patterns}:
+\begin{itemize}
+\item \textbf{TRADEMARK}: Both models achieve strong performance (F1-scores of 0.88-0.89), indicating clear linguistic markers
+\item \textbf{CUSTOMER\_SUPPORT}: Both models identify this category effectively, though Logistic Regression over-predicts it
+\item \textbf{Minority Classes}: Both models struggle with categories having limited training data (ACCOUNT, PAYMENT, SECURITY)
+\end{itemize}
+\end{enumerate}
+
+\textbf{Practical Implications for Deployment:}
+
+The comparative analysis suggests that SVC offers the better balance between computational efficiency and classification accuracy for Vietnamese banking text classification. While Logistic Regression trains faster, the 1.51 percentage point accuracy improvement and the broader category coverage of SVC justify the additional computational overhead in production deployment scenarios requiring high classification accuracy.
+
 \subsubsection{Performance Analysis and Insights}
 
 \subsubsection{VNTC Dataset Analysis}
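The side-by-side per-class table above can be regenerated from two fitted models with scikit-learn's classification_report. The following is a minimal sketch, assuming svc_model and logreg_model are the fitted pipelines and test_texts/test_labels are the 396-sample UTS2017_Bank test split; all names are placeholders.

from sklearn.metrics import classification_report

def per_class_metrics(model, texts, labels):
    """Per-class precision/recall/F1/support for one model, as a dict."""
    report = classification_report(
        labels, model.predict(texts), output_dict=True, zero_division=0
    )
    skip = {"accuracy", "macro avg", "weighted avg"}
    return {cls: m for cls, m in report.items() if cls not in skip}

def compare_models(svc_model, logreg_model, texts, labels):
    """Print per-class F1 for both models and the SVC-minus-LogReg delta."""
    svc = per_class_metrics(svc_model, texts, labels)
    lr = per_class_metrics(logreg_model, texts, labels)
    for cls in sorted(set(svc) | set(lr)):
        f1_svc = svc.get(cls, {}).get("f1-score", 0.0)
        f1_lr = lr.get(cls, {}).get("f1-score", 0.0)
        print(f"{cls:20s} SVC F1={f1_svc:.2f}  LogReg F1={f1_lr:.2f}  delta={f1_svc - f1_lr:+.2f}")

# compare_models(svc_model, logreg_model, test_texts, test_labels)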
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:eaaffac2cb04faf77023502c3fc144ff2503a0ff9211c574c1b07424a0ad6e08
size 1674180
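The three lines above are the Git LFS pointer for the exported classifier named in the commit message (uts2017_bank_classifier_20250928_060819.joblib, roughly 1.6 MB). Below is a minimal sketch of fetching and loading it with huggingface_hub; the repo_id is a placeholder since the repository name is not shown on this page.

import joblib
from huggingface_hub import hf_hub_download

# repo_id is a placeholder; substitute the model repository this commit belongs to.
local_path = hf_hub_download(
    repo_id="<namespace>/<model-repo>",
    filename="uts2017_bank_classifier_20250928_060819.joblib",
)
model = joblib.load(local_path)
print(model.classes_)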